Bottom Page

Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
 TypeError: '(slice(None, None, None), 0)' is an invalid key
#1
I am doing feature selection using python 2.7, the dataset was loaded fine but when i run the code it gives me this error
#!/usr/bin/env python

'''
An example file to show how to use the feature-selection code in ml_lib
'''

import os
import shutil

import json

from tempfile import mkdtemp

from tqdm import tqdm

from scipy.stats import randint as sp_randint

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_curve
from sklearn.externals import joblib
from sklearn.externals.joblib import Memory
from sklearn.metrics import matthews_corrcoef

import gplearn
#import gplearn.genetic
import gplearn.fitness

import numpy as np

from joblib import Parallel, delayed
from sklearn.externals.joblib import Parallel
from joblib import load, dump

from identity_transformer import IdentityTransformer

import depmeas



import pandas
from tqdm import tqdm
import pandas as pd
import csv
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import numpy as np
from multiprocessing import Process
import seaborn as sns
import matplotlib.pyplot as plt

import feature_select
import depmeas

if __name__ == '__main__':
    def generic_combined_scorer(x1, o1, ii_1, x2, o2, ii_2, y, h):
        s1 = h(x1, y)
        s2 = h(x2, y)
        o1[ii_1] = s1
        o2[ii_2] = s2

    NUM_CV = 3
    RANDOM_SEED = 123
    MAX_ITER = 1000

    # leuk = fetch_mldata('iris', transpose_data=True)

    X = pd.read_csv(r'C:\Users\pc\Desktop\dataset\leukemia.csv')
    y = pd.read_csv(r'C:\Users\pc\Desktop\dataset\leukemia.csv')


    # perform feature selection
    num_features_to_select = 25
    K_MAX = 1000
    estimator = depmeas.mi_tau
    n_jobs = -1
    verbose = True
    # print ( X.head(20))

    num_dim = X.shape[1]
    #print (num_dim)
    if (num_features_to_select is not None):
        num_selected_features = min(num_dim, num_features_to_select)
    else:
        num_selected_features = num_dim
    K_MAX_internal = min(num_dim, K_MAX)

    initial_scores = Parallel(n_jobs=n_jobs)(delayed(estimator)(X.iloc[:, 1], y.iloc[:, 1]) for ii in range(num_dim))
    # rank the scores in descending order
    sorted_scores_idxs = np.flipud(np.argsort(initial_scores))

    # subset the data down so that joblib doesn't have to
    # transport large matrices to its workers
    X_subset = X.iloc[:, sorted_scores_idxs[0:K_MAX_internal]]
    # memory map this for parallelization speed
    tmp_folder = mkdtemp()
    # TODO: why is X_subset crashing when we increase K_MAX_in?  Investigate in detail, but
    # for now, do not use memory mapping for X_subset for stability
    # X_subset_fname = os.path.join(tmp_folder, 'X_subset')
    # dump(X_subset, X_subset_fname)
    # X_subset = load(X_subset_fname, mmap_mode='r')

    selected_feature_idxs = np.zeros(num_selected_features, dtype=int)
    remaining_candidate_idxs = range(1, K_MAX_internal)

    # mi_matrix = np.empty((K_MAX_internal,num_selected_features-1))
    # mi_matrix[:] = np.nan

    relevance_vec_fname = os.path.join(tmp_folder, 'relevance_vec')
    feature_redundance_vec_fname = os.path.join(tmp_folder, 'feature_redundance_vec')
    mi_matrix_fname = os.path.join(tmp_folder, 'mi_matrix')
    relevance_vec = np.memmap(relevance_vec_fname, dtype=float,
                              shape=(K_MAX_internal,), mode='w+')
    feature_redundance_vec = np.memmap(feature_redundance_vec_fname, dtype=float,
                                       shape=(K_MAX_internal,), mode='w+')
    mi_matrix = np.memmap(mi_matrix_fname, dtype=float,
                          shape=(K_MAX_internal, num_selected_features - 1), mode='w+')
    mi_matrix[:] = np.nan

    # TODO: investigate whether its worth it to parallelize the nested for-loop?
    with tqdm(total=num_selected_features, desc='Selecting Features ...', disable=(not verbose)) as pbar:
        pbar.update(1)
        for k in range(1, num_selected_features):
            ncand = len(remaining_candidate_idxs)
            last_selected_feature = k - 1

            Parallel(n_jobs=n_jobs)(delayed(generic_combined_scorer)(y, relevance_vec, ii,
                                                                     X_subset[:, selected_feature_idxs[last_selected_feature]],
                                                                     feature_redundance_vec, ii, X_subset.iloc[:, ii],
                                                                     estimator)for ii in remaining_candidate_idxs)

            # copy the redundance into the mi_matrix, which accumulates our redundance as we compute
            mi_matrix[remaining_candidate_idxs, last_selected_feature] = feature_redundance_vec[remaining_candidate_idxs]
            redundance_vec = np.nanmean(mi_matrix[remaining_candidate_idxs, :], axis=1)

            tmp_idx = np.argmax(relevance_vec[remaining_candidate_idxs] - redundance_vec)
            selected_feature_idxs[k] = remaining_candidate_idxs[tmp_idx]
            del remaining_candidate_idxs[tmp_idx]
            pbar.update(1)
    # map the selected features back to the original dimensions
    selected_feature_idxs = sorted_scores_idxs[selected_feature_idxs]
    print('Leukemia Dataset Feature Selection\n Total # Features=%d' % (X.shape[1]))
    print('# Selected Features')
    print('selected_feature_idxs')

the Error is :
Error:
C:\Python2\python.exe C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py Selecting Features ...: 4%|▍ | 1/25 [00:00<00:00, 500.04it/s] Traceback (most recent call last): File "C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py", line 143, in <module> estimator)for ii in remaining_candidate_idxs) File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__ if self.dispatch_one_batch(iterator): File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 754, in dispatch_one_batch self._pickle_cache) File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 210, in __init__ self.items = list(iterator_slice) File "C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py", line 143, in <genexpr> estimator)for ii in remaining_candidate_idxs) File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\pandas\core\frame.py", line 2927, in __getitem__ indexer = self.columns.get_loc(key) File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc return self._engine.get_loc(key) File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc File "pandas\_libs\index.pyx", line 110, in pandas._libs.index.IndexEngine.get_loc TypeError: '(slice(None, None, None), 0)' is an invalid key Process finished with exit code 1
Quote
#2
I would suggest to put some print functions inside generic_combined_scorer., e.g.

def generic_combined_scorer(x1, o1, ii_1, x2, o2, ii_2, y, h):
        print(o1.columns, o2.columns etc.)  # print all columns names of df's etc
        s1 = h(x1, y)
        s2 = h(x2, y)
        o1[ii_1] = s1
        o2[ii_2] = s2
Don't pass arguments with indicies, e.g. X_subset.iloc[:, ii] could be replaced with X_subset, ii
is already passed, so you can do X_subset.iloc[:, ii] inside generic_combined_scorer. This should simplify debugging.
Quote

Top Page

Possibly Related Threads...
Thread Author Replies Views Last Post
  slice per group Progressive 3 384 Jul-20-2019, 06:52 AM
Last Post: scidam
  Melt or Slice Grin 0 470 Jun-24-2018, 06:02 PM
Last Post: Grin

Forum Jump:


Users browsing this thread: 1 Guest(s)