Jul-17-2019, 12:03 PM
I am doing feature selection using Python 2.7. The dataset loads fine, but when I run the code below I get an error.
#!/usr/bin/env python
'''
An example file to show how to use the feature-selection code in ml_lib
'''
import os
import shutil
import json
from tempfile import mkdtemp
from tqdm import tqdm
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_curve
from sklearn.externals import joblib
from sklearn.externals.joblib import Memory
from sklearn.metrics import matthews_corrcoef
import gplearn
#import gplearn.genetic
import gplearn.fitness
import numpy as np
from joblib import Parallel, delayed
from sklearn.externals.joblib import Parallel
from joblib import load, dump
from identity_transformer import IdentityTransformer
import depmeas
import pandas
from tqdm import tqdm
import pandas as pd
import csv
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from multiprocessing import Process
import seaborn as sns
import matplotlib.pyplot as plt
import feature_select
import depmeas

if __name__ == '__main__':
    def generic_combined_scorer(x1, o1, ii_1, x2, o2, ii_2, y, h):
        s1 = h(x1, y)
        s2 = h(x2, y)
        o1[ii_1] = s1
        o2[ii_2] = s2

    NUM_CV = 3
    RANDOM_SEED = 123
    MAX_ITER = 1000

    # leuk = fetch_mldata('iris', transpose_data=True)
    X = pd.read_csv(r'C:\Users\pc\Desktop\dataset\leukemia.csv')
    y = pd.read_csv(r'C:\Users\pc\Desktop\dataset\leukemia.csv')

    # perform feature selection
    num_features_to_select = 25
    K_MAX = 1000
    estimator = depmeas.mi_tau
    n_jobs = -1
    verbose = True

    # print(X.head(20))
    num_dim = X.shape[1]
    # print(num_dim)
    if (num_features_to_select is not None):
        num_selected_features = min(num_dim, num_features_to_select)
    else:
        num_selected_features = num_dim
    K_MAX_internal = min(num_dim, K_MAX)

    initial_scores = Parallel(n_jobs=n_jobs)(delayed(estimator)(X.iloc[:, 1], y.iloc[:, 1]) for ii in range(num_dim))
    # rank the scores in descending order
    sorted_scores_idxs = np.flipud(np.argsort(initial_scores))
    # subset the data down so that joblib doesn't have to
    # transport large matrices to its workers
    X_subset = X.iloc[:, sorted_scores_idxs[0:K_MAX_internal]]

    # memory map this for parallelization speed
    tmp_folder = mkdtemp()
    # TODO: why is X_subset crashing when we increase K_MAX_in? Investigate in detail, but
    # for now, do not use memory mapping for X_subset for stability
    # X_subset_fname = os.path.join(tmp_folder, 'X_subset')
    # dump(X_subset, X_subset_fname)
    # X_subset = load(X_subset_fname, mmap_mode='r')

    selected_feature_idxs = np.zeros(num_selected_features, dtype=int)
    remaining_candidate_idxs = range(1, K_MAX_internal)

    # mi_matrix = np.empty((K_MAX_internal,num_selected_features-1))
    # mi_matrix[:] = np.nan
    relevance_vec_fname = os.path.join(tmp_folder, 'relevance_vec')
    feature_redundance_vec_fname = os.path.join(tmp_folder, 'feature_redundance_vec')
    mi_matrix_fname = os.path.join(tmp_folder, 'mi_matrix')
    relevance_vec = np.memmap(relevance_vec_fname, dtype=float,
                              shape=(K_MAX_internal,), mode='w+')
    feature_redundance_vec = np.memmap(feature_redundance_vec_fname, dtype=float,
                                       shape=(K_MAX_internal,), mode='w+')
    mi_matrix = np.memmap(mi_matrix_fname, dtype=float,
                          shape=(K_MAX_internal, num_selected_features - 1), mode='w+')
    mi_matrix[:] = np.nan

    # TODO: investigate whether its worth it to parallelize the nested for-loop?
    with tqdm(total=num_selected_features, desc='Selecting Features ...', disable=(not verbose)) as pbar:
        pbar.update(1)
        for k in range(1, num_selected_features):
            ncand = len(remaining_candidate_idxs)
            last_selected_feature = k - 1
            Parallel(n_jobs=n_jobs)(delayed(generic_combined_scorer)(y, relevance_vec, ii,
                                                                     X_subset[:, selected_feature_idxs[last_selected_feature]],
                                                                     feature_redundance_vec, ii,
                                                                     X_subset.iloc[:, ii], estimator)
                                    for ii in remaining_candidate_idxs)
            # copy the redundance into the mi_matrix, which accumulates our redundance as we compute
            mi_matrix[remaining_candidate_idxs, last_selected_feature] = feature_redundance_vec[remaining_candidate_idxs]
            redundance_vec = np.nanmean(mi_matrix[remaining_candidate_idxs, :], axis=1)

            tmp_idx = np.argmax(relevance_vec[remaining_candidate_idxs] - redundance_vec)
            selected_feature_idxs[k] = remaining_candidate_idxs[tmp_idx]
            del remaining_candidate_idxs[tmp_idx]
            pbar.update(1)

    # map the selected features back to the original dimensions
    selected_feature_idxs = sorted_scores_idxs[selected_feature_idxs]

    print('Leukemia Dataset Feature Selection\n Total # Features=%d' % (X.shape[1]))
    print('# Selected Features')
    print('selected_feature_idxs')

The error is:
C:\Python2\python.exe C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py
Selecting Features ...: 4%|▍ | 1/25 [00:00<00:00, 500.04it/s]
Traceback (most recent call last):
File "C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py", line 143, in <module>
estimator)for ii in remaining_candidate_idxs)
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 754, in dispatch_one_batch
self._pickle_cache)
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 210, in __init__
self.items = list(iterator_slice)
File "C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py", line 143, in <genexpr>
estimator)for ii in remaining_candidate_idxs)
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\pandas\core\frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 110, in pandas._libs.index.IndexEngine.get_loc
TypeError: '(slice(None, None, None), 0)' is an invalid key
Process finished with exit code 1
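
If it helps, here is a minimal standalone sketch (using a small made-up DataFrame instead of my leukemia.csv) of the kind of indexing that triggers this TypeError on a pandas DataFrame, compared with the .iloc form that works:

import numpy as np
import pandas as pd

# small made-up frame standing in for X_subset (column names are just placeholders)
df = pd.DataFrame(np.random.rand(5, 3), columns=['f0', 'f1', 'f2'])

print(df.iloc[:, 0])   # positional (row, column) access works through .iloc
print(df[:, 0])        # plain [...] does not accept a (slice, int) key on a DataFrame
                       # and raises the "invalid key" TypeError shown in my traceback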