Sep-26-2017, 08:29 AM
# Load Libraries import numpy as np import pandas as pd from nltk.corpus import stopwords from nltk.stem import PorterStemmer from sklearn.ensemble import GradientBoostingClassifier from sklearn.naive_bayes import GaussianNB from sklearn.preprocessing import LabelEncoder import re from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.model_selection import cross_val_score from sklearn.metrics import accuracy_score, make_scorer # load data #train = pd.read_csv("train.csv") train=pd.DataFrame.from_csv("train.csv") #test = pd.read_csv("test.csv") test=pd.DataFrame.from_csv("train.csv") train.head() # function to clean data stops = set(stopwords.words("english")) def cleanData(text, lowercase = False, remove_stops = False, stemming = False): txt = str(text) txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt) txt = re.sub(r'\n',r' ',txt) if lowercase: txt = " ".join([w.lower() for w in txt.split()]) if remove_stops: txt = " ".join([w for w in txt.split() if w not in stops]) if stemming: st = PorterStemmer() txt = " ".join([st.stem(w) for w in txt.split()]) return txt ## join data test['Is_Response'] = np.nan alldata = pd.concat([train, test]).reset_index(drop=True) # clean description alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True)) # initialise the functions - we'll create separate models for each type. countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500) tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500) # create features bagofwords = countvec.fit_transform(alldata['Description']) tfidfdata = tfidfvec.fit_transform(alldata['Description']) # label encode categorical features in data given cols = ['Browser_Used','Device_Used'] for x in cols: lbl = LabelEncoder() alldata[x] = lbl.fit_transform(alldata[x]) # create dataframe for features bow_df = pd.DataFrame(bagofwords.todense()) tfidf_df = pd.DataFrame(tfidfdata.todense()) # create dataframe for features bow_df = pd.DataFrame(bagofwords.todense()) tfidf_df = pd.DataFrame(tfidfdata.todense()) # create separate data frame for bag of words and tf-idf bow_df_train = bow_df[:len(train)] bow_df_test = bow_df[len(train):] tfid_df_train = tfidf_df[:len(train)] tfid_df_test = tfidf_df[len(train):] # split the merged data file into train and test respectively train_feats = alldata[~pd.isnull(alldata.Is_Response)] test_feats = alldata[pd.isnull(alldata.Is_Response)] ## set target variable #train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']] train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']] # merge count (bag of word) features into train train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1) test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1) #test_feats1 = pd.DataFrame.combine([train_feats[cols], bow_df_train]) #test_feats1=pd.DataFrame.combine([test_feats[cols], bow_df_test]) print('test done') test_feats1.reset_index(drop=True, inplace=True) # merge into a new data frame with tf-idf features train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1) test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1) #Naive bayes Algorithm # let's check cross validation score of the model # cv score acts a unbiased estimate of models accuracy on unseen data mod1 = GaussianNB() target = train_feats['Is_Response'] ## Naive Bayes 1 print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score))) ## Naive Bayes 2 - tfidf is giving higher CV score print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score))) # make our first set of predictions clf1 = GaussianNB() clf1.fit(train_feats1, target) clf2 = GaussianNB() clf2.fit(train_feats2, target) preds1 = clf1.predict(test_feats1) preds2 = clf2.predict(test_feats2) def to_labels(x): if x == 1: return "happy" return "not_happy" sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1}) sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x)) sub2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2}) sub2['Is_Response'] = sub2['Is_Response'].map(lambda x: to_labels(x)) sub1 = sub1[['User_ID', 'Is_Response']] sub2 = sub2[['User_ID', 'Is_Response']] ## write submission files sub1.to_csv('submissions/sub1_cv.csv', index=False) sub2.to_csv('submissions/sub2_tf.csv', index=False)Warning (from warnings module):
File "C:\Python 3.6\program\happyness\happy.py", line 74
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/sta...ersus-copy
Traceback (most recent call last):
File "C:\Python 3.6\program\happyness\happy.py", line 77, in <module>
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)
File "C:\Python 3.6\lib\site-packages\pandas\core\reshape\concat.py", line 207, in concat
return op.get_result()
File "C:\Python 3.6\lib\site-packages\pandas\core\reshape\concat.py", line 407, in get_result
copy=self.copy)
File "C:\Python 3.6\lib\site-packages\pandas\core\internals.py", line 4832, in concatenate_block_managers
placement=placement) for placement, join_units in concat_plan]
File "C:\Python 3.6\lib\site-packages\pandas\core\internals.py", line 4832, in <listcomp>
placement=placement) for placement, join_units in concat_plan]
File "C:\Python 3.6\lib\site-packages\pandas\core\internals.py", line 4945, in concatenate_join_units
concat_values = concat_values.copy()
MemoryError