Python Forum
Thread Rating:
  • 1 Vote(s) - 5 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Memory Error
#1
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer
# load data
#train = pd.read_csv("train.csv")
train=pd.DataFrame.from_csv("train.csv")
#test = pd.read_csv("test.csv")
test=pd.DataFrame.from_csv("train.csv")

train.head()
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)
# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))
# initialise the functions - we'll create separate models for each type.
countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)
# create features
bagofwords = countvec.fit_transform(alldata['Description'])
tfidfdata = tfidfvec.fit_transform(alldata['Description'])
# label encode categorical features in data given
cols = ['Browser_Used','Device_Used']

for x in cols:
    lbl = LabelEncoder()
    alldata[x] = lbl.fit_transform(alldata[x])
# create dataframe for features
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidfdata.todense())
# create dataframe for features
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidfdata.todense())
# create separate data frame for bag of words and tf-idf

bow_df_train = bow_df[:len(train)]
bow_df_test = bow_df[len(train):]

tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]
# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]
## set target variable
#train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]
# merge count (bag of word) features into train
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)
#test_feats1 = pd.DataFrame.combine([train_feats[cols], bow_df_train])
#test_feats1=pd.DataFrame.combine([test_feats[cols], bow_df_test])
print('test done')
test_feats1.reset_index(drop=True, inplace=True)
# merge into a new data frame with tf-idf features
train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

#Naive bayes Algorithm
# let's check cross validation score of the model
# cv score acts a unbiased estimate of models accuracy on unseen data
mod1 = GaussianNB()
target = train_feats['Is_Response']
## Naive Bayes 1
print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))
## Naive Bayes 2 - tfidf is giving higher CV score
print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))
# make our first set of predictions
clf1 = GaussianNB()
clf1.fit(train_feats1, target)

clf2 = GaussianNB()
clf2.fit(train_feats2, target)
preds1 = clf1.predict(test_feats1)
preds2 = clf2.predict(test_feats2)
def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"
sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})
sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))
sub2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2})
sub2['Is_Response'] = sub2['Is_Response'].map(lambda x: to_labels(x))
sub1 = sub1[['User_ID', 'Is_Response']]
sub2 = sub2[['User_ID', 'Is_Response']]
## write submission files
sub1.to_csv('submissions/sub1_cv.csv', index=False)
sub2.to_csv('submissions/sub2_tf.csv', index=False)
Warning (from warnings module):
File "C:\Python 3.6\program\happyness\happy.py", line 74
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/sta...ersus-copy
Traceback (most recent call last):
File "C:\Python 3.6\program\happyness\happy.py", line 77, in <module>
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)
File "C:\Python 3.6\lib\site-packages\pandas\core\reshape\concat.py", line 207, in concat
return op.get_result()
File "C:\Python 3.6\lib\site-packages\pandas\core\reshape\concat.py", line 407, in get_result
copy=self.copy)
File "C:\Python 3.6\lib\site-packages\pandas\core\internals.py", line 4832, in concatenate_block_managers
placement=placement) for placement, join_units in concat_plan]
File "C:\Python 3.6\lib\site-packages\pandas\core\internals.py", line 4832, in <listcomp>
placement=placement) for placement, join_units in concat_plan]
File "C:\Python 3.6\lib\site-packages\pandas\core\internals.py", line 4945, in concatenate_join_units
concat_values = concat_values.copy()
MemoryError
Reply
#2
Quote:
train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]
This is just setting train_feats['Is_Response'] to a list of ones or zeros, where before looked like a list of strings?
Quote:>>> data = ['happy', 'not','somewhat']
>>> [1 if x == 'happy' else 0 for x in data]
[1, 0, 0]
Recommended Tutorials:
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Memory Error jason413 6 7,954 Jun-21-2018, 06:35 PM
Last Post: nilamo
  Memory error while recursively adding np.arrays Afterdarkreader 0 4,005 Dec-22-2017, 04:02 PM
Last Post: Afterdarkreader
  Memory error in python 2.7 Afterdarkreader 4 6,491 Dec-20-2017, 02:26 AM
Last Post: Afterdarkreader

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020