# Forum post date: Jan-03-2019, 01:00 PM
# Code for implementing step-by-step the checks mentioned in the
# pre-processing phase. This takes a while to run as it processes
# ~500k sentences, so skip it entirely if the cached result exists.
if not os.path.isfile('final.sqlite'):
    final_string = []        # one cleaned byte-string per review
    all_positive_words = []  # stemmed words from positive (Score == 1) reviews
    all_negative_words = []  # stemmed words from negative (Score == 0) reviews

    # Hoist the Score column out of the loop: the original called
    # final['Score'].values on every iteration, rebuilding the numpy
    # array each time (O(n) per call -> accidental O(n^2) over 500k rows).
    scores = final['Score'].values

    for i, sent in enumerate(tqdm(final['Text'].values)):
        filtered_sentence = []
        sent = cleanhtml(sent)  # remove HTML tags before tokenising
        for w in sent.split():
            # cleanpunc(w) may itself contain spaces — e.g. w = "abc.def"
            # becomes "abc def" — so split again to get individual words;
            # otherwise "abc def" would be treated as a single token.
            for cleaned_word in cleanpunc(w).split():
                # Keep only alphabetic words longer than 2 characters
                # (boolean `and` short-circuits; the original's bitwise `&`
                # always evaluated both operands).
                if cleaned_word.isalpha() and len(cleaned_word) > 2:
                    lowered = cleaned_word.lower()  # computed once, reused
                    if lowered not in stop:
                        s = sno.stem(lowered).encode('utf8')
                        filtered_sentence.append(s)
                        # Score is binary, so the two branches are
                        # mutually exclusive — elif makes that explicit.
                        if scores[i] == 1:
                            all_positive_words.append(s)  # words from positive reviews
                        elif scores[i] == 0:
                            all_negative_words.append(s)  # words from negative reviews
        # Final cleaned review: space-joined stemmed byte strings.
        final_string.append(b" ".join(filtered_sentence))

    # Add a CleanedText column showing each review after pre-processing,
    # decoded back to str for storage in the SQLite table below.
    final['CleanedText'] = final_string
    final['CleanedText'] = final['CleanedText'].str.decode("utf-8")
# Persist the cleaned table so future runs can skip pre-processing.
# NOTE(review): in the original notebook this presumably ran inside the
# same `if not os.path.isfile('final.sqlite')` guard as the cleaning
# loop — confirm before running standalone.
conn = sqlite3.connect('final.sqlite')
conn.text_factory = str
try:
    # Redundant explicit-default kwargs (schema=None, index_label=None,
    # chunksize=None, dtype=None) and the unused cursor were dropped.
    final.to_sql('Reviews', conn, if_exists='replace', index=True)
finally:
    # Close even if to_sql raises, so the db file is not left locked.
    conn.close()

# Cache the per-class word lists for later analysis.
with open('positive_words.pkl', 'wb') as f:
    pickle.dump(all_positive_words, f)
# NOTE(review): the filename keeps the original's spelling ("negitive")
# so any downstream reader of this pickle continues to work.
with open('negitive_words.pkl', 'wb') as f:
    pickle.dump(all_negative_words, f)