Jun-12-2018, 05:55 PM
I'm running python in a jupyter notebook, and it keeps crashing and giving me this error message. This is code I ran successfully last week! I haven't changed anything, but suddenly it won't work. I've updated jupyter, reset my computer multiple times, and checked the input files, but nothing has helped.
This is not code I wrote; it is an edited version of a topic-modeling script from GitHub. I don't have the knowledge or experience to know where the errors are, but everything runs fine until it reaches `btm.run()`.
Any ideas, or suggestions, are very much appreciated.
This is not code I wrote; it is an edited version of a topic-modeling script from GitHub. I don't have the knowledge or experience to know where the errors are, but everything runs fine until it reaches `btm.run()`.
Any ideas, or suggestions, are very much appreciated.
"""Stop-word filtering plus a Biterm Topic Model (BTM) trained by Gibbs sampling.

Pipeline:
  1. Read "nippur input.txt", drop stop words, and append the surviving
     tokens to "filteredtext.txt".
  2. Train a BTM on the filtered file and write the per-topic top words,
     the word-id map, the document-topic (theta) and topic-word (phi)
     distributions to the output directory.
"""

from collections import defaultdict
import operator
import os
import random
import time

import nltk  # noqa: F401 (kept from original)
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize  # noqa: F401 (kept from original)
from itertools import chain  # noqa: F401 (kept from original)
from glob import glob  # noqa: F401 (kept from original)

# ---------------------------------------------------------------------------
# Pre-processing: remove stop words from the raw input.
# NOTE(review): stopwords.words() normally takes a corpus fileid such as
# "english"; "nippurstopwords.txt" must be installed inside the nltk
# stopwords corpus directory for this call to succeed -- confirm, or read
# the stop-word file directly instead.
stop_words = set(stopwords.words("nippurstopwords.txt"))

with open("nippur input.txt") as source:
    raw_words = source.read().split()

# Open the output ONCE instead of re-opening it for every surviving word
# (the original opened/closed "filteredtext.txt" inside the loop).
# Mode "a" is kept from the original: re-running the script appends.
with open("filteredtext.txt", "a") as filtered:
    for raw_word in raw_words:
        if raw_word not in stop_words:
            filtered.write(" " + raw_word)


class BTM(object):
    """Biterm Topic Model with collapsed Gibbs sampling.

    A biterm is an unordered pair of distinct word ids co-occurring in one
    document, encoded as a single integer: word1 * 1000000 + word2 with
    word1 < word2.  NOTE: this encoding breaks if the vocabulary ever
    reaches 1,000,000 words.
    """

    def __init__(self, data_path, alpha, beta, num_iter, num_topic, output_dir):
        """Store hyper-parameters and initialise empty model state.

        data_path  -- path of the (already filtered) input corpus
        alpha      -- Dirichlet prior on the topic distribution
        beta       -- Dirichlet prior on the per-topic word distribution
        num_iter   -- number of Gibbs sampling iterations
        num_topic  -- number of topics
        output_dir -- directory receiving the model-final-*.txt files
        """
        self.data_path = data_path
        self.alpha = alpha
        self.beta = beta
        self.num_iter = num_iter
        self.num_topic = num_topic
        self.output_dir = output_dir
        self.word2Id = {}           # word -> integer id
        self.Id2Word = {}           # integer id -> word
        self.vocab_size = 0
        self.wordId_corpus = []     # one list of word ids per document
        self.biterms_in_doc = []    # per document: {encoded biterm: count}
        self.num_doc_biterm = defaultdict(int)  # replaced by a list in load_data()
        self.biterms = []           # flat list of encoded biterms
        self.topic_biterm = []      # current topic assignment of each biterm
        self.topic_word_num = []    # word id -> {topic id: count}; built in init_model()
        self.num_topic_biterm = []  # biterms currently assigned to each topic
        self.biterm_sum = {}        # cache for get_sum(): encoded biterm -> float

    def get_file_reader(self, path=None):
        """Return an open read handle; defaults to the training corpus."""
        if path is None:
            path = self.data_path
        return open(path, 'r')

    def get_file_writer(self, path, append=False):
        """Return a write (or append) handle inside the output directory."""
        mode = 'a' if append else 'w'
        return open(os.path.join(self.output_dir, path), mode)

    def print_params(self):
        """Dump the hyper-parameters and count tables to stdout (debug aid)."""
        params = ['alpha', 'beta', 'num_iter', 'num_topic',
                  'topic_word_num', 'num_topic_biterm', 'topic_biterm']
        for param in params:
            print(param, ':', getattr(self, param))
        print('-' * 40)

    def load_data(self):
        """Read the corpus, build word<->id maps and the id-encoded corpus."""
        with self.get_file_reader() as f:  # with-block guarantees the handle closes
            for line in f:
                curr_doc = []
                for word in line.split():
                    if word not in self.word2Id:
                        index = len(self.word2Id)
                        self.word2Id[word] = index
                        self.Id2Word[index] = word
                    curr_doc.append(self.word2Id[word])
                self.wordId_corpus.append(curr_doc)
        self.num_doc_biterm = [0] * len(self.wordId_corpus)

    def init_model(self):
        """Extract all biterms and randomly assign an initial topic to each."""
        for doc_number, doc in enumerate(self.wordId_corpus):
            doc_biterms = defaultdict(int)
            for word1 in doc:
                for word2 in doc:
                    if word1 < word2:
                        encoded = word1 * 1000000 + word2  # biterm encoding
                        doc_biterms[encoded] += 1
                        self.biterms.append(encoded)
                        self.num_doc_biterm[doc_number] += 1
            self.biterms_in_doc.append(doc_biterms)
        self.vocab_size = len(self.word2Id)
        self.topic_biterm = [0] * len(self.biterms)
        self.topic_word_num = {
            word_id: {topic_id: 0 for topic_id in range(self.num_topic)}
            for word_id in range(self.vocab_size)
        }
        if self.vocab_size:  # guard: original crashed here on an empty corpus
            print(len(self.topic_word_num), len(self.topic_word_num[0]))
        # BUG FIX: the original set this to [1]*num_topic and never counted
        # the random assignments below.  build_model() then decremented the
        # counts, driving them negative, which produced negative sampling
        # weights and eventually new_topic_id == -1 -- a KeyError on
        # topic_word_num[word][-1] (the crash inside btm.run()).
        self.num_topic_biterm = [0] * self.num_topic
        for biterm_index, biterm in enumerate(self.biterms):
            topic_id = random.randint(0, self.num_topic - 1)
            self.topic_word_num[biterm % 1000000][topic_id] += 1   # word2
            self.topic_word_num[biterm // 1000000][topic_id] += 1  # word1
            self.num_topic_biterm[topic_id] += 1                   # count the assignment
            self.topic_biterm[biterm_index] = topic_id

    def save_topic_words(self, topic_word_num=10):
        """Write the `topic_word_num` most probable words of every topic.

        BUG FIX: the original sorted scores ascending and took the first N,
        i.e. it wrote the LEAST probable words; sort descending instead.
        """
        with self.get_file_writer(path='model-final-topic-words.txt') as writer:
            for topic_id in range(self.num_topic):
                scores = {}
                # max(..., 1) guards a topic that currently owns no biterms.
                denom = max(self.num_topic_biterm[topic_id], 1) * 2
                for word_id in range(self.vocab_size):
                    scores[word_id] = self.topic_word_num[word_id][topic_id] / denom
                ranked = sorted(scores.items(), key=operator.itemgetter(1),
                                reverse=True)
                writer.write("Topic:" + str(topic_id) + '\n')
                for word_id, score in ranked[:topic_word_num]:
                    writer.write("\t" + str(self.Id2Word[word_id]) + "\t"
                                 + str(score) + '\n')

    def save_wordIds(self):
        """Write the word -> id mapping, one pair per line."""
        with self.get_file_writer(path='model-final-wordIds.txt') as writer:
            for key, value in self.word2Id.items():
                writer.write(str(key) + ' ' + str(value) + '\n')

    def get_sum(self, biterm):
        """Return (and cache) the normalising constant for one encoded biterm.

        Only valid after training: the cache is never invalidated, so this
        must not be called while the count tables are still changing.
        """
        if biterm not in self.biterm_sum:
            word1 = biterm // 1000000
            word2 = biterm % 1000000
            total = 0.0  # renamed from `sum`, which shadowed the builtin
            for topic_id in range(self.num_topic):
                total += ((self.num_topic_biterm[topic_id] + self.alpha)
                          * (self.topic_word_num[word1][topic_id] + self.beta)
                          * (self.topic_word_num[word2][topic_id] + self.beta)
                          / ((2 * self.num_topic_biterm[topic_id])
                             + (self.vocab_size * self.beta)) ** 2)
            self.biterm_sum[biterm] = total
        return self.biterm_sum[biterm]

    def save_theta(self):
        """Write P(topic | document), one row per document."""
        with self.get_file_writer(path='model-final-theta.txt') as writer:
            for doc_index, doc_biterms in enumerate(self.biterms_in_doc):
                for topic_id in range(self.num_topic):
                    one_sum = 0.0
                    for encoded, count in doc_biterms.items():
                        word1 = encoded // 1000000
                        word2 = encoded % 1000000
                        joint = ((self.num_topic_biterm[topic_id] + self.alpha)
                                 * (self.topic_word_num[word1][topic_id] + self.beta)
                                 * (self.topic_word_num[word2][topic_id] + self.beta)
                                 / ((2 * self.num_topic_biterm[topic_id])
                                    + (self.vocab_size * self.beta)) ** 2)
                        one_sum += ((count / self.num_doc_biterm[doc_index])
                                    * joint / self.get_sum(encoded))
                    writer.write(str(one_sum) + " ")
                writer.write('\n')

    def save_phi(self):
        """Write P(word | topic), one row per topic."""
        with self.get_file_writer(path='model-final-phi.txt') as writer:
            for topic_id in range(self.num_topic):
                for word_id in self.Id2Word:
                    calculation = ((self.topic_word_num[word_id][topic_id] + self.beta)
                                   / ((self.num_topic_biterm[topic_id] * 2)
                                      + (self.vocab_size * self.beta)))
                    writer.write(str(calculation) + ' ')
                writer.write('\n')

    def build_model(self):
        """Run the collapsed Gibbs sampler for num_iter iterations."""
        for it in range(self.num_iter):
            start_time = time.time()
            for biterm_index, old_topic_id in enumerate(self.topic_biterm):
                word1 = self.biterms[biterm_index] // 1000000
                word2 = self.biterms[biterm_index] % 1000000
                # Remove the biterm's current assignment from the counts.
                self.topic_word_num[word1][old_topic_id] -= 1
                self.topic_word_num[word2][old_topic_id] -= 1
                self.num_topic_biterm[old_topic_id] -= 1
                # Unnormalised topic weights, then a running (cumulative) sum.
                p = [0.0] * self.num_topic
                for k in range(self.num_topic):
                    p[k] = ((self.num_topic_biterm[k] + self.alpha)
                            * (self.topic_word_num[word1][k] + self.beta)
                            * (self.topic_word_num[word2][k] + self.beta)
                            / ((2 * self.num_topic_biterm[k])
                               + (self.vocab_size * self.beta)) ** 2)
                for k in range(1, self.num_topic):
                    p[k] += p[k - 1]
                # Sample the new topic by inverse transform sampling.
                # Fallback to the last topic instead of -1 so a floating-point
                # edge case can never index the count dicts with key -1.
                u = random.random() * p[-1]
                new_topic_id = self.num_topic - 1
                for k in range(self.num_topic):
                    if u < p[k]:
                        new_topic_id = k
                        break
                self.topic_word_num[word1][new_topic_id] += 1
                self.topic_word_num[word2][new_topic_id] += 1
                self.num_topic_biterm[new_topic_id] += 1
                self.topic_biterm[biterm_index] = new_topic_id
            print('Finished iteration:', it,
                  'Time taken:' + str(time.time() - start_time))

    def save_result(self):
        """Write all model outputs to output_dir."""
        self.save_topic_words(20)
        self.save_theta()
        self.save_wordIds()
        self.save_phi()

    def run(self):
        """Full pipeline: load data, initialise, train, save results."""
        self.load_data()
        self.init_model()
        self.build_model()
        self.save_result()


btm = BTM(data_path='../Topic Modeling/filteredtext.txt', alpha=2, beta=0.001,
          num_iter=10, num_topic=10, output_dir='.')
btm.run()
# The original called btm.save_result() a second time here; run() already
# saves everything, so the redundant duplicate call was removed.