Jun-12-2018, 05:55 PM
I'm running python in a jupyter notebook, and it keeps crashing and giving me this error message. This is code I ran successfully last week! I haven't changed anything, but suddenly it won't work. I've updated jupyter, reset my computer multiple times, and checked the input files, but nothing has helped.
This is not code I wrote; it is an edited version of a topic-modeling script from GitHub. I don't have the knowledge or experience to know where the errors are, but everything runs fine until it reaches `btm.run()`.
Any ideas, or suggestions, are very much appreciated.
This is not code I wrote; it is an edited version of a topic-modeling script from GitHub. I don't have the knowledge or experience to know where the errors are, but everything runs fine until it reaches `btm.run()`.
Any ideas, or suggestions, are very much appreciated.
"""Stop-word filtering plus a Biterm Topic Model (BTM) trained by Gibbs sampling.

Pipeline:
  1. Read "nippur input.txt", drop stop words, and append the surviving
     tokens to "filteredtext.txt".
  2. Train a BTM on the filtered file and write the per-topic top words,
     the word-id map, the document-topic (theta) and topic-word (phi)
     distributions to the output directory.
"""

from collections import defaultdict
import operator
import os
import random
import time

import nltk  # noqa: F401 (kept from original)
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize  # noqa: F401 (kept from original)
from itertools import chain  # noqa: F401 (kept from original)
from glob import glob  # noqa: F401 (kept from original)

# ---------------------------------------------------------------------------
# Pre-processing: remove stop words from the raw input.
# NOTE(review): stopwords.words() normally takes a corpus fileid such as
# "english"; "nippurstopwords.txt" must be installed inside the nltk
# stopwords corpus directory for this call to succeed -- confirm, or read
# the stop-word file directly instead.
stop_words = set(stopwords.words("nippurstopwords.txt"))

with open("nippur input.txt") as source:
    raw_words = source.read().split()

# Open the output ONCE instead of re-opening it for every surviving word
# (the original opened/closed "filteredtext.txt" inside the loop).
# Mode "a" is kept from the original: re-running the script appends.
with open("filteredtext.txt", "a") as filtered:
    for raw_word in raw_words:
        if raw_word not in stop_words:
            filtered.write(" " + raw_word)


class BTM(object):
    """Biterm Topic Model with collapsed Gibbs sampling.

    A biterm is an unordered pair of distinct word ids co-occurring in one
    document, encoded as a single integer: word1 * 1000000 + word2 with
    word1 < word2.  NOTE: this encoding breaks if the vocabulary ever
    reaches 1,000,000 words.
    """

    def __init__(self, data_path, alpha, beta, num_iter, num_topic, output_dir):
        """Store hyper-parameters and initialise empty model state.

        data_path  -- path of the (already filtered) input corpus
        alpha      -- Dirichlet prior on the topic distribution
        beta       -- Dirichlet prior on the per-topic word distribution
        num_iter   -- number of Gibbs sampling iterations
        num_topic  -- number of topics
        output_dir -- directory receiving the model-final-*.txt files
        """
        self.data_path = data_path
        self.alpha = alpha
        self.beta = beta
        self.num_iter = num_iter
        self.num_topic = num_topic
        self.output_dir = output_dir
        self.word2Id = {}           # word -> integer id
        self.Id2Word = {}           # integer id -> word
        self.vocab_size = 0
        self.wordId_corpus = []     # one list of word ids per document
        self.biterms_in_doc = []    # per document: {encoded biterm: count}
        self.num_doc_biterm = defaultdict(int)  # replaced by a list in load_data()
        self.biterms = []           # flat list of encoded biterms
        self.topic_biterm = []      # current topic assignment of each biterm
        self.topic_word_num = []    # word id -> {topic id: count}; built in init_model()
        self.num_topic_biterm = []  # biterms currently assigned to each topic
        self.biterm_sum = {}        # cache for get_sum(): encoded biterm -> float

    def get_file_reader(self, path=None):
        """Return an open read handle; defaults to the training corpus."""
        if path is None:
            path = self.data_path
        return open(path, 'r')

    def get_file_writer(self, path, append=False):
        """Return a write (or append) handle inside the output directory."""
        mode = 'a' if append else 'w'
        return open(os.path.join(self.output_dir, path), mode)

    def print_params(self):
        """Dump the hyper-parameters and count tables to stdout (debug aid)."""
        params = ['alpha', 'beta', 'num_iter', 'num_topic',
                  'topic_word_num', 'num_topic_biterm', 'topic_biterm']
        for param in params:
            print(param, ':', getattr(self, param))
        print('-' * 40)

    def load_data(self):
        """Read the corpus, build word<->id maps and the id-encoded corpus."""
        with self.get_file_reader() as f:  # with-block guarantees the handle closes
            for line in f:
                curr_doc = []
                for word in line.split():
                    if word not in self.word2Id:
                        index = len(self.word2Id)
                        self.word2Id[word] = index
                        self.Id2Word[index] = word
                    curr_doc.append(self.word2Id[word])
                self.wordId_corpus.append(curr_doc)
        self.num_doc_biterm = [0] * len(self.wordId_corpus)

    def init_model(self):
        """Extract all biterms and randomly assign an initial topic to each."""
        for doc_number, doc in enumerate(self.wordId_corpus):
            doc_biterms = defaultdict(int)
            for word1 in doc:
                for word2 in doc:
                    if word1 < word2:
                        encoded = word1 * 1000000 + word2  # biterm encoding
                        doc_biterms[encoded] += 1
                        self.biterms.append(encoded)
                        self.num_doc_biterm[doc_number] += 1
            self.biterms_in_doc.append(doc_biterms)
        self.vocab_size = len(self.word2Id)
        self.topic_biterm = [0] * len(self.biterms)
        self.topic_word_num = {
            word_id: {topic_id: 0 for topic_id in range(self.num_topic)}
            for word_id in range(self.vocab_size)
        }
        if self.vocab_size:  # guard: original crashed here on an empty corpus
            print(len(self.topic_word_num), len(self.topic_word_num[0]))
        # BUG FIX: the original set this to [1]*num_topic and never counted
        # the random assignments below.  build_model() then decremented the
        # counts, driving them negative, which produced negative sampling
        # weights and eventually new_topic_id == -1 -- a KeyError on
        # topic_word_num[word][-1] (the crash inside btm.run()).
        self.num_topic_biterm = [0] * self.num_topic
        for biterm_index, biterm in enumerate(self.biterms):
            topic_id = random.randint(0, self.num_topic - 1)
            self.topic_word_num[biterm % 1000000][topic_id] += 1   # word2
            self.topic_word_num[biterm // 1000000][topic_id] += 1  # word1
            self.num_topic_biterm[topic_id] += 1                   # count the assignment
            self.topic_biterm[biterm_index] = topic_id

    def save_topic_words(self, topic_word_num=10):
        """Write the `topic_word_num` most probable words of every topic.

        BUG FIX: the original sorted scores ascending and took the first N,
        i.e. it wrote the LEAST probable words; sort descending instead.
        """
        with self.get_file_writer(path='model-final-topic-words.txt') as writer:
            for topic_id in range(self.num_topic):
                scores = {}
                # max(..., 1) guards a topic that currently owns no biterms.
                denom = max(self.num_topic_biterm[topic_id], 1) * 2
                for word_id in range(self.vocab_size):
                    scores[word_id] = self.topic_word_num[word_id][topic_id] / denom
                ranked = sorted(scores.items(), key=operator.itemgetter(1),
                                reverse=True)
                writer.write("Topic:" + str(topic_id) + '\n')
                for word_id, score in ranked[:topic_word_num]:
                    writer.write("\t" + str(self.Id2Word[word_id]) + "\t"
                                 + str(score) + '\n')

    def save_wordIds(self):
        """Write the word -> id mapping, one pair per line."""
        with self.get_file_writer(path='model-final-wordIds.txt') as writer:
            for key, value in self.word2Id.items():
                writer.write(str(key) + ' ' + str(value) + '\n')

    def get_sum(self, biterm):
        """Return (and cache) the normalising constant for one encoded biterm.

        Only valid after training: the cache is never invalidated, so this
        must not be called while the count tables are still changing.
        """
        if biterm not in self.biterm_sum:
            word1 = biterm // 1000000
            word2 = biterm % 1000000
            total = 0.0  # renamed from `sum`, which shadowed the builtin
            for topic_id in range(self.num_topic):
                total += ((self.num_topic_biterm[topic_id] + self.alpha)
                          * (self.topic_word_num[word1][topic_id] + self.beta)
                          * (self.topic_word_num[word2][topic_id] + self.beta)
                          / ((2 * self.num_topic_biterm[topic_id])
                             + (self.vocab_size * self.beta)) ** 2)
            self.biterm_sum[biterm] = total
        return self.biterm_sum[biterm]

    def save_theta(self):
        """Write P(topic | document), one row per document."""
        with self.get_file_writer(path='model-final-theta.txt') as writer:
            for doc_index, doc_biterms in enumerate(self.biterms_in_doc):
                for topic_id in range(self.num_topic):
                    one_sum = 0.0
                    for encoded, count in doc_biterms.items():
                        word1 = encoded // 1000000
                        word2 = encoded % 1000000
                        joint = ((self.num_topic_biterm[topic_id] + self.alpha)
                                 * (self.topic_word_num[word1][topic_id] + self.beta)
                                 * (self.topic_word_num[word2][topic_id] + self.beta)
                                 / ((2 * self.num_topic_biterm[topic_id])
                                    + (self.vocab_size * self.beta)) ** 2)
                        one_sum += ((count / self.num_doc_biterm[doc_index])
                                    * joint / self.get_sum(encoded))
                    writer.write(str(one_sum) + " ")
                writer.write('\n')

    def save_phi(self):
        """Write P(word | topic), one row per topic."""
        with self.get_file_writer(path='model-final-phi.txt') as writer:
            for topic_id in range(self.num_topic):
                for word_id in self.Id2Word:
                    calculation = ((self.topic_word_num[word_id][topic_id] + self.beta)
                                   / ((self.num_topic_biterm[topic_id] * 2)
                                      + (self.vocab_size * self.beta)))
                    writer.write(str(calculation) + ' ')
                writer.write('\n')

    def build_model(self):
        """Run the collapsed Gibbs sampler for num_iter iterations."""
        for it in range(self.num_iter):
            start_time = time.time()
            for biterm_index, old_topic_id in enumerate(self.topic_biterm):
                word1 = self.biterms[biterm_index] // 1000000
                word2 = self.biterms[biterm_index] % 1000000
                # Remove the biterm's current assignment from the counts.
                self.topic_word_num[word1][old_topic_id] -= 1
                self.topic_word_num[word2][old_topic_id] -= 1
                self.num_topic_biterm[old_topic_id] -= 1
                # Unnormalised topic weights, then a running (cumulative) sum.
                p = [0.0] * self.num_topic
                for k in range(self.num_topic):
                    p[k] = ((self.num_topic_biterm[k] + self.alpha)
                            * (self.topic_word_num[word1][k] + self.beta)
                            * (self.topic_word_num[word2][k] + self.beta)
                            / ((2 * self.num_topic_biterm[k])
                               + (self.vocab_size * self.beta)) ** 2)
                for k in range(1, self.num_topic):
                    p[k] += p[k - 1]
                # Sample the new topic by inverse transform sampling.
                # Fallback to the last topic instead of -1 so a floating-point
                # edge case can never index the count dicts with key -1.
                u = random.random() * p[-1]
                new_topic_id = self.num_topic - 1
                for k in range(self.num_topic):
                    if u < p[k]:
                        new_topic_id = k
                        break
                self.topic_word_num[word1][new_topic_id] += 1
                self.topic_word_num[word2][new_topic_id] += 1
                self.num_topic_biterm[new_topic_id] += 1
                self.topic_biterm[biterm_index] = new_topic_id
            print('Finished iteration:', it,
                  'Time taken:' + str(time.time() - start_time))

    def save_result(self):
        """Write all model outputs to output_dir."""
        self.save_topic_words(20)
        self.save_theta()
        self.save_wordIds()
        self.save_phi()

    def run(self):
        """Full pipeline: load data, initialise, train, save results."""
        self.load_data()
        self.init_model()
        self.build_model()
        self.save_result()


btm = BTM(data_path='../Topic Modeling/filteredtext.txt', alpha=2, beta=0.001,
          num_iter=10, num_topic=10, output_dir='.')
btm.run()
# The original called btm.save_result() a second time here; run() already
# saves everything, so the redundant duplicate call was removed.