Jupyter error - 'The kernel appears to have died, it will restart automatically'

meganhollie · Jun-12-2018, 05:55 PM

I'm running Python in a Jupyter notebook, and it keeps crashing and giving me this error message. This is code I ran successfully last week! I haven't changed anything, but suddenly it won't work. I've updated Jupyter, restarted my computer multiple times, and checked the input files, but nothing has helped.
This is not code I wrote, but an edited version of topic-modeling code from GitHub... I don't have the knowledge or experience to know where the errors are, but everything runs fine until it gets to [btm.run()].

Any ideas, or suggestions, are very much appreciated.

from collections import defaultdict
import operator
import os
import random
import time
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from itertools import chain
from glob import glob

# Read the raw corpus once; the context manager closes the handle as soon as
# the text is in memory.
with open("nippur input.txt") as corpus_file:
    file_content = corpus_file.read()
tokens = nltk.word_tokenize(file_content)

from nltk.corpus import stopwords
# NOTE(review): stopwords.words() is given a file id here, so presumably
# "nippurstopwords.txt" lives inside the NLTK stopwords corpus directory --
# confirm.
stop_words = set(stopwords.words("nippurstopwords.txt"))

# Whitespace-split view of the same text that was read above.
line = file_content
words = line.split()

# Mode "w" truncates the output first.  With append mode every re-run of this
# cell added another full copy of the corpus to filteredtext.txt, so the file
# (and the memory needed by anything that reads it) kept growing between runs.
# The file is also opened once instead of once per word.
with open("filteredtext.txt", "w") as filtered_file:
    for r in words:
        if r not in stop_words:
            # Each kept word is written with a leading space; no newlines are
            # added, so the whole output is a single line.
            filtered_file.write(" " + r)

class BTM(object):
    """Biterm topic model fitted by repeatedly resampling a topic per biterm.

    Each line of the input file is one document and each whitespace-separated
    token is a word.  Every pair of distinct word ids inside a document is a
    biterm; every biterm occurrence gets a topic that is resampled
    ``num_iter`` times.  Results are written as text files in ``output_dir``.

    Memory note: ``init_model`` stores one list entry per biterm occurrence,
    which is on the order of ``n**2 / 2`` entries for a document of ``n``
    words, so very long input lines need a very large amount of memory.

    Args:
        data_path: Path of the input text file.
        alpha: Smoothing constant added to the per-topic biterm count.
        beta: Smoothing constant added to the per-topic word count; it should
            be positive so the normalising denominator cannot be zero.
        num_iter: Number of full resampling passes over all biterms.
        num_topic: Number of topics.
        output_dir: Directory that receives the ``model-final-*.txt`` files.
    """

    # A biterm (word1, word2) with word1 < word2 is packed into one int as
    # word1 * _BITERM_BASE + word2, so word ids must stay below this value.
    _BITERM_BASE = 1000000

    def __init__(self, data_path, alpha, beta, num_iter, num_topic, output_dir):
        self.data_path = data_path
        self.alpha = alpha
        self.beta = beta
        self.num_iter = num_iter
        self.num_topic = num_topic
        self.output_dir = output_dir

        self.word2Id = {}  # word -> integer id, in order of first appearance
        self.Id2Word = {}  # integer id -> word
        self.vocab_size = 0

        self.wordId_corpus = []  # one list of word ids per document

        self.biterms_in_doc = []  # per document: encoded biterm -> count
        self.num_doc_biterm = []  # per document: number of biterm occurrences
        self.biterms = []  # every biterm occurrence, encoded as an int

        self.topic_biterm = []  # topic assigned to each entry of self.biterms
        self.topic_word_num = {}  # word id -> {topic id -> count}
        self.num_topic_biterm = []  # per topic: number of assigned biterms

        self.biterm_sum = {}  # cache: encoded biterm -> sum of topic weights

    def get_file_reader(self, path=None):
        """Open ``path`` (default: ``self.data_path``) for reading.

        The caller is responsible for closing the returned file object.
        """
        if path is None:
            path = self.data_path
        return open(path, 'r')

    def get_file_writer(self, path, append=False):
        """Open ``path`` inside ``self.output_dir`` for writing or appending.

        The caller is responsible for closing the returned file object.
        """
        mode = 'a' if append else 'w'
        return open(os.path.join(self.output_dir, path), mode)

    def print_params(self):
        """Print the hyper-parameters and the current count tables."""
        params = ['alpha', 'beta', 'num_iter', 'num_topic', 'topic_word_num', 'num_topic_biterm', 'topic_biterm']
        for param in params:
            print(param, ':', getattr(self, param))
            print('-' * 40)

    def load_data(self):
        """Read the input file and convert every line to a list of word ids.

        New words receive the next free id.  Documents are appended to
        ``self.wordId_corpus``.
        """
        with self.get_file_reader() as f:
            for line in f:
                curr_doc = []
                for word in line.split():
                    if word not in self.word2Id:
                        index = len(self.word2Id)
                        self.word2Id[word] = index
                        self.Id2Word[index] = word
                    curr_doc.append(self.word2Id[word])
                self.wordId_corpus.append(curr_doc)

        self.num_doc_biterm = [0] * len(self.wordId_corpus)

    def init_model(self):
        """Enumerate all biterms and give each a uniformly random topic.

        All derived state is rebuilt from ``self.wordId_corpus``, so calling
        this method again does not double-count anything.

        Raises:
            ValueError: If the vocabulary has more than ``_BITERM_BASE``
                words, because the integer biterm encoding would then map
                different word pairs to the same number.
        """
        base = self._BITERM_BASE
        self.vocab_size = len(self.word2Id)
        if self.vocab_size > base:
            raise ValueError(
                'vocabulary of %d words exceeds the biterm encoding limit of %d'
                % (self.vocab_size, base))

        self.biterms = []
        self.biterms_in_doc = []
        self.num_doc_biterm = [0] * len(self.wordId_corpus)
        self.biterm_sum = {}

        for doc_number, doc in enumerate(self.wordId_corpus):
            oneCop = defaultdict(int)
            for word1 in doc:
                for word2 in doc:
                    # word1 < word2 keeps one orientation of every pair of
                    # positions and drops pairs of identical words.
                    if word1 < word2:
                        item_num = word1 * base + word2
                        oneCop[item_num] += 1
                        self.biterms.append(item_num)
                        self.num_doc_biterm[doc_number] += 1
            self.biterms_in_doc.append(oneCop)

        self.topic_biterm = [0] * len(self.biterms)
        self.topic_word_num = {j: {i: 0 for i in range(self.num_topic)} for j in range(self.vocab_size)}
        # Every per-word table has num_topic entries; printing num_topic
        # directly also works when the vocabulary is empty.
        print(len(self.topic_word_num), self.num_topic)

        # Start from zero and count every initial assignment.  build_model
        # subtracts one from a topic before resampling, so counts that were
        # never added here would go negative and disagree with topic_word_num.
        self.num_topic_biterm = [0] * self.num_topic

        for biterm_index, biterm in enumerate(self.biterms):
            topic_id = random.randint(0, self.num_topic - 1)
            word1, word2 = divmod(biterm, base)
            self.topic_word_num[word1][topic_id] += 1
            self.topic_word_num[word2][topic_id] += 1
            self.num_topic_biterm[topic_id] += 1
            self.topic_biterm[biterm_index] = topic_id

    def _topic_weight(self, word1, word2, topic_id):
        """Return the unnormalised weight of ``topic_id`` for one biterm."""
        n_topic = self.num_topic_biterm[topic_id]
        return (n_topic + self.alpha) * (self.topic_word_num[word1][topic_id] + self.beta) * (self.topic_word_num[word2][topic_id] + self.beta) / ((2 * n_topic) + (self.vocab_size * self.beta))**2

    def save_topic_words(self, topic_word_num=10):
        """Write the ``topic_word_num`` highest-scoring words of every topic.

        A word's score is its count in the topic divided by twice the number
        of biterms in the topic; a topic without biterms scores every word 0.
        """
        with self.get_file_writer(path='model-final-topic-words.txt') as writer:
            for topic_id in range(self.num_topic):
                n_topic = self.num_topic_biterm[topic_id]
                topic_line = {}
                for word_id in range(len(self.word2Id)):
                    if n_topic:
                        topic_line[word_id] = self.topic_word_num[word_id][topic_id] / n_topic / 2
                    else:
                        topic_line[word_id] = 0.0
                # Highest score first, so the slice below keeps the top words
                # rather than the least relevant ones.
                sorted_topic_line = sorted(topic_line.items(), key=operator.itemgetter(1), reverse=True)
                writer.write("Topic:" + str(topic_id) + '\n')
                for topic_word, score in sorted_topic_line[:topic_word_num]:
                    writer.write("\t" + str(self.Id2Word[topic_word]) + "\t" + str(score) + '\n')

    def save_wordIds(self):
        """Write one ``word id`` line per vocabulary entry."""
        with self.get_file_writer(path='model-final-wordIds.txt') as writer:
            for key, value in self.word2Id.items():
                writer.write(str(key) + ' ' + str(value) + '\n')

    def get_sum(self, biterm):
        """Return the sum over all topics of the weight of ``biterm``.

        The value is cached per biterm; ``init_model`` and ``build_model``
        empty the cache because they change the counts it depends on.
        """
        if biterm not in self.biterm_sum:
            word1, word2 = divmod(biterm, self._BITERM_BASE)
            total = 0
            for topic_id in range(self.num_topic):
                total += self._topic_weight(word1, word2, topic_id)
            self.biterm_sum[biterm] = total
        return self.biterm_sum[biterm]

    def save_theta(self):
        """Write one line per document with its weight for every topic.

        For each topic the value is the sum, over the document's biterms, of
        the biterm's share of the document times the topic's normalised
        weight for that biterm.
        """
        base = self._BITERM_BASE
        with self.get_file_writer(path='model-final-theta.txt') as writer:
            for doc_index, doc_biterms in enumerate(self.biterms_in_doc):
                doc_total = self.num_doc_biterm[doc_index]
                for topic_id in range(self.num_topic):
                    one_sum = 0
                    for key, count in doc_biterms.items():
                        word1, word2 = divmod(key, base)
                        one_sum += (count / doc_total) * self._topic_weight(word1, word2, topic_id) / self.get_sum(key)
                    writer.write(str(one_sum) + " ")
                writer.write('\n')

    def save_phi(self):
        """Write one line per topic with the smoothed weight of every word."""
        with self.get_file_writer(path='model-final-phi.txt') as writer:
            for topic_id in range(self.num_topic):
                denominator = (self.num_topic_biterm[topic_id] * 2) + (self.vocab_size * self.beta)
                for word_id in self.Id2Word:
                    calculation = (self.topic_word_num[word_id][topic_id] + self.beta) / denominator
                    writer.write(str(calculation) + ' ')
                writer.write('\n')

    def build_model(self):
        """Resample the topic of every biterm ``num_iter`` times.

        For each biterm the current assignment is removed from the counts, a
        new topic is drawn with probability proportional to its weight, and
        the counts are updated with the new assignment.
        """
        base = self._BITERM_BASE
        for it in range(self.num_iter):
            start_time = time.time()
            for biterm_index, old_topic_id in enumerate(self.topic_biterm):
                word1, word2 = divmod(self.biterms[biterm_index], base)
                self.topic_word_num[word1][old_topic_id] -= 1
                self.topic_word_num[word2][old_topic_id] -= 1
                self.num_topic_biterm[old_topic_id] -= 1

                # Running totals of the topic weights.
                cumulative = []
                running = 0
                for k in range(self.num_topic):
                    running += self._topic_weight(word1, word2, k)
                    cumulative.append(running)

                u = random.random() * cumulative[-1]
                # Default to the last topic so that a draw which fails every
                # comparison below cannot leave an invalid topic id of -1.
                new_topic_id = self.num_topic - 1
                for k, bound in enumerate(cumulative):
                    if u < bound:
                        new_topic_id = k
                        break

                self.topic_word_num[word1][new_topic_id] += 1
                self.topic_word_num[word2][new_topic_id] += 1
                self.num_topic_biterm[new_topic_id] += 1

                self.topic_biterm[biterm_index] = new_topic_id

            print('Finished iteration:', it, 'Time taken:' + str(time.time() - start_time))

        # The counts changed, so previously cached get_sum() values are stale.
        self.biterm_sum = {}

    def save_result(self):
        """Write the topic-word, theta, word-id and phi output files."""
        self.save_topic_words(20)
        self.save_theta()
        self.save_wordIds()
        self.save_phi()

    def run(self):
        """Load the data, initialise and fit the model, then save the results."""
        self.load_data()
        self.init_model()
        self.build_model()
        self.save_result()

# NOTE(review): data_path points at '../Topic Modeling/filteredtext.txt' while
# the filtering step writes 'filteredtext.txt' in the current working
# directory -- verify that both refer to the same file.
btm = BTM(data_path='../Topic Modeling/filteredtext.txt', alpha=2, beta=0.001, num_iter=10, num_topic=10, output_dir='.')

# run() loads the data, initialises and trains the model, and already finishes
# by calling save_result(); calling btm.save_result() again afterwards would
# only repeat the whole output pass and rewrite the same files.
btm.run()

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	error handler appears to be turned off. How do I turn it back on?	jpotter0	0	961	Nov-26-2022, 11:44 AM Last Post: jpotter0
	Joining two jupyter notebooks and getting an error!	Led_Zeppelin	1	1,978	Oct-20-2022, 04:28 PM Last Post: deanhystad
	Setting up new Python kernel for JupyterLab Desktop on M1 Mac	daler6	0	1,864	Jun-20-2022, 03:45 AM Last Post: daler6
	Jupyter kernel restarts	russellm10	0	2,043	Sep-14-2021, 04:24 AM Last Post: russellm10
	Problem: Restart kernel onPydev console when trying to install a python package	poppy2020	1	9,557	Nov-25-2020, 06:13 PM Last Post: Larz60+
	How a Mac OS software can restart itself with admin permission in Python 3.7?	Formationgrowthhacking	0	2,382	Sep-03-2020, 05:29 PM Last Post: Formationgrowthhacking
	Using a button to kill and restart a script	duckredbeard	3	4,663	Sep-01-2020, 12:53 AM Last Post: duckredbeard
	How to convert what appears to be a JSON file to CSV	NewBeie	4	3,343	Aug-28-2020, 04:45 PM Last Post: Larz60+
	Print a certain string only the first time it appears in a test file	buttercup	5	3,901	Jul-23-2020, 01:30 PM Last Post: palladium
	fileinput package appears to be zeroing files	rexrf	0	1,917	Jul-01-2020, 06:05 PM Last Post: rexrf

Jupyter error - 'The kernel appears to have died, it will restart automatically'

User Panel Messages

Announcements