Help with simple nltk Chatbot
Help with simple nltk Chatbot

I'm trying to build a simple chatbot using nltk. The problem with this code is that whenever I enter any input (e.g. "my name is bob"), the bot responds with "Great to hear that, How can I help you?" or "Nice to hear that". Those responses are supposed to be triggered only when I enter "I am fine", "i'm (.*) doing good" or "great".

For some reason it keeps using those two responses ("Great to hear that, How can I help you?" or "Nice to hear that") as the default no matter what I type. I had it working before (with the comments in the code included) and I don't know what changed.

I'm fairly new to nltk and I'm trying to learn but this issue has me stumped.
Can anyone help me figure out why it keeps defaulting to these responses?


Main Code:
import nltk
from nltk.chat.util import Chat, reflections

from Reflections import *
from Intents import *

def chat():
    """Start an interactive console session with the pattern-matching bot.

    Builds an NLTK ``Chat`` from ``pairs`` and ``reflections`` and hands
    control to its input loop.
    """
    print("Hi! I am a chatbot")
    # Named ``bot`` so the local does not shadow this function's own name.
    bot = Chat(pairs, reflections)
    # Without converse() the Chat object is built and then discarded unused.
    bot.converse()

#initiate the conversation
if __name__ == "__main__":
    chat()
Intents script:
# Pattern/response data passed to Chat(pairs, reflections) in the main script.
# NOTE(review): as pasted, the brackets that group each pattern with its
# response list (and the closing bracket of `pairs`) are missing, and some
# patterns/responses look out of order -- restore from the original file.
pairs = [
        ["Hello", "Hey there",]
        r"bye|goodbye| see ya",
        r"how are you ?",
        ["I'm doing good. How about you?",]
        # The trailing "|" in the next pattern adds an empty alternative, and
        # an empty alternative matches any input -- this is the bug the post
        # is about.
        r"I am fine|i'm (.*) doing good|great|",
        ["Great to hear that, How can I help you?","Nice to hear that",]
        r"my name is (.*)",
        ["Hello %1, How are you today?",]
        r"what is your name ?",
        ["I am a Baxter.",]
        r"sorry (.*)|sorry|(.*) sorry",
        ["It's alright","It's OK, never mind",]
        r"(.*) age?",
        ["I lost track.",]
        r"what (.*) want ?",
        ["Make me an offer I can't refuse",]
        r"(.*) created ?",
        ["I was created by you",]
        r"(.*) (sports|game) ?",
        ["I'm a very big fan of Soccer",]
        r"who (.*) sportsperson ?",
        r"who (.*) (moviestar|actor)?",
        ["Will Smith"]
        ["Until next time"]
Reflections Script:
# Reflection map handed to Chat(pairs, reflections): maps a phrase in the
# user's words to its counterpart from the bot's point of view
# (e.g. "i am" -> "you are").
reflections = {
  "i am"       : "you are",
  "i was"      : "you were",
  "i"          : "you",
  "i'm"        : "you are",
  "i'd"        : "you would",
  "i've"       : "you have",
  "i'll"       : "you will",
  "my"         : "your",
  "you are"    : "I am",
  "you were"   : "I was",
  "you've"     : "I have",
  "you'll"     : "I will",
  "your"       : "my",
  "yours"      : "mine",
  "you"        : "me",
  "me"         : "you"
}
I finally figured it out. I had an extra |.
That was driving me mad for so long! Doh

Old (error: |great|):
        r"I am fine|i'm (.*) doing good|great|",
        ["Great to hear that, How can I help you?","Nice to hear that",]
New (Working: |great)
        r"I am fine|i'm (.*) doing good|great",
        ["Great to hear that, How can I help you?","Nice to hear that",]
I was just messing around with it and noticed that the | was out of place. That happened to be the error but I don't know why.
If anyone knows, could you please explain why that caused that error?
I don't know how the nltk module works, but I'm guessing that the r-strings are regular expressions that it tries to match against the input. I'll bet the extra `|` delimiter adds an empty alternative at the end of the pattern, and an empty pattern matches any input (and does so immediately).
Hello, I searched for how to do this and managed to write this code for 32-bit systems. The function that summarizes the articles does not work on 64-bit systems with the new gensim version. The chatbot looks for its answers on Wikipedia through the Tor network and on DuckDuckGo.
For the script to work, create a database file bot.db and create the table with sqlite:
c.execute("CREATE TABLE conversation (ask TEXT, answer TEXT)")

#!/usr/bin/env python
import sys
# Pick the GUI package matching the running interpreter.
if sys.version_info[0] >= 3:
    import PySimpleGUI as sg
else:
    # Python 2 uses the separate PySimpleGUI27 package; without this `else`
    # the 2.7 import also ran (and rebound `sg`) on Python 3.
    import PySimpleGUI27 as sg

import math
import httpx
from gensim.summarization.textcleaner import split_sentences
from rake_nltk import Rake
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
import sys
from bs4 import BeautifulSoup
import requests
import heapq
import nltk
from gensim.summarization.summarizer import summarize
import csv
import time
import socks
import socket
import random
from random import choice
import io
from googletrans import Translator
import numpy as np
import string
#import urllib.request
import urllib.request as urllib2
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.feature_extraction.text import HashingVectorizer
import colorama
from colorama import init
from colorama import Fore, Back, Style
from paraphraser import paraphrase
import sqlite3
import base64
import subprocess
import platform
import heapq
#from googlesearch import search
from duckpy import Client
import httplib2


# Search client from the duckpy package (see `from duckpy import Client`).
client = Client()

#client = Client(proxies=['', ''])
# NOTE(review): `timeout` is rebound to the int 5 inside the SEND handler of
# the event loop; no visible code reads this httpx.Timeout before that.
timeout = httpx.Timeout(5)

# Module-level SQLite connection and cursor; `c` is used by databasefunct and
# by the event loop to read the `conversation` table.
conn = sqlite3.connect("bot.db")
c = conn.cursor()
# googletrans translator; not referenced by any other visible code.
translator = Translator()

def get_tor_session():
    """Return a ``requests`` session whose traffic is routed through Tor.

    Returns:
        requests.Session: session with HTTP and HTTPS proxied through the
        local Tor SOCKS5 listener.
    """
    session = requests.session()
    # Tor uses the 9050 port as the default socks port.  A bare 'socks5://'
    # carries no host or port, so no proxy could be reached with it.
    tor_proxy = 'socks5://127.0.0.1:9050'
    session.proxies = {'http': tor_proxy, 'https': tor_proxy}
    return session

def f(seq):
    """Return the items of *seq* as a list, keeping only first occurrences.

    Items must be hashable; the original order is preserved.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

def summary(x, perc):
    """Summarize *x* with gensim when it is long enough, else return it as is.

    Args:
        x: Text to summarize.
        perc: Passed to ``summarize`` as ``ratio``.

    Returns:
        The de-duplicated summary sentences joined by newlines when *x* has
        more than 10 sentences; otherwise *x* unchanged.
    """
    if len(split_sentences(x)) > 10:
        test_summary = summarize(x, ratio=perc, split=True)
        test_summary = '\n'.join(map(str, f(test_summary)))
    else:
        # Too short to summarize.  Without this branch the summary computed
        # above was always overwritten by x, and short texts reached the
        # return with `test_summary` unbound.
        test_summary = x
    return test_summary

# Pool of desktop browser User-Agent strings that random_headers() draws from.
desktop_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
]


def random_headers():
    """Build HTTP request headers with a randomly chosen desktop User-Agent.

    Returns:
        dict: ``'User-Agent'`` (one entry of ``desktop_agents``) and a fixed
        ``'Accept'`` value.
    """
    user_agent = choice(desktop_agents)
    accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    return {'User-Agent': user_agent, 'Accept': accept}


def remove_text_inside_brackets(text, brackets="[]"):
    """Return *text* with every balanced bracketed span removed.

    Args:
        text: Input string.
        brackets: Concatenated open/close pairs, e.g. ``"[]"`` or ``"()[]"``.

    Returns:
        *text* without the brackets and their (possibly nested) contents.
        A closing bracket that has no matching opener is kept as ordinary
        text.
    """
    count = [0] * (len(brackets) // 2)  # count open/close brackets
    saved_chars = []
    for character in text:
        for i, b in enumerate(brackets):
            if character == b:  # found bracket
                kind, is_close = divmod(i, 2)
                count[kind] += (-1)**is_close  # `+1`: open, `-1`: close
                if count[kind] < 0:  # unbalanced bracket
                    count[kind] = 0  # keep it
                else:  # found bracket to remove
                    # Skip the for-else below so the bracket is not saved.
                    break
        else:  # character is not a [balanced] bracket
            if not any(count):  # outside brackets
                saved_chars.append(character)
    return ''.join(saved_chars)

def summary_nltk(article_text):
    """Return an extractive summary of *article_text* (at most 7 sentences).

    Words are scored by relative frequency (English stop words excluded).
    Each sentence with fewer than 30 space-separated words accumulates the
    scores of its words, and the highest-scoring sentences are returned.

    Args:
        article_text: Plain text to summarize.

    Returns:
        The selected sentences joined by single spaces, or ``''`` when the
        text contains no scorable words.
    """
    sentence_list = nltk.sent_tokenize(article_text)

    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Letters-only, lower-cased copy used for counting.  This name was
    # referenced but never defined.  Lower-casing matters because sentences
    # are scored on `sent.lower()` tokens below.
    formatted_article_text = re.sub(r'[^a-zA-Z]+', ' ', article_text).lower()

    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_article_text):
        if word not in stopwords:
            # Count every occurrence (the increment used to run only on the
            # first sighting of a word).
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        # max() below would raise on an empty mapping.
        return ''

    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies:
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

    sentence_scores = {}
    for sent in sentence_list:
        if len(sent.split(' ')) >= 30:
            continue
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies:
                # Accumulate over all words of the sentence.
                sentence_scores[sent] = (
                    sentence_scores.get(sent, 0) + word_frequencies[word])

    summary_sentences = heapq.nlargest(
        7, sentence_scores, key=sentence_scores.get)

    return ' '.join(summary_sentences)

# Fallback used by the event loop when the connectivity check raises
# requests.ConnectionError: builds a reply for `user_input` from the rows of
# the `conversation` table.
# NOTE(review): this paste is incomplete (see the inline notes); the function
# does not parse as shown.
def databasefunct(user_input):
    final_txt = ' '
    totality_text = []
    jarvis_response = ''
    article_sentences = []
    seq = []
    rake = Rake()

    kw = rake.extract_keywords_from_text(user_input)

    # Key phrases of the question, as ranked by RAKE.
    seq = rake.get_ranked_phrases()
    vectorizer = TfidfVectorizer(
        sublinear_tf=True, encoding='latin-1', stop_words='english')

    X = vectorizer.fit_transform(seq)

    true_k = 1
    km = KMeans(n_clusters=true_k, init='k-means++',
                max_iter=100, n_init=1, random_state=1)
    # NOTE(review): right-hand side missing; presumably the KMeans fit on X --
    # confirm against the original.
    y =

    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        for ind in order_centroids[i, :2]:
            # `test` is overwritten on every pass and not read afterwards.
            test = ' %s' % terms[ind]

    c.execute('select * from conversation')
    records = c.fetchall()

    for record in records:
        # Both columns are decoded from base64 (the SEND handler encodes them
        # before inserting).
        ask = record[0]
        ask = base64.b64decode(ask)
        ask = str(ask, 'utf-8')
        answer = record[1]
        answer = base64.b64decode(answer)
        answer = str(answer, 'utf-8')

        # range(0, len(seq)-1) leaves out the last ranked phrase.
        length_temp_list = len(seq)-1
        for k in range(0, length_temp_list):
            text = seq[k]
            # NOTE(review): the body of this `if` is missing; presumably it
            # collects `answer` into totality_text -- confirm.
            if text in answer:
    first_sum = ''.join([str(item) for item in totality_text])

    # NOTE(review): call truncated; presumably loads the Punkt sentence
    # tokenizer from this resource path -- confirm.
    sent_tokenizer ='nltk:tokenizers/punkt/english.pickle')
    article_sentences = sent_tokenizer.tokenize(
        first_sum.strip(), realign_boundaries=True)

    article_words = word_tokenize(first_sum)

    wnlemmatizer = nltk.stem.WordNetLemmatizer()
    # first_sum is a str here, so this iterates over its characters.
    first_sum = [wnlemmatizer.lemmatize(word) for word in first_sum]
    # The result of this join is discarded.
    ' '.join(first_sum)
    # sublinear_tf=True
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True, min_df=1, stop_words="english")
    # NOTE(review): no fit of word_vectorizer is visible before transform().
    tfidf = word_vectorizer.transform(article_sentences)
    # Pairwise sentence similarity; the diagonal (self-similarity) is masked.
    similar_vector_values = tfidf * tfidf.T
    arr = similar_vector_values.toarray()
    np.fill_diagonal(arr, np.nan)
    # NOTE(review): no visible code adds user_input to article_sentences, so
    # index() looks likely to raise ValueError -- confirm.
    input_idx = article_sentences.index(user_input)
    result_idx = np.nanargmax(arr[input_idx])
    vector_matched = article_sentences[result_idx]
    if vector_matched and not vector_matched.isspace():
        return vector_matched
        # NOTE(review): an `else:` appears to be missing above; as written the
        # two lines below are unreachable.
        jarvis_response = "I am sorry, I could not understand you."
        return jarvis_response

# Online path: extracts key phrases from `user_input`, fetches pages and
# returns a summary of their text via summary().
# NOTE(review): this paste is incomplete (see the inline notes); the function
# does not parse as shown.
def mainfunct(user_input):

    # `session` is not used again in the visible code.
    session = get_tor_session()
    # detect presense of proxy and use env varibles if they exist
    pi = httplib2.proxy_info_from_environment()
    if pi:
        # NOTE(review): the proxy host is an empty string and SOCKS_PORT is
        # not defined anywhere in the visible code.
        socks.set_default_proxy(socks.SOCKS5, "",SOCKS_PORT)

    # now all calls through httplib2 should use the proxy settings

    rake = Rake()

    kw = rake.extract_keywords_from_text(user_input)

    ranked_phrases = rake.get_ranked_phrases()
    vectorizer = TfidfVectorizer(encoding='latin-1', stop_words='english')

    X = vectorizer.fit_transform(ranked_phrases)

    true_k = 1
    km = KMeans(n_clusters=true_k, init='k-means++',
                max_iter=100, n_init=1, random_state=1)
    # NOTE(review): right-hand side missing; presumably the KMeans fit on X --
    # confirm against the original.
    y =
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        for ind in order_centroids[i, :10]:
            # `test` is overwritten on every pass and not read afterwards.
            test = ' %s' % terms[ind]

    # range(0, len(ranked_phrases) - 1) leaves out the last ranked phrase.
    length = len(ranked_phrases) - 1
    final_txttxt = []
    for i in range(0, length):
        temp_list = []
        # NOTE(review): the URL templates look stripped (each is just '{}'),
        # and the url2/url4 calls are cut off.
        url1 = '{}'.format(ranked_phrases[i])
        url2 = '{}'.format(
        url3 = '{}'.format(ranked_phrases[i])
        url4 = '{}'.format(
        url5 = '{}'.format(ranked_phrases[i])
        url6 = '{}'.format(ranked_phrases[i])
        url7 = '{}'.format(ranked_phrases[i])

    searchfor = user_input
    # NOTE(review): right-hand side missing; presumably a search through the
    # module-level `client` -- confirm.
    results =

    # for result in search(searchfor):
    # temp_list.append(result)

    # NOTE(review): nothing visible appends to temp_list, and it is only bound
    # inside the loop above.
    length_temp_list = len(temp_list) - 1
    for k in range(0, length_temp_list):
        # deadline = time.time() + 20.0
        res = requests.get(temp_list[k], headers=random_headers())
        pagetext = res.text
        wiki = BeautifulSoup(pagetext, 'html.parser')
        totality_text = []
        # NOTE(review): loop header truncated; presumably iterates the page's
        # <p> elements -- confirm.
        for l in'p'):
            final_txt = ''.join([str(item) for item in totality_text])
            number_of_sentences = sent_tokenize(final_txt)
            if len(number_of_sentences) > 2:
                # Returns as soon as more than two sentences have been gathered.
                mysummary = summary(final_txt, 0.65)
                return mysummary
            # NOTE(review): the body of this branch is missing.
            elif len(number_of_sentences) < 2:

# Tokens recognised as a greeting.
greeting_inputs = (
    "hey", "good morning", "good evening",
    "morning", "evening", "hi", "whatsup",
)
# Replies generate_greeting_response() picks from.
greeting_responses = [
    "hey", "hey hows you?", "*nods*",
    "hello, how you doing", "hello", "Welcome, I am good and you",
]


def generate_greeting_response(greeting):
    """Return a random greeting reply when *greeting* contains a greeting word.

    The text is split on whitespace and each token is compared,
    case-insensitively, with ``greeting_inputs``.

    Returns:
        A random entry of ``greeting_responses``, or ``None`` when no token
        matches.
    """
    lowered_tokens = (piece.lower() for piece in greeting.split())
    if any(tok in greeting_inputs for tok in lowered_tokens):
        return random.choice(greeting_responses)
    return None

def generate_not_understand():
    """Return a randomly chosen paraphrase of the "could not understand" reply.

    The fixed apology sentence is handed to ``paraphrase`` and one of the
    returned variants is picked with ``random.choice``.
    """
    apology = "I am sorry, I could not understand you."
    variants = paraphrase(apology)
    return random.choice(variants)

# Builds the bot's reply for `user_input` from the text mainfunct() returns.
# NOTE(review): this paste is incomplete (see the inline notes); the function
# does not parse as shown.
def generate_response(user_input, first_sum):
    jarvis_response = ''
    article_sentences = []

    # The `first_sum` argument is overwritten here, so the caller's value is
    # never used.
    first_sum = mainfunct(user_input)

    # NOTE(review): call truncated; presumably loads the Punkt sentence
    # tokenizer from this resource path -- confirm.
    sent_tokenizer ='nltk:tokenizers/punkt/english.pickle')

    if first_sum is None:
        # NOTE(review): the continuation of this expression is missing, and an
        # `else:` appears to be missing before the tokenize call below.
        jarvis_response = jarvis_response + \
        return jarvis_response
        article_sentences = sent_tokenizer.tokenize(
            first_sum.strip(), realign_boundaries=True)

    article_words = word_tokenize(first_sum)

    wnlemmatizer = nltk.stem.WordNetLemmatizer()
    # If first_sum is a str, this iterates over its characters.
    first_sum = [wnlemmatizer.lemmatize(word) for word in first_sum]
    # The result of this join is discarded.
    ' '.join(first_sum)
    # sublinear_tf=True
    word_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(
        1, 3), min_df=0.001, max_df=0.50, max_features=10000, use_idf=True)
    # NOTE(review): no fit of word_vectorizer is visible before transform().
    tfidf = word_vectorizer.transform(article_sentences)
    # Pairwise sentence similarity; the diagonal (self-similarity) is masked.
    similar_vector_values = tfidf * tfidf.T
    arr = similar_vector_values.toarray()
    np.fill_diagonal(arr, np.nan)
    # NOTE(review): no visible code adds user_input to article_sentences, so
    # index() looks likely to raise ValueError -- confirm.
    input_idx = article_sentences.index(user_input)
    result_idx = np.nanargmax(arr[input_idx])
    vector_matched = article_sentences[result_idx]

    if vector_matched and not vector_matched.isspace():
        sentences = []

        # NOTE(review): the right-hand sides of the three `jarvis_response +`
        # continuations below are missing, as are the `else:` lines that
        # presumably separated these branches.
        jarvis_response = jarvis_response + \
        if jarvis_response is None:
            jarvis_response = jarvis_response + \
            return jarvis_response
            sentences = paraphrase(jarvis_response)
            sentence_item = random.choice(sentences)
            return sentence_item
        jarvis_response = jarvis_response + \
        return jarvis_response

# give our window a spiffy set of colors

# Window layout, top to bottom: a title line, an sg.Output pane, then a row
# with the multi-line input (key 'query') and the SEND / DB / EXIT buttons.
# NOTE(review): both SEND and DB are created with bind_return_key=True.
layout = [[sg.Text('J.A.R.V.I.S : Just A Rather Very Intelligent System.', size=(60, 1))],
          [sg.Output(size=(127, 30), font=('opensans 11'))],
          [sg.Multiline(size=(85, 5), enter_submits=True, key='query'),
           sg.Button('SEND', button_color=(
               "white", "Black"), bind_return_key=True),
           sg.Button('DB', button_color=(
               "white", "Black"), bind_return_key=True),
           sg.Button('EXIT', button_color=("white", "Black"))]]

# Main application window built from the layout above.
window = sg.Window('J.A.R.V.I.S',
                   default_element_size=(30, 2),
                   font=('opensans', ' 13'),
                   default_button_element_size=(8, 2)).Layout(layout)

# ---===--- Loop taking in user input and using it  --- #
# Event loop: SEND answers the typed query, DB prints the stored conversation,
# EXIT / window close is meant to quit.
# NOTE(review): this paste is incomplete (see the inline notes); the loop does
# not parse as shown.
while True:
    event, value = window.Read()
    #c.execute("CREATE TABLE conversation (ask TEXT, answer TEXT)")
    if event == 'SEND':
        # NOTE(review): the URL is empty here; presumably it named a site used
        # as a connectivity check -- confirm.
        url = ''
        # Rebinds the module-level `timeout` to the int 5.
        timeout = 5
            # NOTE(review): a `try:` appears to be missing above this indented
            # section; the matching `except requests.ConnectionError` is below.
            _ = requests.get(url, timeout=timeout)

            query = value['query'].rstrip()
            print('YOU : {}'.format(query))
            user_input = query.lower()

            # generate_response() calls mainfunct(user_input) again itself.
            first_sum = str(mainfunct(user_input))
            if user_input == 'thanks' or user_input == 'thank you very much' or user_input == 'thank you':
                # `continue_dialogue` is not read anywhere in the visible code.
                continue_dialogue = False
                print("J.A.R.V.I.S : Most welcome")
                # NOTE(review): `else:` lines appear to be missing around the
                # greeting check, and several calls below are cut off
                # (print, remove_text_inside_brackets, b64encode, and the
                # execute call before the insert statement).
                if generate_greeting_response(user_input) != None:
                    print("J.A.R.V.I.S : " +
                    machine_response = generate_response(user_input, first_sum)
                    clear_response = remove_text_inside_brackets(
                    print("J.A.R.V.I.S : " + clear_response)
                    # Question and reply are base64-encoded before storage.
                    encodedBytes = base64.b64encode(user_input.encode("utf-8"))
                    encodedStr = str(encodedBytes, "utf-8")
                    encodedBot = base64.b64encode(
                    encodedbot = str(encodedBot, "utf-8")
                        "insert into conversation (ask, answer) values (?, ?)", (encodedStr, encodedbot))
        except requests.ConnectionError:
            # Connectivity check failed: answer from the local database.
            query = value['query'].rstrip()
            print('YOU : {}'.format(query))
            user_input = query.lower()
            clear_response = databasefunct(user_input)
            print("J.A.R.V.I.S : " + clear_response)
    if event == 'DB':
        # Print the whole stored conversation, decoding each base64 column.
        c.execute('select * from conversation')
        records = c.fetchall()

        for record in records:
            ask = record[0]
            ask = base64.b64decode(ask)
            ask = str(ask, 'utf-8')
            answer = record[1]
            answer = base64.b64decode(answer)
            answer = str(answer, 'utf-8')
            print("YOU : ", ask)
            print("J.A.R.V.I.S : ", answer)

    # NOTE(review): this branch has no statement (presumably a `break`), so
    # nothing visible ever leaves the loop.
    elif event in (None, 'EXIT'):            # quit if exit button or X
        # conn.close()

