I ran your code (with an added dictionary display) and got a count of 6772.
The only modification I made was adding the display_dict function.
ngrams.txt (Size: 140.46 KB / Downloads: 266)
Sorry for so many edits; attaching the file gave me a bit of trouble.
The only modification I made was adding the display_dict function.
# getting unique ngrams out of duplicates and counting how many times ngram appears
import re
import string
from collections import Counter, OrderedDict

# Bracketed numeric markers such as "[12]" (the pattern also matches "[]").
_CITATION_RE = re.compile(r"\[[0-9]*\]")
# Runs of one or more spaces, collapsed to a single space.
_SPACES_RE = re.compile(r" +")
# One-character tokens that are kept; every other one-character token is dropped.
_SINGLE_LETTER_WORDS = ("a", "i")


def cleanInput(input: str) -> list:
    """Normalise raw text into a list of upper-cased word tokens.

    Newlines become spaces, bracketed numeric markers are removed, runs of
    spaces are collapsed, non-ASCII characters are dropped and the text is
    upper-cased.  The result is split on single spaces; each token has
    leading/trailing punctuation stripped and is kept only if it is longer
    than one character or is the single letter "A" or "I".

    Args:
        input: The text to clean.  (The name shadows the builtin but is kept
            so existing keyword callers continue to work.)

    Returns:
        The cleaned tokens, in their original order.
    """
    text = input.replace("\n", " ")
    text = _CITATION_RE.sub("", text)
    text = _SPACES_RE.sub(" ", text)
    # Round-trip through bytes to discard every character outside ASCII.
    text = text.encode("utf-8").decode("ascii", "ignore")
    text = text.upper()

    stripped = (raw.strip(string.punctuation) for raw in text.split(" "))
    return [
        word
        for word in stripped
        if len(word) > 1 or word.lower() in _SINGLE_LETTER_WORDS
    ]


def getNgrams(input: str, n: int) -> dict:
    """Count how often each n-gram occurs in the cleaned form of *input*.

    Args:
        input: Raw text; it is passed through ``cleanInput`` first.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A plain ``dict`` mapping each space-joined n-gram to its number of
        occurrences, keyed in order of first appearance.  The dict is empty
        when the text has fewer than *n* tokens.
    """
    tokens = cleanInput(input)
    counts = Counter(
        " ".join(tokens[start:start + n])
        for start in range(len(tokens) - n + 1)
    )
    # Return a plain dict, exactly as callers of the original received.
    return dict(counts)


def display_dict(thedict):
    """Print every entry of *thedict*, recursing into nested dicts.

    A key whose value is a dict is printed as ``key:`` followed by that
    dict's entries; any other entry is printed as `` key: value``.
    """
    for key, value in thedict.items():
        if isinstance(value, dict):
            print(f'{key}:')
            display_dict(value)
        else:
            print(f' {key}: {value}')


def main():
    """Fetch the Wikipedia article on Python and print its 2-gram counts."""
    # Imported here so the pure text helpers above can be imported and used
    # without the third-party scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    # Without a timeout, requests can block forever on a stalled connection.
    html = requests.get(
        "http://en.wikipedia.org/wiki/Python_(programming_language)",
        timeout=30,
    )
    bsObj = BeautifulSoup(html.content, 'html.parser')
    content = bsObj.find("div", {"id": "mw-content-text"}).get_text()

    ngrams = getNgrams(content, 2)
    # Most frequent n-grams first; the sort is stable, so ties keep their
    # first-appearance order.
    ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))
    display_dict(ngrams)
    print(f"2-grams count is: {len(ngrams)}")


if __name__ == "__main__":
    main()
# results will add attachment
ngrams.txt (Size: 140.46 KB / Downloads: 266)
Sorry for so many edits; attaching the file gave me a bit of trouble.