# Build unique n-grams out of a page's text and count how many times each
# n-gram appears.
import re
import string
from collections import Counter, OrderedDict


def cleanInput(input):
    """Normalize raw page text into a list of upper-cased words.

    Collapses newlines/whitespace, removes footnote markers like ``[12]``,
    drops non-ASCII characters, upper-cases everything, strips surrounding
    punctuation, and keeps only words longer than one character (plus the
    single-letter English words 'a' and 'i').
    """
    # Rebind to a local name so we stop shadowing the builtin ``input``.
    text = re.sub(r'\n', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)   # Wikipedia footnote markers, e.g. [3]
    text = re.sub(r' +', " ", text)          # collapse runs of spaces
    # Round-trip through bytes to drop any non-ASCII characters.
    text = text.encode("UTF-8").decode("ascii", "ignore")
    text = text.upper()
    words = []
    for item in text.split(' '):
        item = item.strip(string.punctuation)
        # Keep multi-letter words plus the only valid one-letter English words.
        if len(item) > 1 or item.lower() in ('a', 'i'):
            words.append(item)
    return words


def getNgrams(input, n):
    """Return a dict-like mapping of each n-gram (space-joined) to its count.

    ``Counter`` is a ``dict`` subclass, so callers that expect a plain dict
    (``.items()``, ``in``, indexing) keep working unchanged. For input with
    fewer than ``n`` words the result is empty.
    """
    words = cleanInput(input)
    return Counter(" ".join(words[i:i + n]) for i in range(len(words) - n + 1))


if __name__ == "__main__":
    # Network-dependent demo: scrape the Wikipedia article and print 2-gram
    # frequencies, most common first. The third-party imports live here so the
    # module can be imported (e.g. for testing) without requests/bs4 installed
    # and without firing an HTTP request on import.
    import requests
    from bs4 import BeautifulSoup

    html = requests.get(
        "http://en.wikipedia.org/wiki/Python_(programming_language)")
    bsObj = BeautifulSoup(html.content, 'html.parser')
    content = bsObj.find("div", {"id": "mw-content-text"}).get_text()
    ngrams = getNgrams(content, 2)
    # key=lambda t: t[1] sorts the (ngram, count) pairs by their second
    # element — the count — and reverse=True puts the largest counts first.
    ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))
    print(ngrams)
    print("2-grams count is: " + str(len(ngrams)))

# ---
# Original question text attached to this snippet:
# "I'm working on this example of n-gram extraction in the book of web
# scraping. What surprised me is that the program didn't find any duplicate;
# for each n-gram that consists of two words there is the number '1'. In the
# book the given output is much different. For example,"
Output:('of the', 38)
is sorted first, so I'm wondering whether I'm doing something wrong. Could you also give me an explanation of this line:
ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))Particularly this chunk:
key=lambda t: t[1]
I don't understand what it is for...