Oct-26-2020, 05:37 PM
I'm working my way through an excellent webscraping tutorial, and I'm on Lesson 10 here:
https://programminghistorian.org/en/less...requencies
But when I run the recommended code, using functions to delete stop words, I'm still left with a lot of numeric tokens (strings) that are counted in the word frequency list. I'd like to get rid of every numerical character.
Any recommendations as to how to use either the re module or some other function/module that would delete numerical tokens?
Below is my code (I'm working in Thonny 3.2.7 (which uses Python 3.7.7 as the interpreter, and Tk 8.6.8) on Mac Catalina; note that "obo" is a .py file created from recommendations in the tutorial; it's not part of the standard Python library):
# html-to-freq-2.py
import urllib.request, urllib.error, urllib.parse
import obo
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'
response = urllib.request.urlopen(url)
html = response.read()
text = obo.stripTags(html).lower()
fullwordlist = obo.stripNonAlphaNum(text)
wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
dictionary = obo.wordListToFreqDict(wordlist)
sorteddict = obo.sortFreqDict(dictionary)
for s in sorteddict: print(str(s))
#Note that the "obo" module is a .py file created with code recommended by the tutorial. That code is:
#For processing obo file Old Baily Online. Webscraping.
#Telling the module which tag to look for to start scraping: startLoc =
#And which tag defines the end point, where you want to stop scraping: endLoc =
def stripTags(pageContents):
pageContents = str(pageContents)
startLoc = pageContents.find("<p>")
endLoc = pageContents.rfind("<br/>")
pageContents = pageContents[startLoc:endLoc]
return pageContents
# Given a list of words, remove any that are in a list of stop words.
def removeStopwords(wordlist, stopwords):
return [w for w in wordlist if w not in stopwords]
#Setting the list of stop words. This, per ProgrammingHistorian, is supposed to go at the beginning of obo.py
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']
stopwords += ['p']
#Stripping out all non-alphanumeric characters, then importing the regex module "re"
#Also using Unicode to make sure all characters from all languages can be scraped by the module (that we're creating)
#And splitting the text
def stripNonAlphaNum(text):
import re
return re.compile(r'\W+', re.UNICODE).split(text)
# Given a list of words, return a dictionary of
# word-frequency pairs.
def wordListToFreqDict(wordlist):
wordfreq = [wordlist.count(p) for p in wordlist]
return dict(list(zip(wordlist,wordfreq)))
# Sort a dictionary of word-frequency pairs in order of descending frequency.
def sortFreqDict(freqdict):
aux = [(freqdict[key], key) for key in freqdict]
aux.sort()
aux.reverse()
return aux
#I want to point out to those who are, like me, learning the rudiments of webscraping, the first lesson in this awesome tutorial is here:
https://programminghistorian.org/en/less...stallation
Thanks for any help you can give me.
https://programminghistorian.org/en/less...requencies
But when I run the recommended code, using functions to delete stop words, I'm still left with a lot of numeric tokens (strings) that are counted in the word frequency list. I'd like to get rid of every numerical character.
Any recommendations as to how to use either the re module or some other function/module that would delete numerical tokens?
Below is my code (I'm working in Thonny 3.2.7 (which uses Python 3.7.7 as the interpreter, and Tk 8.6.8) on Mac Catalina; note that "obo" is a .py file created from recommendations in the tutorial; it's not part of the standard Python library):
# html-to-freq-2.py
import urllib.request, urllib.error, urllib.parse
import obo
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'
response = urllib.request.urlopen(url)
html = response.read()
text = obo.stripTags(html).lower()
fullwordlist = obo.stripNonAlphaNum(text)
wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
dictionary = obo.wordListToFreqDict(wordlist)
sorteddict = obo.sortFreqDict(dictionary)
for s in sorteddict: print(str(s))
#Note that the "obo" module is a .py file created with code recommended by the tutorial. That code is:
#For processing obo file Old Baily Online. Webscraping.
#Telling the module which tag to look for to start scraping: startLoc =
#And which tag defines the end point, where you want to stop scraping: endLoc =
def stripTags(pageContents):
pageContents = str(pageContents)
startLoc = pageContents.find("<p>")
endLoc = pageContents.rfind("<br/>")
pageContents = pageContents[startLoc:endLoc]
return pageContents
# Given a list of words, remove any that are in a list of stop words.
def removeStopwords(wordlist, stopwords):
return [w for w in wordlist if w not in stopwords]
#Setting the list of stop words. This, per ProgrammingHistorian, is supposed to go at the beginning of obo.py
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']
stopwords += ['p']
#Stripping out all non-alphanumeric characters, then importing the regex module "re"
#Also using Unicode to make sure all characters from all languages can be scraped by the module (that we're creating)
#And splitting the text
def stripNonAlphaNum(text):
import re
return re.compile(r'\W+', re.UNICODE).split(text)
# Given a list of words, return a dictionary of
# word-frequency pairs.
def wordListToFreqDict(wordlist):
wordfreq = [wordlist.count(p) for p in wordlist]
return dict(list(zip(wordlist,wordfreq)))
# Sort a dictionary of word-frequency pairs in order of descending frequency.
def sortFreqDict(freqdict):
aux = [(freqdict[key], key) for key in freqdict]
aux.sort()
aux.reverse()
return aux
#I want to point out to those who are, like me, learning the rudiments of webscraping, the first lesson in this awesome tutorial is here:
https://programminghistorian.org/en/less...stallation
Thanks for any help you can give me.