For my project, we have to create a TextAnalyzer class. This is my first attempt at creating anything this large, and I have confused myself.
Right now I am stuck on def avg_word_length. No matter what I do, it gives me an error. What I have written now raises a TypeError, and after multiple attempts to fix it, I have almost given up. Is there an issue in the code above it that needs to be fixed and that I can't see?
Any help would be greatly appreciated, before I go bald and lose my sanity.
Thank you.
Right now I am stuck on def avg_word_length. No matter what I do, it gives me an error. What I have written now raises a TypeError, and after multiple attempts to fix it, I have almost given up. Is there an issue in the code above it that needs to be fixed and that I can't see?
Any help would be greatly appreciated, before I go bald and lose my sanity.
Thank you.
import requests, re from bs4 import BeautifulSoup from collections import Counter import statistics as stats import string import operator import matplotlib.pyplot as plt plt.rcdefaults() class TextAnalyzer: "A Text Analyzer" def __init__(self, src, src_type='discover'): """Creates a object for analyzing text Keyword arguments: src (str) -- text, path to file, or url src_type (str) -- The type of input (text, path, url, discover)""" if isinstance(src, str) == False or len(src) <= 0: raise exception('Source must be a valid string, filepath or a valid URL') self._src = src self._src_type = src_type self._content = None self._origin_content = None self.get_values def get_values(self): if self._src.endswith('.txt'): self._src_type = 'path' self._content = self._orig_content=self.read_file(self._src) elif self._src.startswith('http'): self._src_type = 'url' r = requests.get(self._src) res = r.content self._orig_content = r.text self._content = res else: self._src_type = 'text' self._orig_content = self._src self._content = self._src def set_content_to_tag(self, tag, tag_id=None): """Changes _content to the text within a specific element of an HTML document. Keyword arguments: tag (str) -- Tag to read tag_id (str) -- ID of tag to read It's possible the HTML does not contain the tag being searched. You should use exception handling to catch any errors.""" soup = BeautifulSoup(self._orig_content, 'html.parser') content = soup.find('{}'.format(tag),{'id':'{}'.format(tag_id)}) if content == None: raise Exception ("Tag or attribute does not exist") self._content = content.getText() print(content) def reset_content(self): """Resets _content to full text that was originally loaded. Useful after a call to set_content_to_tag().""" self._content=self._orig_content def _words(self, casesensitive = False): """Returns words in _content as list. 
Keyword arguments: casesensitive (bool) -- If False makes all words uppercase.""" words = '' if casesensitive == False: i = 0 self._content = len(words) while i < self._content: words[i] = [word.strip(string.punctuation) for word in words[i]].upper() i += 1 return words def common_words(self, minlen=1, maxlen=100, count=10, casesensitive=False): """Returns a list of 2-element tuples of the structure (word, num), where num is the number of times word shows up in _content. Keyword arguments: minlen (int) - Minimum length of words to include. maxlen (int) - Maximum length of words to include. count (int) - Number of words to include. casesensitive (bool) -- If False makes all words uppercase""" word_list= self._words() min_max_words = [item for item in word_list if len(item) >= minlen and len(item) <= maxlen] com_words= Counter(min_max_words) list_com_words= sorted(com_words.items(), key=operator.itemgetter(1), reverse=True) return list_com_words[:count] def char_distribution(self, casesensitive=False, letters_only=False): """Returns a list of 2-element tuples of the format (char, num), where num is the number of times char shows up in _content. The list should be sorted by num in descending order. Keyword arguments: casesensitive (bool) -- Consider case? letters_only (bool) -- Exclude non-letters?""" str_words = ''.join(self._words(casesensitive)) if letters_only == True: str_words = (re.sub('[_\W\d]+', '', str_words)) char_list = Counter(str_words) chars_list= sorted(char_list.items(), key=operator.itemgetter(1), reverse=True) return chars_list def plot_common_words(self, minlen=1, maxlen=100, count=10, casesensitive=False): """Plots most common words. Keyword arguments: minlen (int) -- Minimum length of words to include. maxlen (int) -- Maximum length of words to include. count (int) -- Number of words to include. 
casesensitive (bool) -- If False makes all words uppercase.""" word_list= self._words() word_list = [word.strip(string.punctuation) for word in word_list] min_max_words = [item for item in w_list if len(item) >= minlen and len(item) <= maxlen] com_words= Counter(min_max_words) list_com_words= sorted(com_words.items(), key=operator.itemgetter(1), reverse=True) most_com_words= list_com_words[:count] keys1 = [] values1 = [] for item in most_com_words: keys1.append(item[0]) values1.append(item[1]) plt.bar(range(len(keys1)), values1, tick_label=keys1) # plt.savefig('bar.png') plt.title("Common Words") plt.show() def plot_char_distribution(self, casesensitive=False, letters_only=False): """Plots character distribution. Keyword arguments: casesensitive (bool) -- If False makes all words uppercase. letters_only (bool) -- Exclude non-letters?""" char_dist=self.char_distribution(casesensitive, letters_only) keys1 = [] values1 = [] for item in char_dist: keys1.append(item[0]) values1.append(item[1]) plt.bar(range(len(keys1)), values1, tick_label=keys1) plt.title("Character Distribution") plt.show() @property def avg_word_length(self): "Average word length" word= '' for char in '-.,\n': word = word.replace(char, ' ') word = word.lower() word_list = text.split() d = {} for word in word_list: if word not in d: d[word] = 0 d[word] += 1 average_word_length = sum(word)/len(word_list) return average_word_length('%.2f'%length) @property def distinct_word_count(self): "Number of distinct words in content" dis_words = Counter(self._words()) return len(dis_words) @property def positivity(self): tally = 0 words= self._words() neg_word_list= self.read_file('negative.txt', 'rb').split() neg_word_list = [item.decode('UTF-8') for item in neg_word_list] pos_word_list = self.read_file('positive.txt', 'r').split() for item in words: if item in pos_word_list: tally = tally + 1 if item in neg_word_list: tally = tally -1 return (round(tally / self.word_count * 1000)) @property def 
word_count(self): "Number of words in content including repeats, in all uppercase letters." return len(self._words()) @property def words(self): return self._words() text = '''The outlook wasn't brilliant for the Mudville Nine that day; the score stood four to two, with but one inning more to play. And then when Cooney died at first, and Barrows did the same, a sickly silence fell upon the patrons of the game.''' url = 'https://www.webucator.com/how-to/address-by-bill-clinton-1997.cfm' path = "pride-and-prejudice.txt" #ta = TextAnalyzer(url) ta=TextAnalyzer(path) #ta = TextAnalyzer(text) # print(ta._src) # print(ta._src_type) #print(ta._orig_content) # print(ta._orig_content) #fsta.set_content_to_tag('div','content-main') #ta.set_content_to_tag('div' ,'device-xs visible-xs') #print(ta._content) #print(ta._words()) # ta.reset_content() # print(ta._content) #print("words", ta.words) print("word_count", ta.word_count) print("distinct_word_count", ta.distinct_word_count) print("avg_word_length", ta.avg_word_length) print("positivity", ta.positivity) print(ta.common_words(minlen=5, maxlen=10)) print("plot", ta.plot_common_words(minlen=5, maxlen=10)) print(ta.char_distribution()) print("plot", ta.plot_char_distribution(letters_only=True))
Error:TypeError Traceback (most recent call last)
<ipython-input-69-b3890944cb29> in <module>()
233 print("word_count", ta.word_count)
234 print("distinct_word_count", ta.distinct_word_count)
--> 235 print("avg_word_length", ta.avg_word_length)
236 print("positivity", ta.positivity)
237 print(ta.common_words(minlen=5, maxlen=10))
<ipython-input-69-b3890944cb29> in avg_word_length(self)
176 #words_list = [len(word) for word in words_list]
177
--> 178 average_word_length = sum(word)/len(word_list)
179
180 return average_word_length('%.2f'%length)
TypeError: unsupported operand type(s) for +: 'int' and 'str'