Python Forum
Project: Text Analyzer
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Project: Text Analyzer
#1
Thank you for reading this thread. I need help creating my Text Analyzer. I have been going at it for a few days now. I will present the class I am making along with the instructions given and a help file and of course the current error message. Of course I am not looking for anyone to do the whole thing for me, have to learn after all, so I will take the help step by step. I am working on Jupiter Notebook via Anaconda. I am fairly certain it is something super simple, that I am not putting my finger on it. I know the help file has builtins.object in the parenthesis after the class textanalyzer, but you get an object not found error when it's in. I know one thing that is confusing me is in some places in the help and instructions the type is typed out as "_src_type" and other places it is "src_type".

Instructions & Help
The easiest way to share this is via google drive
Instructions
Help

The Text Analyzer
import requests, re
from bs4 import BeautifulSoup
from collections import Counter
import statistics as stats
import string
#above was created by the school
#create your class here
class TextAnalyzer():   
    def __init__(self, src, _src_type = 'discover'):
        self.src = src
        self._src_type = _src_type
        if src == 'discover':
            if re.match(r"^http", src, re.I):
                _src_type = 'url'
            elif re.search(r"txt&", src, re.I):
                _src_type = 'path'
            else:
                _src_type = 'text'
        elif src == 'url':
            _src_type = 'url'
        elif src == 'path':
            _src_type = 'path'
        else:
            _src_type = 'text'

     
    #def char_distribution(self, casesensitive=False, letters_only=False):

    #def common_words(self, minlen=1, maxlen=100, count=10, casesensitive=False):

    #def plot_char_distribution(self, casesensitive=False, letters_only=False):
        
    #def plot_common_words(self, minlen=1, maxlen=100, count=10, casesensitive=False):
        
    #def reset_content(self):
        
    #def set_content_to_tag(self, tag, tag_id=None):

The Tester
import unittest

url = 'https://www.webucator.com/how-to/address-by-bill-clinton-1997.cfm'
path = 'pride-and-prejudice.txt'
text = '''The outlook wasn't brilliant for the Mudville Nine that day;
the score stood four to two, with but one inning more to play.
And then when Cooney died at first, and Barrows did the same,
a sickly silence fell upon the patrons of the game.'''

class TestTextAnalyzer(unittest.TestCase):
    def test_discover_url(self):
        ta = TextAnalyzer(url)
        self.assertEqual(ta._src_type, 'url')
    def test_discover_path(self):
        ta = TextAnalyzer(path)
        self.assertEqual(ta._src_type, 'path')
    def test_discover_text(self):
        ta = TextAnalyzer(text)
        self.assertEqual(ta._src_type, 'text')
'''
    def test_set_content_to_tag(self):
        ta = TextAnalyzer(url)
        ta.set_content_to_tag('div','content-main')
        self.assertEqual(ta._content[0:25], '\n\nAddress by Bill Clinton')
    def test_reset_content(self):
        ta = TextAnalyzer(url)
        ta.set_content_to_tag('div','content-main')
        ta.reset_content()
        self.assertEqual(ta._content[0], '<')
    def test_common_words(self):
        ta = TextAnalyzer(path, src_type='path')
        common_words = ta.common_words(minlen=5, maxlen=10)
        liz = common_words[0]
        self.assertEqual(liz[0],'ELIZABETH')
    def test_avg_word_length(self):
        ta = TextAnalyzer(text, src_type='text')
        self.assertEqual(ta.avg_word_length, 4.16)
    def test_word_count(self):
        ta = TextAnalyzer(text, src_type='text')
        self.assertEqual(ta.word_count, 45)
    def test_distinct_word_count(self):
        ta = TextAnalyzer(text, src_type='text')
        self.assertEqual(ta.distinct_word_count, 38)
    def test_char_distribution(self):
        ta = TextAnalyzer(text, src_type='text')
        char_dist = ta.char_distribution(letters_only=True)
        self.assertEqual(char_dist[1][1], 20)
    def test_positivity(self):
        ta = TextAnalyzer(text, src_type='text')
        positivity = ta.positivity
        self.assertEqual(positivity, -44)
'''        
suite = unittest.TestLoader().loadTestsFromTestCase(TestTextAnalyzer)
unittest.TextTestRunner().run(suite)

Current Error
Error:
FFF ====================================================================== FAIL: test_discover_path (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-6-46dd30f19dcd>", line 16, in test_discover_path self.assertEqual(ta._src_type, 'path') AssertionError: 'discover' != 'path' - discover + path ====================================================================== FAIL: test_discover_text (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-6-46dd30f19dcd>", line 19, in test_discover_text self.assertEqual(ta._src_type, 'text') AssertionError: 'discover' != 'text' - discover + text ====================================================================== FAIL: test_discover_url (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-6-46dd30f19dcd>", line 13, in test_discover_url self.assertEqual(ta._src_type, 'url') AssertionError: 'discover' != 'url' - discover + url ---------------------------------------------------------------------- Ran 3 tests in 0.002s FAILED (failures=3)
Reply
#2
_src_type is being asigned as a local variable instead a class attribute.
needs to be changed to self._src_type
Reply
#3
Where exactly do I make that change?

Nevermind it just dawned on me what you meant. In the if statements.

now I have the error....
Error:
F.. ====================================================================== FAIL: test_discover_path (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-4-46dd30f19dcd>", line 16, in test_discover_path self.assertEqual(ta._src_type, 'path') AssertionError: 'text' != 'path' - text + path ---------------------------------------------------------------------- Ran 3 tests in 0.002s FAILED (failures=1)

got it, I replaced the $ with \Z on line 15. Now I get.

Error:
... ---------------------------------------------------------------------- Ran 3 tests in 0.002s OK
Reply
#4
Alright, well been at the set_content_to_tag for hours now. With soup I have tried search() and find(), but I keep getting the error, well search gave me a different error so I switched back to find_all().

import requests, re
from bs4 import BeautifulSoup
from collections import Counter
import statistics as stats
import string

#create your class here
class TextAnalyzer():   
    def __init__(self, src, _content = 'none', _orig_content = 'none', _src_type = 'discover'):
        self.src = src
        self._src_type = _src_type
        self._content = _content
        self._orig_content = _orig_content
        if self._src_type == 'discover':
            if re.match(r"^http", src, re.I):
                self._src_type = 'url'
            elif re.findall(r"txt\Z", src, re.I):
                self._src_type = 'path'
            else:
                self._src_type = 'text'
        elif src == 'url':
            self._src_type = 'url'
        elif src == 'path':
            self._src_type = 'path'
        else:
            self._src_type = 'text'

    def set_content_to_tag(self, tag, tag_id = None):
        self.tag = tag
        self.tag_id = tag_id
        r = requests.get(url, tag)
        _orig_content = r.text
        soup = BeautifulSoup(_orig_content, 'html.parser')
        _content = soup.find_all(tag, tag_id)
            
Error:
...F ====================================================================== FAIL: test_set_content_to_tag (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-27-3d60d3736c30>", line 23, in test_set_content_to_tag self.assertEqual(ta._content[0:25], '\n\nAddress by Bill Clinton') AssertionError: 'none' != '\n\nAddress by Bill Clinton' - none + Address by Bill Clinton ---------------------------------------------------------------------- Ran 4 tests in 0.592s FAILED (failures=1)
Reply


Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020