Python Forum
Project: Text Analyzer
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Project: Text Analyzer
#1
Thank you for reading this thread. I need help creating my Text Analyzer. I have been going at it for a few days now. I will present the class I am making along with the instructions given and a help file and of course the current error message. Of course I am not looking for anyone to do the whole thing for me, have to learn after all, so I will take the help step by step. I am working on Jupiter Notebook via Anaconda. I am fairly certain it is something super simple, that I am not putting my finger on it. I know the help file has builtins.object in the parenthesis after the class textanalyzer, but you get an object not found error when it's in. I know one thing that is confusing me is in some places in the help and instructions the type is typed out as "_src_type" and other places it is "src_type".

Instructions & Help
The easiest way to share this is via google drive
Instructions
Help

The Text Analyzer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import requests, re
from bs4 import BeautifulSoup
from collections import Counter
import statistics as stats
import string
#above was created by the school
#create your class here
class TextAnalyzer():  
    def __init__(self, src, _src_type = 'discover'):
        self.src = src
        self._src_type = _src_type
        if src == 'discover':
            if re.match(r"^http", src, re.I):
                _src_type = 'url'
            elif re.search(r"txt&", src, re.I):
                _src_type = 'path'
            else:
                _src_type = 'text'
        elif src == 'url':
            _src_type = 'url'
        elif src == 'path':
            _src_type = 'path'
        else:
            _src_type = 'text'
 
      
    #def char_distribution(self, casesensitive=False, letters_only=False):
 
    #def common_words(self, minlen=1, maxlen=100, count=10, casesensitive=False):
 
    #def plot_char_distribution(self, casesensitive=False, letters_only=False):
         
    #def plot_common_words(self, minlen=1, maxlen=100, count=10, casesensitive=False):
         
    #def reset_content(self):
         
    #def set_content_to_tag(self, tag, tag_id=None):

The Tester
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import unittest
 
path = 'pride-and-prejudice.txt'
text = '''The outlook wasn't brilliant for the Mudville Nine that day;
the score stood four to two, with but one inning more to play.
And then when Cooney died at first, and Barrows did the same,
a sickly silence fell upon the patrons of the game.'''
 
class TestTextAnalyzer(unittest.TestCase):
    def test_discover_url(self):
        ta = TextAnalyzer(url)
        self.assertEqual(ta._src_type, 'url')
    def test_discover_path(self):
        ta = TextAnalyzer(path)
        self.assertEqual(ta._src_type, 'path')
    def test_discover_text(self):
        ta = TextAnalyzer(text)
        self.assertEqual(ta._src_type, 'text')
'''
    def test_set_content_to_tag(self):
        ta = TextAnalyzer(url)
        ta.set_content_to_tag('div','content-main')
        self.assertEqual(ta._content[0:25], '\n\nAddress by Bill Clinton')
    def test_reset_content(self):
        ta = TextAnalyzer(url)
        ta.set_content_to_tag('div','content-main')
        ta.reset_content()
        self.assertEqual(ta._content[0], '<')
    def test_common_words(self):
        ta = TextAnalyzer(path, src_type='path')
        common_words = ta.common_words(minlen=5, maxlen=10)
        liz = common_words[0]
        self.assertEqual(liz[0],'ELIZABETH')
    def test_avg_word_length(self):
        ta = TextAnalyzer(text, src_type='text')
        self.assertEqual(ta.avg_word_length, 4.16)
    def test_word_count(self):
        ta = TextAnalyzer(text, src_type='text')
        self.assertEqual(ta.word_count, 45)
    def test_distinct_word_count(self):
        ta = TextAnalyzer(text, src_type='text')
        self.assertEqual(ta.distinct_word_count, 38)
    def test_char_distribution(self):
        ta = TextAnalyzer(text, src_type='text')
        char_dist = ta.char_distribution(letters_only=True)
        self.assertEqual(char_dist[1][1], 20)
    def test_positivity(self):
        ta = TextAnalyzer(text, src_type='text')
        positivity = ta.positivity
        self.assertEqual(positivity, -44)
'''       
suite = unittest.TestLoader().loadTestsFromTestCase(TestTextAnalyzer)
unittest.TextTestRunner().run(suite)

Current Error
Error:
FFF ====================================================================== FAIL: test_discover_path (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-6-46dd30f19dcd>", line 16, in test_discover_path self.assertEqual(ta._src_type, 'path') AssertionError: 'discover' != 'path' - discover + path ====================================================================== FAIL: test_discover_text (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-6-46dd30f19dcd>", line 19, in test_discover_text self.assertEqual(ta._src_type, 'text') AssertionError: 'discover' != 'text' - discover + text ====================================================================== FAIL: test_discover_url (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-6-46dd30f19dcd>", line 13, in test_discover_url self.assertEqual(ta._src_type, 'url') AssertionError: 'discover' != 'url' - discover + url ---------------------------------------------------------------------- Ran 3 tests in 0.002s FAILED (failures=3)
Reply
#2
_src_type is being asigned as a local variable instead a class attribute.
needs to be changed to self._src_type
Reply
#3
Where exactly do I make that change?

Nevermind it just dawned on me what you meant. In the if statements.

now I have the error....
Error:
F.. ====================================================================== FAIL: test_discover_path (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-4-46dd30f19dcd>", line 16, in test_discover_path self.assertEqual(ta._src_type, 'path') AssertionError: 'text' != 'path' - text + path ---------------------------------------------------------------------- Ran 3 tests in 0.002s FAILED (failures=1)

got it, I replaced the $ with \Z on line 15. Now I get.

Error:
... ---------------------------------------------------------------------- Ran 3 tests in 0.002s OK
Reply
#4
Alright, well been at the set_content_to_tag for hours now. With soup I have tried search() and find(), but I keep getting the error, well search gave me a different error so I switched back to find_all().

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests, re
from bs4 import BeautifulSoup
from collections import Counter
import statistics as stats
import string
 
#create your class here
class TextAnalyzer():  
    def __init__(self, src, _content = 'none', _orig_content = 'none', _src_type = 'discover'):
        self.src = src
        self._src_type = _src_type
        self._content = _content
        self._orig_content = _orig_content
        if self._src_type == 'discover':
            if re.match(r"^http", src, re.I):
                self._src_type = 'url'
            elif re.findall(r"txt\Z", src, re.I):
                self._src_type = 'path'
            else:
                self._src_type = 'text'
        elif src == 'url':
            self._src_type = 'url'
        elif src == 'path':
            self._src_type = 'path'
        else:
            self._src_type = 'text'
 
    def set_content_to_tag(self, tag, tag_id = None):
        self.tag = tag
        self.tag_id = tag_id
        r = requests.get(url, tag)
        _orig_content = r.text
        soup = BeautifulSoup(_orig_content, 'html.parser')
        _content = soup.find_all(tag, tag_id)
             
Error:
...F ====================================================================== FAIL: test_set_content_to_tag (__main__.TestTextAnalyzer) ---------------------------------------------------------------------- Traceback (most recent call last): File "<ipython-input-27-3d60d3736c30>", line 23, in test_set_content_to_tag self.assertEqual(ta._content[0:25], '\n\nAddress by Bill Clinton') AssertionError: 'none' != '\n\nAddress by Bill Clinton' - none + Address by Bill Clinton ---------------------------------------------------------------------- Ran 4 tests in 0.592s FAILED (failures=1)
Reply


Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020