Python Forum

Hello. I am testing some code, and it's not working:

from bs4 import BeautifulSoup
import bs4
import requests
import os
import re

def getHTMLText(url, timeout=10):
    """Download *url* and return the decoded page text.

    A desktop-browser ``User-Agent`` header is sent with the request.  Before
    decoding, the response encoding is replaced with the one ``requests``
    detects from the body (``apparent_encoding``).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded body as ``str``, or the sentinel string ``"NO"`` when the
        request fails, times out, or the server answers with an HTTP error
        status.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/51.0.2704.103 Safari/537.36'
        ),
    }
    try:
        # Without a timeout a stalled connection would block the script forever.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel; Ctrl-C and genuine
        # programming errors are no longer silently swallowed.
        return "NO"
	
def main():
        """Request 100 comment-page URLs and pick text out of each page.

        For every ``.comments`` element the text of each ``a`` tag is assigned
        to ``u1`` and the text of each ``.short`` element to ``u2``.  Both
        names are local to this function; nothing is printed or returned here.
        If a selector matches nothing, the corresponding name is never bound.
        """
        for i in range(100):
                # The base ends in "?start" with no "=", so the offset is glued
                # straight on: "...comments?start0", "...comments?start20", ...
                url = 'https://movie.douban.com/subject/30362186/comments?start'
                url = url +str(20*i)
                html = getHTMLText(url)
                soup = BeautifulSoup(html, 'html.parser')
                for new in soup.select('.comments'):
                        # Each iteration overwrites u1, so only the last link's
                        # text is left when the loop ends.
                        for b in new.select('a'):
                                u1=b.text
                        # Likewise only the last ".short" text is kept in u2.
                        for c in new.select('.short'):
                                u2=c.text
			
# NOTE: this statement is at module level (column 0), not inside main().
# It executes before main() is called below, and u1/u2 are locals of main()
# rather than module globals, so it raises NameError.
print(u1+u2)
		
main()

Error:Traceback (most recent call last):
  File "C:\Folder2\html parser.py", line 32, in <module>
    print(u1+u2)
NameError: name 'u1' is not defined

You must have the print() call inside the main() function.
There are several other problems: if I run a test, u1 and u2 produce no output.
select('.comments') doesn't find anything; the selector has to be more specific.
The for loop just fetches the same page every time. Here is the code with just a few fixes to get some output.

from bs4 import BeautifulSoup
import bs4
import requests
import os
import re

def getHTMLText(url, timeout=10):
    """Download *url* and return the raw response body.

    A desktop-browser ``User-Agent`` header is sent with the request.  The
    body is returned undecoded so that BeautifulSoup can work out the
    encoding itself instead of relying on ``r.apparent_encoding``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``bytes``, or the sentinel string ``"NO"`` when
        the request fails, times out, or the server answers with an HTTP
        error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    }
    try:
        # Without a timeout a stalled connection would block the script forever.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        return r.content  # No text, let bs4 handle the decoding
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel; Ctrl-C and genuine
        # programming errors are no longer silently swallowed.
        return "NO"

def main():
    """Fetch two comment URLs and print the elements the selector matches.

    The URL is built by appending the offset (0, then 20) directly to a base
    that ends in ``?start``, giving ``...comments?start0`` and
    ``...comments?start20``.  Each page is parsed with ``html.parser`` and the
    list returned by ``soup.select`` is printed as-is.
    """
    base_url = "https://movie.douban.com/subject/30362186/comments?start"
    comment_selector = '#comments > div:nth-child(1) > div.comment'

    # Offsets 0 and 20: a deliberately short run for testing.
    for offset in range(0, 40, 20):
        page_url = base_url + str(offset)
        #print(page_url)
        page = getHTMLText(page_url)
        soup = BeautifulSoup(page, "html.parser")
        matches = soup.select(comment_selector)
        print(matches)
        # The original extraction loop is kept below as an inert string.
        '''for new in soup.select(".comments"):
            for b in new.select("a"):
                u1 = b.text
            for c in new.select(".short"):
                u2 = c.text
            print(u1 + u2)'''

main()

These are the URLs the loop generates; for me they both just point to the same page (most likely because the `=` after `start` is missing):

https://movie.douban.com/subject/30362186/comments?start0
https://movie.douban.com/subject/30362186/comments?start20

A common mistake is to write the loop or other logic before having tested that the code actually works on a single page.
Then proceed in small steps: test the next page and a short loop, not 100 pages🌊

Melcu54

snippsat