Jun-13-2022, 03:35 AM
Good Day Everyone. I am having issues with web scaping as I am not sure why it does not want to scape. I am using xpath and also soup to gather the next URL to check if it works however it does not want to work. What am I doing wrong?
import requests from lxml import etree import html5lib from bs4 import BeautifulSoup from urllib.parse import urljoin import time, re import csv import time start = time.time() print('Starting Program') base ="https://www.studylight.org/lexicons/eng/hebrew/1.html" url = "https://www.studylight.org/lexicons/eng/hebrew/1.html" while True: request = requests.get(urljoin(base,url)) #Get URL server status soup = BeautifulSoup(request.content, 'html5lib') #Pass url content to Soup dom = etree.HTML(str(soup)) #Ini etree url = dom.xpath('/html/body/div[1]/div[3]/div[2]/div[4]/form/div/div[3]/div[2]/a') #Find Next Page URL url2 = urljoin(base,url) urltest2 = soup.find_all("span", class_="greek-hebrew fs-21") #Find next url print('Test First url', url2,' Test number 2 ' , urltest2) # #for line in soup.find_all('a'): # #print(urljoin(base,line.text))#.get('href')) if url2 in 'https://www.studylight.org/lexicons/eng/hebrew/3.html': # Page to Stop break # Break out of loop print('Program Completed')