Jun-21-2023, 11:46 AM
I am scraping some data from a local website, and everything works fine except for the while loop that handles proxies. I'm not sure exactly where the issue lies, but the order of the proxy retry steps, and therefore the output, is completely wrong and doesn't match my desired outcome. I have been scratching my head for two days without any success. Any advice would be helpful. Thanks!
Here is the current output:
- Proxy error occurred, retrying with a new proxy...
- Error: Failed to parse the HTML content.
- Proxy error occurred, retrying with a new proxy...
- Error: Failed to parse the HTML content.
- Connection failed, retrying...
- Connection failed, retrying...
- Connection failed, retrying...
- Error: Failed to parse the HTML content.
- Proxy error occurred, retrying with a new proxy...
- Error: Failed to parse the HTML content.
- Proxy error occurred, retrying with a new proxy...
- Error: Failed to parse the HTML content.
- Proxy error occurred, retrying with a new proxy...
- Error: Failed to parse the HTML content.
- Connection failed, retrying...
- Connection failed, retrying...
What I would like to achieve is something like this (see the sketch after the list):
- Proxy error occurred, retrying with a new proxy...
- Connection failed, retrying...
- Connection failed, retrying...
- Retrying with a new proxy...
- Connection failed, retrying...
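
In other words, I want the connection retries nested inside the proxy rotation: stay on the same proxy for up to three connection retries, and only switch proxies on a ProxyError or once those retries are exhausted. Here is a minimal sketch of that flow, just to illustrate the intent; `fetch_with_retries` and the retry counts are placeholders, not my actual code:

```python
# Minimal sketch of the retry order I'm after (placeholder names, not my real code):
# retry the connection on the same proxy, and only rotate to a new proxy
# after a ProxyError or once the connection retries are used up.
import random

import requests
from requests.exceptions import ProxyError


def fetch_with_retries(url, proxy_list, max_proxy_rotations=3, max_retries=3):
    for _ in range(max_proxy_rotations):
        proxy = random.choice(proxy_list) if proxy_list else None
        proxies = {'http': proxy, 'https': proxy} if proxy else None
        for _ in range(max_retries):
            try:
                return requests.get(url, proxies=proxies, timeout=10)
            except ProxyError:
                print("Proxy error occurred, retrying with a new proxy...")
                break  # leave the inner loop; the outer loop picks a new proxy
            except requests.exceptions.RequestException:
                print("Connection failed, retrying...")
        else:
            # all connection retries on this proxy failed
            print("Retrying with a new proxy...")
    return None
```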
Here is the relevant part of my script with all the important classes and functions. Please note: I removed unnecessary code, since the entire codebase has more than 700 lines.
```python
import os
import re
import random
import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import ProxyError
from bs4 import BeautifulSoup
import time


# Terminal colors
class Color:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'


class RequestHandler:
    def __init__(self, headers, proxies):
        self.headers = headers
        self.proxies = proxies

    def make_request(self, url):
        session = requests.Session()
        adapter = HTTPAdapter(max_retries=3)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        retries = 0
        while retries < 3:
            try:
                response = session.get(url, headers=self.headers,
                                       proxies=self.proxies, timeout=10)
                break
            except ProxyError:
                print(Color.FAIL + "Proxy error occurred, retrying with a new proxy..." + Color.ENDC)
                self.proxies = self.get_random_proxy()
                self.proxies = {'http': proxy, 'https': proxy}
                retries += 1
            except requests.exceptions.RequestException:
                print(Color.FAIL + "Connection failed, retrying..." + Color.ENDC)
                retries += 1

        if retries == 3:
            # Retry with a new proxy
            self.proxies = self.get_random_proxy()
            self.proxies = {'http': proxy, 'https': proxy}
            print(Color.WARNING + "Retrying with a new proxy..." + Color.ENDC)
            retries = 0
        else:
            print(Color.FAIL + f"Failed to connect to the handle: {handle}" + Color.ENDC)
            return


class DataScraper:
    def __init__(self, user_agents, proxy_list):
        self.user_agents = user_agents
        self.proxy_list = proxy_list

    def read_handles_from_file(self, file_path):
        with open(file_path, 'r') as file:
            handles = file.readlines()
        handles = [handle.strip() for handle in handles]
        return handles

    def get_random_user_agent(self):
        return random.choice(self.user_agents)

    def get_random_proxy(self):
        if self.proxy_list:
            return random.choice(self.proxy_list)
        else:
            return None

    def scrape_data(self, handle):
        try:
            headers = {'User-Agent': self.get_random_user_agent()}
            proxy = self.get_random_proxy()
            proxies = {'http': proxy, 'https': proxy} if proxy else None

            url = "https://www.google.com"

            request_handler = RequestHandler(headers, proxies)
            response = request_handler.make_request(url)
            if response is None:
                print(Color.FAIL + f"Failed to connect to the handle: {handle}" + Color.ENDC)
                return
            # (HTML parsing with BeautifulSoup removed for brevity)
        except AttributeError as e:
            print("Error: Failed to parse the HTML content.")


if __name__ == "__main__":
    file_path = 'handles.txt'

    user_agents_file = "user_agents.txt"
    with open(user_agents_file, "r") as f:
        user_agents = [line.replace("\n", "") for line in f]

    proxies_file = "proxies.txt"
    with open(proxies_file, "r") as f:
        proxy_list = [line.replace("\n", "") for line in f]

    scraper = DataScraper(user_agents, proxy_list)
    handles = scraper.read_handles_from_file(file_path)
    for handle in handles:
        scraper.scrape_data(handle)
```