Nov-28-2017, 08:47 PM
This code isn't throwing errors, instead it's only giving me limited results based on "for i in range(20)" line. If I have it set as '20' it will only scrape 20 results, if I change it to '10' it will give me 10 etc.
It's ignoring the while loop requirements and I can't figure out why.
The script requires a test.csv file in working dir, here are some URLs: https://pastebin.com/TNYDBvTF
EXPAND CODE:
It's ignoring the while loop requirements and I can't figure out why.
The script requires a test.csv file in working dir, here are some URLs: https://pastebin.com/TNYDBvTF
EXPAND CODE:
import requests from bs4 import BeautifulSoup import re import pandas as pd import numpy as np import threading # COUNTER TO INCREMENT THROUGH NEW_LIST list_counter = 0 # CREATE NEW LIST FROM CSV new_list = df = pd.read_csv('test.csv') # df = dataframe # GET TOTAL LIST ITEMS FROM CSV list_total = len(df.index) - 1 # take away 1 other lists start at zero def worker(): """thread worker function""" global list_counter # set the variables as global global new_list global list_total global df while list_counter <= list_total: scrape = requests.get(df.iloc[list_counter, 0], headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"}) html = scrape.content soup = BeautifulSoup(html, 'html.parser') comment_search = soup.body.find_all(string=re.compile("Ethereum", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Ethereum Found' comment_search = soup.body.find_all(string=re.compile("Bitcoin", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Bitcoin Found' comment_search = soup.body.find_all(string=re.compile("Ether", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Ether Found' comment_search = soup.body.find_all(string=re.compile("Ripple", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Ripple Found' comment_search = soup.body.find_all(string=re.compile("Qtum", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Qtum' comment_search = soup.body.find_all(string=re.compile("Litecoin", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Litecoin' comment_search = soup.body.find_all(string=re.compile("DigitalCash", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'DigitalCash' comment_search = soup.body.find_all(string=re.compile("Monero", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Monero' comment_search = soup.body.find_all(string=re.compile("Zcash", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Zcash' comment_search = soup.body.find_all(string=re.compile("Bitcash", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Bitcash' comment_search = soup.body.find_all(string=re.compile("Stellar", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Stellar' comment_search = soup.body.find_all(string=re.compile("IOTA", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'IOTA' comment_search = soup.body.find_all(string=re.compile("Neo", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Neo' comment_search = soup.body.find_all(string=re.compile("Power Ledger", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Power Ledger' comment_search = soup.body.find_all(string=re.compile("OmiseGo", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'OmiseGo' comment_search = soup.body.find_all(string=re.compile("Stratis", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Stratis Found' comment_search = soup.body.find_all(string=re.compile("Waves", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Waves Found' comment_search = soup.body.find_all(string=re.compile("Walton", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Walton Found' comment_search = soup.body.find_all(string=re.compile("Hshare", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Hshare Found' comment_search = soup.body.find_all(string=re.compile("Einsteinium", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Einsteinium Found' comment_search = soup.body.find_all(string=re.compile("Stellar Lumens", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Stellar Found' comment_search = soup.body.find_all(string=re.compile("Lisk", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'Lisk Found' comment_search = soup.body.find_all(string=re.compile("NEM", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'NEM Found' comment_search = soup.body.find_all(string=re.compile("MonaCoin", re.IGNORECASE)) if len(comment_search) > 0: df.iloc[list_counter, 1] = 'MonaCoin Found' print(list_counter) list_counter = list_counter + 1 df = df.replace(np.nan, 'NONE', regex=True) # replace all empty cells (nan) with 'none' df.to_csv("test2.csv") return # returns the results outside the function into a variable threads = [] for i in range(20): t = threading.Thread(target=worker) threads.append(t) t.start()