Python Forum

This code isn't throwing errors, instead it's only giving me limited results based on "for i in range(20)" line. If I have it set as '20' it will only scrape 20 results, if I change it to '10' it will give me 10 etc.

It's ignoring the while loop requirements and I can't figure out why.

The script requires a test.csv file in working dir, here are some URLs: https://pastebin.com/TNYDBvTF

EXPAND CODE:

Hide/Show

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import threading

# COUNTER TO INCREMENT THROUGH NEW_LIST
list_counter = 0

# CREATE NEW LIST FROM CSV
new_list = df = pd.read_csv('test.csv')  # df = dataframe

# GET TOTAL LIST ITEMS FROM CSV
list_total = len(df.index) - 1  # take away 1 other lists start at zero

def worker():
    """thread worker function"""

    global list_counter  # set the variables as global
    global new_list
    global list_total
    global df

    while list_counter <= list_total:
        scrape = requests.get(df.iloc[list_counter, 0], headers={"user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"})
        html = scrape.content
        soup = BeautifulSoup(html, 'html.parser')

        comment_search = soup.body.find_all(string=re.compile("Ethereum", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Ethereum Found'
        comment_search = soup.body.find_all(string=re.compile("Bitcoin", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Bitcoin Found'
        comment_search = soup.body.find_all(string=re.compile("Ether", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Ether Found'
        comment_search = soup.body.find_all(string=re.compile("Ripple", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Ripple Found'
        comment_search = soup.body.find_all(string=re.compile("Qtum", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Qtum'
        comment_search = soup.body.find_all(string=re.compile("Litecoin", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Litecoin'
        comment_search = soup.body.find_all(string=re.compile("DigitalCash", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'DigitalCash'
        comment_search = soup.body.find_all(string=re.compile("Monero", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Monero'
        comment_search = soup.body.find_all(string=re.compile("Zcash", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Zcash'
        comment_search = soup.body.find_all(string=re.compile("Bitcash", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Bitcash'
        comment_search = soup.body.find_all(string=re.compile("Stellar", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Stellar'
        comment_search = soup.body.find_all(string=re.compile("IOTA", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'IOTA'
        comment_search = soup.body.find_all(string=re.compile("Neo", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Neo'
        comment_search = soup.body.find_all(string=re.compile("Power Ledger", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Power Ledger'
        comment_search = soup.body.find_all(string=re.compile("OmiseGo", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'OmiseGo'
        comment_search = soup.body.find_all(string=re.compile("Stratis", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Stratis Found'
        comment_search = soup.body.find_all(string=re.compile("Waves", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Waves Found'
        comment_search = soup.body.find_all(string=re.compile("Walton", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Walton Found'
        comment_search = soup.body.find_all(string=re.compile("Hshare", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Hshare Found'
        comment_search = soup.body.find_all(string=re.compile("Einsteinium", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Einsteinium Found'
        comment_search = soup.body.find_all(string=re.compile("Stellar Lumens", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Stellar Found'
        comment_search = soup.body.find_all(string=re.compile("Lisk", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'Lisk Found'
        comment_search = soup.body.find_all(string=re.compile("NEM", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'NEM Found'
        comment_search = soup.body.find_all(string=re.compile("MonaCoin", re.IGNORECASE))
        if len(comment_search) > 0:
            df.iloc[list_counter, 1] = 'MonaCoin Found'

        print(list_counter)
        list_counter = list_counter + 1

        df = df.replace(np.nan, 'NONE', regex=True)  # replace all empty cells (nan) with 'none'
        df.to_csv("test2.csv")

        return  # returns the results outside the function into a variable

threads = []
for i in range(20):
    t = threading.Thread(target=worker)
    threads.append(t)
    t.start()

Suggest you take a gander at: https://pymotw.com/3/threading/

You'll probably be interested in: greenteapress.com/semaphores/LittleBookOfSemaphores.pdf
also as it shows how to communicate between threads

threads = []
for i in range(5):
    t = threading.Thread(target=worker, args=(i,))
    threads.append(t)
    t.start()

Does this code only generate the number of threads, or does it also automatically assign them to the project as well?

Am I having a problem with variables inside the main loop conflicting with the threads or is it a problem of not assigning threads properly or..? Where specifically am I not setting this up correctly?

digitalmatic7

Larz60+

digitalmatic7