Nov-27-2017, 04:03 AM
Say I want to search for 30 keywords within each set of scraped HTML data. What would be the best way to go about it? Should I keep repeating the same re.compile and if statement for every keyword, or loop over a keyword list instead? I've put a rough sketch of what I mean below my current code.
from bs4 import BeautifulSoup
import urllib.request
import re
import pandas as pd

# COUNTER TO INCREMENT THROUGH THE URL LIST
list_counter = 0

# CREATE URL LIST FROM CSV (first column holds the URLs)
df = pd.read_csv('example.csv')  # df = dataframe

# GET URL TOTAL FROM CSV
url_total = len(df.index) - 1  # subtract 1 because iloc indexing starts at zero

# MAIN LOOP TO CHECK FOR COMMENTS
while list_counter <= url_total:
    scrape = urllib.request.Request(
        df.iloc[list_counter, 0],
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/60.0.3112.90 Safari/537.36'})
    html = urllib.request.urlopen(scrape)
    soup = BeautifulSoup(html, 'lxml')

    comment_search = soup.body.find_all(string=re.compile('keyword1', re.IGNORECASE))
    if len(comment_search) > 0:
        df.iloc[list_counter, 1] = 'Keyword1 Found'

    comment_search = soup.body.find_all(string=re.compile('keyword2', re.IGNORECASE))
    if len(comment_search) > 0:
        df.iloc[list_counter, 1] = 'Keyword2 Found'

    comment_search = soup.body.find_all(string=re.compile('keyword3', re.IGNORECASE))
    if len(comment_search) > 0:
        df.iloc[list_counter, 1] = 'Keyword3 Found'

    print(list_counter)
    list_counter = list_counter + 1

df.to_csv('example2.csv')
df = pd.read_csv('example2.csv')
print(df)
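To show what I mean, here is a rough, untested sketch of the loop version. The keywords list and the find_keywords helper are made up for illustration; everything else (soup, df, list_counter) would be the same names as in my script above.

import re

# Made-up keyword list -- I'd fill it with my real 30 terms.
keywords = ['keyword1', 'keyword2', 'keyword3']

def find_keywords(soup, keywords):
    """Return the keywords that appear anywhere in the page body.

    re.escape() treats each keyword as a literal string, so terms
    containing regex special characters (e.g. 'C++') don't break
    the pattern.
    """
    found = []
    for keyword in keywords:
        if soup.body.find_all(string=re.compile(re.escape(keyword), re.IGNORECASE)):
            found.append(keyword)
    return found

Inside the while loop, the three repeated blocks would then collapse to something like:

    hits = find_keywords(soup, keywords)
    if hits:
        df.iloc[list_counter, 1] = ', '.join(k.capitalize() + ' Found' for k in hits)

Is that the right direction, or is there a better pattern for this?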