Python Forum
google webscraper
#8
The thing is that no error message is shown. The moment I key in the search string, the output files are generated, but no data is saved inside them. I have looked through the Google class but am still unable to figure out the bug in it.

<python>
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from urllib.error import HTTPError
import re
import csv
from http.client import BadStatusLine
import ssl
import json
#from googlesearch import search


class Google:
    @classmethod
    def search1(cls, search):
        url_list = []          # store all the extracted URLs in a list
        title_list = []        # store all the extracted titles in a list
        description_list = []  # store all the extracted descriptions in a list

        for start in range(0, 10):
            #page = requests.get('https://www.google.com/search?rlz=1C1CHBF_enSG851SG851&ei=Nib2XI6FEcmLvQS1xb-wBQ&q=site%3Alinkedin.com+inurl%3Ain+%7C+inurl%3Apub+%7C+inurl%3Aprofile+-inurl%3Adir+-inurl%3Atitle+-inurl%3Agroups+-inurl%3Acompany+-inurl%3Ajobs+-inurl%3Ajobs2+VP&oq=site%3Alinkedin.com+inurl%3Ain+%7C+inurl%3Apub+%7C+inurl%3Aprofile+-inurl%3Adir+-inurl%3Atitle+-inurl%3Agroups+-inurl%3Acompany+-inurl%3Ajobs+-inurl%3Ajobs2'+search+str(start*10), verify=False)
            #page = requests.get('https://www.google.com/search?q='+search+str(start*10), verify=True)
            page = requests.get('https://www.google.com/search?q=' + search + '&start=' + str(start * 10), verify=True)  # page through the results 10 at a time
            soup = BeautifulSoup(page.content, 'html.parser')

            for cite in soup.findAll('cite'):  # extract all URLs
                url = cite.text
                print(url)
                if not urlparse(url).scheme:   # prepend http:// if the URL has no scheme
                    url = 'http://' + url
                    print(url)
                url_list.append(url.replace('https://', 'http://'))

            for tit in soup.findAll('h3', attrs={'class': 'r'}):       # extract all titles
                print(tit.text)
                title_list.append(tit.text)

            for descr in soup.findAll('span', attrs={'class': 'st'}):  # extract all descriptions
                print(descr.text)
                description_list.append(descr.text)

        record_list = [list(item) for item in zip(url_list, title_list, description_list)]  # join the three lists row by row
        df = pd.DataFrame(record_list, columns=['URL', 'Title', 'Description'])
        df.to_csv('result_url_topic_desc.csv', index=False)
        with open('result_url_topic_desc.csv') as f:
            reader = csv.DictReader(f)
            rows = list(reader)
        with open('result_url_topic_desc_JSON.json', 'w') as f:
            json.dump(rows, f, sort_keys=False, indent=4, separators=(',', ': '))


user_input = input("Enter your search string : ")
Google.search1(user_input)  # user search string
#Google.search1('cloud managed services')  # the search string could be anything the user types

df2 = pd.read_csv('result_url_topic_desc.csv')
phn_1 = []   # store all the extracted phone numbers in a list
mail_1 = []  # store all the extracted e-mail addresses in a list
for _, row in df2.iterrows():  # parse through each URL in the list
    try:
        try:
            req1 = Request(row['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
            gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)  # bypass SSL certificate verification
            f = urlopen(req1, context=gcontext)
            url_name = f.geturl()                           # extract the final URL
            s = f.read().decode('utf-8', errors='ignore')   # decode the response body so the regexes run on text, not bytes
            phone = re.findall(r"((?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4})", s)  # phone regex
            emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}", s)         # e-mail regex
            #emails = re.findall(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$", s)

            if len(phone) == 0:
                print("No phone number found.")
                err_msg_phn = "No phone number found."
                phn_1.append((url_name, err_msg_phn))
            else:
                for item in phone:
                    phn_1.append((url_name, item))
                print(phn_1)

            if len(emails) == 0:
                print("No email address found.")
                err_msg_mail = "No email address found."
                mail_1.append((url_name, err_msg_mail))
            else:
                for item in emails:
                    mail_1.append((url_name, item))
                print(mail_1)

        except BadStatusLine:  # catch invalid responses
            print("could not fetch %s" % row['URL'])

    except HTTPError as err:   # catch HTTP errors such as 404 Not Found
        if err.code == 404:
            print("Received HTTPError on %s" % row['URL'])


df_p = pd.DataFrame(phn_1, columns=['URL', 'Phone_No'])                  # dataframe for URL and phone number
df_phn = df_p.drop_duplicates(subset=['URL', 'Phone_No'], keep='first')  # remove duplicates

df_m = pd.DataFrame(mail_1, columns=['URL', 'Email'])                    # dataframe for URL and e-mail
df_mail = df_m.drop_duplicates(subset=['URL', 'Email'], keep='first')    # remove duplicates

df_final = pd.merge(df_phn, df_mail, on='URL', how='inner')              # merge the two dataframes on the common column
#df_final.groupby(['URL'], as_index=False)
df_final.to_csv('result_contact.csv', index=False)

# convert the CSV output to JSON
with open('result_contact.csv') as f:
    reader = csv.DictReader(f)
    rows = list(reader)
with open('result_contact_JSON.json', 'w') as f:
    json.dump(rows, f, sort_keys=False, indent=4, separators=(',', ': '))
</python>
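
One thing I could do to narrow it down (rough sketch, not part of the scraper, and the query string is just an example) is to fetch a single results page the same way search1 does and count how many of the tags it looks for (cite, h3 class "r", span class "st") are actually in the response, once with the default requests User-Agent and once with the browser User-Agent used later for urlopen:

<python>
from bs4 import BeautifulSoup
import requests

query = 'cloud managed services'   # example query, any search string works here

browser_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/55.0.2883.75 Safari/537.36'}

for label, headers in (('default UA', {}), ('browser UA', browser_headers)):
    # same request the scraper makes, just one page
    page = requests.get('https://www.google.com/search',
                        params={'q': query}, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    # count the elements the scraper expects to find
    print(label,
          'status:', page.status_code,
          'cite:', len(soup.find_all('cite')),
          'h3.r:', len(soup.find_all('h3', attrs={'class': 'r'})),
          'span.st:', len(soup.find_all('span', attrs={'class': 'st'})))
</python>

If those counts come back as 0 even with a 200 status, Google is returning markup without those elements (or a block/consent page) for that request, which would explain why the lists stay empty and the CSV/JSON files end up with only headers.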


