Python Forum
google webscraper
#1
Hi all,

I have a problem when running the web-scraping code below. I'd appreciate any help, please.

I get an error:
Error:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-10-e93b855c4b15> in <module>()
     49
     50 user_input = input("Enter your search string : ")
---> 51 Google.search1(user_input) # user search string
     52 #Google.search1('cloud managed services') # user search string, it could be anything the user types
     53

<ipython-input-10-e93b855c4b15> in search1(self, search)
     46            rows = list(reader)
     47       with open('result_url_topic_desc_JSON.json', 'w') as f:
---> 48            json.dump(rows, f, sort_keys=False, indent=4, separators=(',', ': '),encoding='utf-8')
     49
     50 user_input = input("Enter your search string : ")

~\Anaconda3\lib\json\__init__.py in dump(obj, fp, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
    174     check_circular=check_circular, allow_nan=allow_nan, indent=indent,
    175     separators=separators,
--> 176     default=default, sort_keys=sort_keys, **kw).iterencode(obj)
    177 # could accelerate with writelines in some versions of Python, at
    178 # a debuggability cost

TypeError: __init__() got an unexpected keyword argument 'encoding'
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from urllib.error import HTTPError
import urllib
import re
import numpy as np
import csv
from http.client import BadStatusLine
import ssl
import json
#from googlesearch import search

class Google:
    @classmethod
    def search1(self, search):
      url_list = []   #store all the extracted urls in a List
      title_list = [] #store all the extracted titles in a List
      description_list = []  #store all the extracted Description in a List

      for start in range(0,10):
        #page = requests.get('http://www.google.com/search?rlz=1C1CHBF_enSG851SG851&ei=Nib2XI6FEcmLvQS1xb-wBQ&q=site%3Alinkedin.com+inurl%3Ain+%7C+inurl%3Apub+%7C+inurl%3Aprofile+-inurl%3Adir+-inurl%3Atitle+-inurl%3Agroups+-inurl%3Acompany+-inurl%3Ajobs+-inurl%3Ajobs2+VP&oq=site%3Alinkedin.com+inurl%3Ain+%7C+inurl%3Apub+%7C+inurl%3Aprofile+-inurl%3Adir+-inurl%3Atitle+-inurl%3Agroups+-inurl%3Acompany+-inurl%3Ajobs+-inurl%3Ajobs2'+search+str(start*10), verify = False)
        page = requests.get('http://www.google.com/search?q='+search+'&start='+str(start*10), verify = False) #use the start parameter to request each results page
        soup = BeautifulSoup(page.content, 'html.parser')
        for cite in soup.findAll('cite'): #extract all URLs
            url = cite.text
            print(url)
            if not urlparse(url).scheme: #check if url already has an http:// or https:// prefix
                url = 'http://'+url
                print(url)
                url_list.append(url.replace('https://','http://'))

        for tit in soup.findAll('h3', attrs={'class':'r'}): #extract all Titles
            print(tit.text)
            title_list.append(tit.text)
       
        for descr in soup.findAll('span', attrs={'class':'st'}): #extract all descriptions
            print(descr.text)
            description_list.append(descr.text)

      record_list = [list(item) for item in list(zip(url_list, title_list, description_list))] #join all the lists
      df = pd.DataFrame(record_list,columns=['URL','Title', 'Description'])
      df.to_csv('result_url_topic_desc.csv', index=False, encoding='utf-8')
      with open('result_url_topic_desc.csv') as f:
           reader = csv.DictReader(f)
           rows = list(reader)
      with open('result_url_topic_desc_JSON.json', 'w') as f:
           json.dump(rows, f, sort_keys=False, indent=4, separators=(',', ': '),encoding='utf-8') 

user_input = input("Enter your search string : ")
Google.search1(user_input) # user search string
#Google.search1('cloud managed services') # user search string, it could be anything the user types

df2=pd.DataFrame()
df2 = pd.read_csv('result_url_topic_desc.csv', encoding='utf-8')
phn_1 = []    #store all the extracted Phn numbers in a List
mail_1 = []    #store all the extracted E-mail in a List
for row in df2.iterrows():  # Parse through each url in the list.
    try:
        try:
           req1 = Request(row[1]['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
           gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23) # Bypass SSL certificate verification
           f = urlopen(req1, context=gcontext)
           url_name = f.geturl() #extract URL name 
           s = f.read().decode('utf-8', errors='ignore') #decode the bytes so the regexes below can run on text
           phone = re.findall(r"((?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4})",s)  # Phone regex
           emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}",s)  #Email regex
           #emails = re.findall(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$",s)

           if len(phone) == 0:
              print("No phone number found.")
              err_msg_phn = "No phone number found."
              phn_1.append((url_name, err_msg_phn))
              
           else:
               count = 1
               for item in phone:
                   phn_1.append((url_name,item))
                   count += 1
               print(phn_1)
        
           if len(emails) == 0:
              print("No email address found.")
              err_msg_mail = "No email address found."
              mail_1.append((url_name,err_msg_mail))

           else:
               count = 1
               for item in emails:
                   mail_1.append((url_name,item))
                   count += 1
               print(mail_1)
               
        except BadStatusLine: # Catch if invalid url names exist
            print("could not fetch %s" % url_name)

    except HTTPError as err: # catch HTTP 404 not found error
        if err.code == 404:
            print("Received HTTPError on %s" % url_name)
            

df_p = pd.DataFrame()
df_m = pd.DataFrame()
df_final = pd.DataFrame()

df_p = pd.DataFrame(phn_1,columns=['URL','Phone_No']) # Dataframe for url and Phn number
df_phn = df_p.drop_duplicates(subset=['URL', 'Phone_No'], keep='first') #remove duplicates

df_m = pd.DataFrame(mail_1,columns=['URL','Email']) # Dataframe for url and Email
df_mail = df_m.drop_duplicates(subset=['URL','Email'], keep='first') #remove duplicates

df_final = pd.merge(df_phn,df_mail, on = 'URL', how = 'inner') #Merge two dataframes on the common column
#df_final.groupby(['URL'], as_index=False)
df_final.to_csv('result_contact.csv', index=False, encoding='utf-8')

#convert the csv output to json
with open('result_contact.csv') as f:
     reader = csv.DictReader(f)
     rows = list(reader)
with open('result_contact_JSON.json', 'w') as f: 
   json.dump(rows, f, sort_keys=False, indent=4, separators=(',', ': '),encoding='utf-8')
#2
json.dump(..., encoding='utf-8')
should be (lines 47 and 71):

json.dump(...)
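In Python 3, json.dump() no longer accepts an encoding argument; the output encoding is determined by the file object you pass in. If you still need UTF-8 output, a minimal sketch (the sample row is just a placeholder) is to give the encoding to open() instead:

import json

rows = [{"URL": "http://example.com", "Title": "Café", "Description": "demo"}]  # placeholder row

# The file object decides the encoding in Python 3, so open it with encoding='utf-8'.
# ensure_ascii=False is optional; it writes non-ASCII characters as-is instead of \u-escapes.
with open('result_url_topic_desc_JSON.json', 'w', encoding='utf-8') as f:
    json.dump(rows, f, sort_keys=False, indent=4,
              separators=(',', ': '), ensure_ascii=False)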
 
#3
Thank you, Heiner55.
#4
You are welcome.
#5
OK, I have a new problem with the code now: the Google function does not scrape any data at all, for some reason.

Appreciate any kind help please.
#6
Then you have to debug your program.
#7
Or show us the error messages.
#8
The thing is that no error message is shown. The moment I key in the search string, the files are generated, but no data is saved inside them. I have looked through the Google class's search1 function but am still unable to figure out the bug in it.

<python>
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from urllib.error import HTTPError
import urllib
import re
import numpy as np
import csv
from http.client import BadStatusLine
import ssl
import json
#from googlesearch import search

class Google:
    @classmethod
    def search1(self, search):
      url_list = []   #store all the extracted urls in a List
      title_list = [] #store all the extracted titles in a List
      description_list = []  #store all the extracted descriptions in a List

      for start in range(0,10):
        #page = requests.get('https://www.google.com/search?rlz=1C1CHBF_enSG851SG851&ei=Nib2XI6FEcmLvQS1xb-wBQ&q=site%3Alinkedin.com+inurl%3Ain+%7C+inurl%3Apub+%7C+inurl%3Aprofile+-inurl%3Adir+-inurl%3Atitle+-inurl%3Agroups+-inurl%3Acompany+-inurl%3Ajobs+-inurl%3Ajobs2+VP&oq=site%3Alinkedin.com+inurl%3Ain+%7C+inurl%3Apub+%7C+inurl%3Aprofile+-inurl%3Adir+-inurl%3Atitle+-inurl%3Agroups+-inurl%3Acompany+-inurl%3Ajobs+-inurl%3Ajobs2'+search+str(start*10), verify = False)
        #page = requests.get('https://www.google.com/search?q='+search+str(start*10), verify = True)
        page = requests.get('https://www.google.com/search?q='+search, verify = True)
        soup = BeautifulSoup(page.content, 'html.parser')
        for cite in soup.findAll('cite'): #extract all URLs
            url = cite.text
            print(url)
            if not urlparse(url).scheme: #check if url already has an http:// or https:// prefix
                url = 'http://'+url
                print(url)
                url_list.append(url.replace('https://','http://'))

        for tit in soup.findAll('h3', attrs={'class':'r'}): #extract all Titles
            print(tit.text)
            title_list.append(tit.text)

        for descr in soup.findAll('span', attrs={'class':'st'}): #extract all descriptions
            print(descr.text)
            description_list.append(descr.text)

      record_list = [list(item) for item in list(zip(url_list, title_list, description_list))] #join all the lists
      df = pd.DataFrame(record_list,columns=['URL','Title', 'Description'])
      df.to_csv('result_url_topic_desc.csv', index=False)
      with open('result_url_topic_desc.csv') as f:
           reader = csv.DictReader(f)
           rows = list(reader)
      with open('result_url_topic_desc_JSON.json', 'w') as f:
           json.dump(rows, f, sort_keys=False, indent=4, separators=(',', ': '))

user_input = input("Enter your search string : ")
Google.search1(user_input) # user search string
#Google.search1('cloud managed services') # user search string, it could be anything the user types

df2 = pd.DataFrame()
df2 = pd.read_csv('result_url_topic_desc.csv')
phn_1 = []     #store all the extracted Phn numbers in a List
mail_1 = []    #store all the extracted E-mail in a List
for row in df2.iterrows():  # Parse through each url in the list.
    try:
        try:
           req1 = Request(row[1]['URL'], headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'})
           gcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23) # Bypass SSL certificate verification
           f = urlopen(req1, context=gcontext)
           url_name = f.geturl() #extract URL name
           s = f.read().decode('utf-8', errors='ignore') #decode the bytes so the regexes below can run on text
           phone = re.findall(r"((?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4})",s)  # Phone regex
           emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}",s)  #Email regex
           #emails = re.findall(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$",s)

           if len(phone) == 0:
              print("No phone number found.")
              err_msg_phn = "No phone number found."
              phn_1.append((url_name, err_msg_phn))

           else:
               count = 1
               for item in phone:
                   phn_1.append((url_name,item))
                   count += 1
               print(phn_1)

           if len(emails) == 0:
              print("No email address found.")
              err_msg_mail = "No email address found."
              mail_1.append((url_name,err_msg_mail))

           else:
               count = 1
               for item in emails:
                   mail_1.append((url_name,item))
                   count += 1
               print(mail_1)

        except BadStatusLine: # Catch if invalid url names exist
            print("could not fetch %s" % url_name)

    except HTTPError as err: # catch HTTP 404 not found error
        if err.code == 404:
            print("Received HTTPError on %s" % url_name)


df_p = pd.DataFrame()
df_m = pd.DataFrame()
df_final = pd.DataFrame()

df_p = pd.DataFrame(phn_1,columns=['URL','Phone_No']) # Dataframe for url and Phn number
df_phn = df_p.drop_duplicates(subset=['URL', 'Phone_No'], keep='first') #remove duplicates

df_m = pd.DataFrame(mail_1,columns=['URL','Email']) # Dataframe for url and Email
df_mail = df_m.drop_duplicates(subset=['URL','Email'], keep='first') #remove duplicates

df_final = pd.merge(df_phn,df_mail, on = 'URL', how = 'inner') #Merge two dataframes on the common column
#df_final.groupby(['URL'], as_index=False)
df_final.to_csv('result_contact.csv', index=False)

#convert the csv output to json
with open('result_contact.csv') as f:
     reader = csv.DictReader(f)
     rows = list(reader)
with open('result_contact_JSON.json', 'w') as f:
     json.dump(rows, f, sort_keys=False, indent=4, separators=(',', ': '))
</python>
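For reference, a quick way to check (using the same libraries as above; the query string is just an example) whether the search page even contains the tags search1() looks for:

import requests
from bs4 import BeautifulSoup

# Fetch one results page and count the tags the scraper expects to find.
page = requests.get('https://www.google.com/search?q=cloud+managed+services')
soup = BeautifulSoup(page.content, 'html.parser')
print(page.status_code)                                 # 200 means the request itself succeeded
print(len(soup.findAll('cite')))                        # URL elements
print(len(soup.findAll('h3', attrs={'class': 'r'})))    # title elements
print(len(soup.findAll('span', attrs={'class': 'st'}))) # description elements

If the counts print as 0 even though the status is 200, the selectors no longer match the HTML Google returns, which would explain why the output files are empty.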
#9
Sorry, your code is too complex for me to understand as posted.
With proper indentation it would be easier to read.

If you get no error messages, you have to instrument your code.
For example, log all the important data to a log file.
Then you can see what is happening in your code.

https://docs.python.org/3/howto/logging.html
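A minimal sketch (the log file name and the messages are just placeholders); put calls like these at the important points inside search1():

import logging

# Send every record to a log file so each run leaves a trace you can inspect.
logging.basicConfig(filename='scraper.log', level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s')

# Example calls -- place these where the real values are available:
logging.debug("requested %s -> status %s", 'https://www.google.com/search?q=test', 200)
logging.debug("extracted so far: %d urls, %d titles, %d descriptions", 0, 0, 0)
logging.warning("no results parsed from this page")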
#10
Sorry, I think I didn't paste the code correctly inside the python tags, hence the missing indentation. Thanks for your reply; I will try the logging approach.

