Missing Schema-Python Question

Andwconteh · (This post was last modified: Jun-16-2021, 03:06 PM by Larz60+.)

Good morning all,

My web scraping program gives me a missing schema invalid URL error, and I do not know how to fix it. I am surely appreciate if someone can lend a helping hand.

Error:MissingSchema                             Traceback (most recent call last)
<ipython-input-44-3fcf85db74c0> in <module>
      6         searchdf = pd.DataFrame()
      7         for k in reversed(range(len(pagesToParse))):
----> 8             searchdf = searchdf.append(parsePage(pagesToParse[k]), ignore_index=True)
      9 
     10         location_series = pd.Series([counties[j]]*len(searchdf))

<ipython-input-41-85c64130082e> in parsePage(url)
      1 def parsePage(url):
----> 2     page = requests.get(url)
      3     soup = BeautifulSoup(page.text, "html.parser")
      4 
      5     nameSeries = []

~\Anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
     74 
     75     kwargs.setdefault('allow_redirects', True)
---> 76     return request('get', url, params=params, **kwargs)
     77 
     78 

~\Anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     59     # cases, and look like a memory leak in others.
     60     with sessions.Session() as session:
---> 61         return session.request(method=method, url=url, **kwargs)
     62 
     63 

~\Anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    514             hooks=hooks,
    515         )
--> 516         prep = self.prepare_request(req)
    517 
    518         proxies = proxies or {}

~\Anaconda3\lib\site-packages\requests\sessions.py in prepare_request(self, request)
    447 
    448         p = PreparedRequest()
--> 449         p.prepare(
    450             method=request.method.upper(),
    451             url=request.url,

~\Anaconda3\lib\site-packages\requests\models.py in prepare(self, method, url, headers, files, data, params, auth, cookies, hooks, json)
    312 
    313         self.prepare_method(method)
--> 314         self.prepare_url(url, params)
    315         self.prepare_headers(headers)
    316         self.prepare_cookies(cookies)

~\Anaconda3\lib\site-packages\requests\models.py in prepare_url(self, url, params)
    386             error = error.format(to_native_string(url, 'utf8'))
    387 
--> 388             raise MissingSchema(error)
    389 
    390         if not host:

MissingSchema: Invalid URL '/search?search_terms=Private%20Schools%20(K-12)&geo_location_terms=ada%20county%20idaho&page=2': No schema supplied. Perhaps you meant http:///search?search_terms=Private%20Schools%20(K-12)&geo_location_terms=ada%20county%20idaho&page=2?

Larz60+ write Jun-16-2021, 03:06 PM:
Please post all code, output and errors (it it's entirety) between their respective tags. Refer to BBCode help topic on how to post. Use the "Preview Post" button to make sure the code is presented as you expect before hitting the "Post Reply/Thread" button.
Fixed for you this time. Please use bbcode tags on future posts.

Andwconteh · (This post was last modified: Jun-16-2021, 03:05 PM by Larz60+.)

I am also posting my code here:

import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd

counties = pd.read_csv('counties.csv', header=None, names=['county'])
counties = list(counties.county)
search_terms = ['Private+Schools+%28K-12%29','religious+schools','kindergarten','early+childhood+learning+centers']

def findLinks(search_term,county):
    firstUrl = "https://www.yellowpages.com/search?search_terms="+search_term+"&geo_location_terms="+county
    linksToParse = []
    try:
        firstPage = requests.get(firstUrl)
        for link in BeautifulSoup(firstPage.content, "html.parser", parse_only=SoupStrainer('a', href=True)):
            if 'geo_location_terms=' + county + '&page' in link['href']:
                linksToParse.append(link['href'].encode('utf-8'))
        linksToParse = list(set(linksToParse))
        linksToParse = ["https://www.yellowpages.com" + link for link in linksToParse]
        linksToParse.append(firstUrl)
    except:
        pass
    return linksToParse

def parsePage(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    
    nameSeries = []
    addressSeries = []
    phoneSeries = []
    infoSeries = []
    websiteSeries = []
    snippetSeries = []

    for result in soup.select(".search-results .result"):
        try:
            name = result.select_one(".business-name").get_text(strip=True, separator=" ")
        except:
            name = 'No name listed'
        try:
            address = result.select_one(".adr").get_text(strip=True, separator=" ")
        except:
            address = 'No address listed'
        try:
            phone = result.select_one(".phones.phone.primary").get_text(strip=True, separator=" ")
        except:
            phone = 'No phone number listed'
        try:
            info = result.select_one(".info-section.info-secondary").get_text(strip=True, separator=" ")
        except:
            info = 'No additional info listed'
        try:
            website = result.select_one(".track-visit-website").attrs['href']
        except:
            website = "No website listed"
        try:
            snippet = result.select_one(".snippet").get_text(strip=True, separator=" ")
        except:
            snippet = "No snippet listed"
        
        nameSeries.append(name)
        addressSeries.append(address)
        phoneSeries.append(phone)
        infoSeries.append(info)
        websiteSeries.append(website)
        snippetSeries.append(snippet)
        
    pagedf = pd.concat([pd.Series(nameSeries),pd.Series(addressSeries),pd.Series(phoneSeries),pd.Series(infoSeries),pd.Series(websiteSeries),pd.Series(snippetSeries)], axis=1)
    return pagedf

#finaldf = pd.DataFrame()
for i in range(len(search_terms)):
    termdf = pd.DataFrame()
    for j in range(len(counties)):
        pagesToParse = findLinks(search_terms[i],counties[j])
        searchdf = pd.DataFrame()
        for k in reversed(xrange(len(pagesToParse))):
            searchdf = searchdf.append(parsePage(pagesToParse[k]), ignore_index=True)
            
        location_series = pd.Series([counties[j]]*len(searchdf))
        term_series = pd.Series([search_terms[i]]*len(searchdf))
        searchdf = pd.concat([searchdf,location_series,term_series], axis=1)
        searchdf.columns = ['Name', 'Address', 'Phone', 'Info', 'Website', 'Snippet', 'Search Location', 'Search Term']
        termdf = termdf.append(searchdf, ignore_index=True)
    
    termdf.to_csv(search_terms[i]+'.csv', index=False, encoding='utf-8')
    #finaldf = finaldf.append(termdf, ignore_index=True)

finaldf.to_csv('output.csv', index=False, encoding='utf-8')

linksList = findLinks(search_terms[0],counties[0])

linksList

Out[46]: ['https://www.yellowpages.com/search?search_terms=Private%20Schools%20(K-12)&geo_location_terms=ada%20county%20idaho&page=2',
 'https://www.yellowpages.com/search?search_terms=Private+Schools+%28K-12%29&geo_location_terms=ada%20county%20idaho']

df = pd.DataFrame()
for i in reversed(xrange(len(linksList))):
    df = df.append(parsePage(linksList[i]), ignore_index=True)
df.columns = ['Name', 'Address', 'Phone', 'Info', 'Website', 'Snippet']

df

Larz60+ write Jun-16-2021, 03:05 PM:
Please post all code, output and errors (it it's entirety) between their respective tags. Refer to BBCode help topic on how to post. Use the "Preview Post" button to make sure the code is presented as you expect before hitting the "Post Reply/Thread" button.
Fixed for you this time. Please use bbcode tags on funure posts.

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	__init__() got multiple values for argument 'schema'	dawid294	4	2,192	Jan-03-2024, 09:42 AM Last Post: buran
	how to catch schema error?	maiya	0	1,850	Jul-16-2021, 08:37 AM Last Post: maiya
	Best way to process large/complex XML/schema ?	MDRI	7	6,265	May-16-2021, 09:31 PM Last Post: snippsat
	Missing Python Library	Novecento99	8	3,723	Jan-21-2020, 09:05 PM Last Post: snippsat
	How can I compare Python XML-Files and add missing values from one to another	kirat	2	2,658	Aug-30-2019, 12:17 PM Last Post: perfringo
	Multithreading socket in Python. I'm missing something	Guybrush	1	3,966	Apr-28-2017, 11:14 PM Last Post: Ofnuts
	python is missing	hsunteik	3	4,740	Dec-19-2016, 05:43 AM Last Post: Larz60+

Missing Schema-Python Question

User Panel Messages

Announcements