Python Forum

Full Version: hi, new at Python, trying to get URLs from website
hello,
I'm new to Python, and I'm trying to get URLs from a website and write them to a CSV file.


This is my code:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://XXX") # Insert your URL to extract
bsObj = BeautifulSoup(html.read(), "lxml")
bsObj.title.text # check if I'm on the right website
for link in bsObj.find_all('a'):
    print(link.get('href'))
	 
	 cookieProcessor = urllib3.request.HTTPCookieProcessor()

from pyquery import PyQuery
import requests
payload = {'inUserName': 'XX', 'inUserPass': 'XX'}
url = 'XXX'
from bs4 import BeautifulSoup
import requests
url = 'XXX'
def links(url):
    html = requests.get(url).content
    bsObj = BeautifulSoup(html, 'lxml')

    finalLinks = set()
    for link in links:
        finalLinks.add(link.attrs['href'])
		  links = bsObj.findAll('a')

    for link in links :
        print (link[0].text, file=open("e:\dvir.csv", "href"))
My questions are:
1. I'm not doing it right - what is the error in my code?
2. How do I export all my links to a CSV?

thanks
Your first code snippet looks correct for getting all the URLs, except I would use the requests module instead. Instead of printing them, insert them into the CSV - there is a small CSV sketch after the code below.
Here's how I would do it with requests:
import requests
from bs4 import BeautifulSoup


def get_url(url, savefile=None):
    page = None
    response = requests.get(url, allow_redirects=False)
    if response.status_code == 200:
        page = response.content
        if savefile is not None:
            with open(savefile, 'wb') as fp:
                fp.write(page)
    else:
        print('Download problem, status_code: {}'.format(response.status_code))
    return page


def parse_url(url, savefile=None):
    page = None
    page = get_url(url=url, savefile=savefile)
    if page is not None:
        soup = BeautifulSoup(page, 'lxml')
        next_node = soup.select('a')
        for link in next_node:
            print('Link title: {}, url: {}'.format(link.text, link['href']))
    else:
        print('page is empty')

def test_it():
    parse_url(url='https://www.wired.com')


if __name__ == '__main__':
    test_it()
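For the CSV part of the question, a minimal sketch using the csv module from the standard library (the filename links.csv and the title/url column names are just placeholders):
import csv

import requests
from bs4 import BeautifulSoup


def links_to_csv(url, csvfile='links.csv'):
    # fetch the page and collect every <a> tag that has an href
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    rows = []
    for link in soup.select('a'):
        href = link.get('href')
        if href:
            rows.append((link.text.strip(), href))
    # one row per link: link text in the first column, url in the second
    with open(csvfile, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        writer.writerow(['title', 'url'])
        writer.writerows(rows)


if __name__ == '__main__':
    links_to_csv('https://www.wired.com')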
There was a bug having to do with loading from cache in the previous code snippet.
Here's a new one:
import requests
from bs4 import BeautifulSoup


def get_url(url, savefile=None):
    page = None
    response = requests.get(url, allow_redirects=False)
    if response.status_code == 200:
        page = response.content
        if savefile is not None:
            with open(savefile, 'wb') as fp:
                fp.write(page)
    else:
        print('Download problem, status_code: {}'.format(response.status_code))
    return page

def parse_url(url, savefile=None):
    page = None
    if savefile is not None:
        try:
            fp = open(savefile)
            page = fp.read()
        except IOError:
            page = get_url(url=url, savefile=savefile)
    else:
        page = get_url(url=url, savefile=savefile)

    if page is not None:
        soup = BeautifulSoup(page, 'lxml')
        next_node = soup.select('a')
        for link in next_node:
            print('Link title: {}, url: {}'.format(link.text, link['href']))
    else:
        print('page is empty')


def test_it():
    parse_url(url='https://www.wired.com', savefile='Myfile.html')


if __name__ == '__main__':
    test_it()
Hi - thanks, but I'm getting a lot of errors.

After page = None I'm getting:

>>> page = None
  File "<stdin>", line 1
    page = None
SyntaxError: invalid syntax
Larz60+'s code works for me; I have not tested it with other links.

Start simple, here is the basic setup.
Then you can add error handling/testing or not (as many skip it in web scraping).
from bs4 import BeautifulSoup
import requests

url = 'https://www.python.org/'
url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
for link in soup.select('a'):
    if link.get('href').startswith('http'):
        print(link.get('href'))
Output:
https://docs.python.org
https://pypi.python.org/
http://plus.google.com/+Python
http://www.facebook.com/pythonlang?fref=ts
http://twitter.com/ThePSF
http://brochure.getpython.info/
... etc
So this gets the links by using a CSS selector; soup.find_all('a') could have been used instead.
This filters so we only get the links that start with http.
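For reference, a quick sketch of the find_all version, same url and the same http filter:
from bs4 import BeautifulSoup
import requests

url = 'https://www.python.org/'
url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
# href=True skips <a> tags that have no href attribute
for link in soup.find_all('a', href=True):
    if link['href'].startswith('http'):
        print(link['href'])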
I have a tutorial here, part-2
(Feb-24-2018, 04:47 PM)dviry Wrote: >>> page = None
Put it in a file and run it; don't use the Python prompt.