Python Forum

Full Version: hi, new at Python, trying to get URLs from website
hello,
I'm new to Python, and I'm trying to get URLs from a website and write them to a CSV file.


This is my code:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://XXX") # Insert your URL to extract
bsObj = BeautifulSoup(html.read(), "lxml")
bsObj.title.text # check if I'm on the right website
for link in bsObj.find_all('a'):
    print(link.get('href'))
	 
	 cookieProcessor = urllib3.request.HTTPCookieProcessor()

from pyquery import PyQuery
import requests
payload = {'inUserName': 'XX', 'inUserPass': 'XX'}
url = 'XXX'
from bs4 import BeautifulSoup
import requests
url = 'XXX'
def links(url):
    html = requests.get(url).content
    bsObj = BeautifulSoup(html, 'lxml')

    finalLinks = set()
    for link in links:
        finalLinks.add(link.attrs['href'])
		  links = bsObj.findAll('a')

    for link in links :
        print (link[0].text, file=open("e:\dvir.csv", "href"))
My questions are:
1. I'm not doing it right - what is the error in my code?
2. How do I export all my links to a CSV?

thanks
Your first code snippet looks correct for getting all the URLs, except I would use the requests module instead. Instead of printing them, insert them into the CSV - there is a small CSV sketch after the code below.
Here's how I would do it with requests:
import requests
from bs4 import BeautifulSoup


def get_url(url, savefile=None):
    page = None
    response = requests.get(url, allow_redirects=False)
    if response.status_code == 200:
        page = response.content
        if savefile is not None:
            with open(savefile, 'wb') as fp:
                fp.write(page)
    else:
        print('Download problem, status_code: {}'.format(response.status_code))
    return page


def parse_url(url, savefile=None):
    page = None
    page = get_url(url=url, savefile=savefile)
    if page is not None:
        soup = BeautifulSoup(page, 'lxml')
        next_node = soup.select('a')
        for link in next_node:
            print('Link title: {}, url: {}'.format(link.text, link['href']))
    else:
        print('page is empty')

def test_it():
    parse_url(url='https://www.wired.com')


if __name__ == '__main__':
    test_it()
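For the CSV part of the question, a minimal sketch using the csv module from the standard library (the filename links.csv and the title/url column names are just placeholders):
import csv

import requests
from bs4 import BeautifulSoup


def links_to_csv(url, csvfile='links.csv'):
    # fetch the page and collect every <a> tag that has an href
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    rows = []
    for link in soup.select('a'):
        href = link.get('href')
        if href:
            rows.append((link.text.strip(), href))
    # one row per link: link text in the first column, url in the second
    with open(csvfile, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        writer.writerow(['title', 'url'])
        writer.writerows(rows)


if __name__ == '__main__':
    links_to_csv('https://www.wired.com')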
There was a bug having to do with loading from cache in the previous code snippet.
Here's a new one:
import requests
from bs4 import BeautifulSoup


def get_url(url, savefile=None):
    page = None
    response = requests.get(url, allow_redirects=False)
    if response.status_code == 200:
        page = response.content
        if savefile is not None:
            with open(savefile, 'wb') as fp:
                fp.write(page)
    else:
        print('Download problem, status_code: {}'.format(response.status_code))
    return page

def parse_url(url, savefile=None):
    page = None
    if savefile is not None:
        try:
            fp = open(savefile)
            page = fp.read()
        except IOError:
            page = get_url(url=url, savefile=savefile)
    else:
        page = get_url(url=url, savefile=savefile)

    if page is not None:
        soup = BeautifulSoup(page, 'lxml')
        next_node = soup.select('a')
        for link in next_node:
            print('Link title: {}, url: {}'.format(link.text, link['href']))
    else:
        print('page is empty')


def test_it():
    parse_url(url='https://www.wired.com', savefile='Myfile.html')


if __name__ == '__main__':
    test_it()
Hi - thanks, but I'm getting a lot of errors.

After page = None I'm getting:

>>> page = None
  File "<stdin>", line 1
    page = None
SyntaxError: invalid syntax
Larz60+'s code works for me; I have not tested it with other links.

Start simple, here is the basic setup.
Then you can add error handling/testing or not (as many skip it in web scraping).
from bs4 import BeautifulSoup
import requests

url = 'https://www.python.org/'
url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
for link in soup.select('a'):
    if link.get('href').startswith('http'):
        print(link.get('href'))
Output:
https://docs.python.org
https://pypi.python.org/
http://plus.google.com/+Python
http://www.facebook.com/pythonlang?fref=ts
http://twitter.com/ThePSF
http://brochure.getpython.info/
... etc
So this gets the links by using a CSS selector; soup.find_all('a') could have been used instead.
This filters so we only get the links that start with http.
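For reference, a quick sketch of the find_all version, same url and the same http filter:
from bs4 import BeautifulSoup
import requests

url = 'https://www.python.org/'
url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
# href=True skips <a> tags that have no href attribute
for link in soup.find_all('a', href=True):
    if link['href'].startswith('http'):
        print(link['href'])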
I have a tutorial here, part-2
(Feb-24-2018, 04:47 PM)dviry Wrote: >>> page = None
Put it in a file and run it; don't use the Python prompt.