Python Forum
Scraping based on years BeautifulSoup - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html)
+--- Thread: Scraping based on years BeautifulSoup (/thread-33740.html)



Scraping based on years BeautifulSoup - rhat398 - May-22-2021

I am practicing my python skills by scraping different websites and I came across The Hacker News Website I manage to scrape the article's title, links, author names..etc.The problem occurs when i try to advance the scraper feature by filtering between years e.g scrape all the articles in between 2018 and 2019 I try to implement it and it gives me some results but not exact results final output it produces include articles of 2017 as well.Here is my code:

from bs4 import BeautifulSoup
import requests
import csv
import time


results = []

def fetch(url):
response = requests.get(url)
#print(f' | Status code: {response.status_code}')

return response
def parse(response):
#print(f'HTTP GET: {response.url} | Status code: {response.status_code}')

content = BeautifulSoup(response.text, 'lxml')


#Extract Data Fields
labels = content.findAll('div', {'class': 'item-label'})
story_date = [[tag for tag in date][1] for date in labels]
if '2019' or '2018' in story_date:
    story_title = [title.text for title in content.find_all('h2', {'class': 'home-title'})]
    story_link = [story_link['href'] for story_link in content.find_all('a', {'class': 'story-link'})]
    story_author = [[tag for tag in author][2].text.strip('\n')[1:] for author in labels]
    
    for index in range(0, len(story_date)):
        results.append({
            'date': story_date[index],
            'title': story_title[index],
            'link': story_link[index],
            'author': story_author[index]
        })
def export_to_csv(filename):
     with open(filename, 'w', newline='') as csv_file:
     writer = csv.DictWriter(csv_file, fieldnames=results[0].keys())

     writer.writeheader()

     for row in results:
     writer.writerow(row)


if __name__ == '__main__':

baseURL = 'https://thehackernews.com/search/label/'

categories = ['data%20breach', 'Cyber%20Attack', 'Vulnerability', 'Malware']

years = ['2018', '2019', '2020', '2021']

for category in categories:
    for year in years:
        for page in range(0, 5):
            index = page + 19
            url = baseURL + category + f'?updated-max={year}-06-09T13:30:00-07:00&max-results=20&start=' + str(index) + '&by-date=false'
            res = fetch(url)
            html_parsing = parse(res)
            export_to_csv('thn.csv')
            time.sleep(2)
Desire Output: Filter out all the articles under the N number of years.