I am practicing my python skills by scraping different websites and I came across The Hacker News Website I manage to scrape the article's title, links, author names..etc.The problem occurs when i try to advance the scraper feature by filtering between years e.g scrape all the articles in between 2018 and 2019 I try to implement it and it gives me some results but not exact results final output it produces include articles of 2017 as well.Here is my code:
from bs4 import BeautifulSoup import requests import csv import time results = [] def fetch(url): response = requests.get(url) #print(f' | Status code: {response.status_code}') return response def parse(response): #print(f'HTTP GET: {response.url} | Status code: {response.status_code}') content = BeautifulSoup(response.text, 'lxml') #Extract Data Fields labels = content.findAll('div', {'class': 'item-label'}) story_date = [[tag for tag in date][1] for date in labels] if '2019' or '2018' in story_date: story_title = [title.text for title in content.find_all('h2', {'class': 'home-title'})] story_link = [story_link['href'] for story_link in content.find_all('a', {'class': 'story-link'})] story_author = [[tag for tag in author][2].text.strip('\n')[1:] for author in labels] for index in range(0, len(story_date)): results.append({ 'date': story_date[index], 'title': story_title[index], 'link': story_link[index], 'author': story_author[index] }) def export_to_csv(filename): with open(filename, 'w', newline='') as csv_file: writer = csv.DictWriter(csv_file, fieldnames=results[0].keys()) writer.writeheader() for row in results: writer.writerow(row) if __name__ == '__main__': baseURL = 'https://thehackernews.com/search/label/' categories = ['data%20breach', 'Cyber%20Attack', 'Vulnerability', 'Malware'] years = ['2018', '2019', '2020', '2021'] for category in categories: for year in years: for page in range(0, 5): index = page + 19 url = baseURL + category + f'?updated-max={year}-06-09T13:30:00-07:00&max-results=20&start=' + str(index) + '&by-date=false' res = fetch(url) html_parsing = parse(res) export_to_csv('thn.csv') time.sleep(2)Desire Output: Filter out all the articles under the N number of years.