Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Webscraping - loop on first page
#1
I need some help with this code. It always returns the same (first) page over and over again instead of going to the next page. I can't figure out why this is the case. I guess the probem would be in the loop.

import csv
import time
from bs4 import BeautifulSoup

def fetch_data(url):
    try:
        response = requests.get(url, verify=False)  # Disable SSL verification
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
    except Exception as err:
        print(f"Other error occurred: {err}")
    return None

def process_all_pages(base_url, per_page=25):
    all_data = []
    page = 1
    while True:
        url = f"{base_url}?page={page}&per_page={per_page}"
        print(f"Fetching data from: {url}")  # Debug statement
        data = fetch_data(url)
        
        # Verbose debugging
        if data:
            print(f"Received response for page {page}: {data}")  # Print the raw response
        else:
            print(f"No data received for page {page}. Exiting loop.")
            break

        if 'data' in data and isinstance(data['data'], list):
            num_items = len(data['data'])
            print(f"Page {page}: Received {num_items} items")  # Debug statement
            all_data.extend(data['data'])
            
            # Check if there are more pages to fetch
            if num_items < per_page:
                # If fewer than per_page items are returned, this is the last page
                print(f"Page {page}: Last page reached with {num_items} items.")
                break
        else:
            print(f"Page {page}: No valid data or end of data reached")  # Debug statement
            break

        page += 1
        time.sleep(1)  # Adjust based on the rate limit

    return all_data


# Fetch all data from the API endpoint
json_url = "https://samviewer.digile.be/nl/sam/ampps.json"
all_data = process_all_pages(json_url)

if all_data:
    with open('output.csv', mode='w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        
        # Write the header row
        headers = ['CTI-extended', 'Naam', 'CNK publiek', 'Prijs publiek', 'Prijs af-fabriek', 'Vergunninghouder (verdeler)']
        csv_writer.writerow(headers)
        
        for item in all_data:
            # Extracting values and cleaning HTML tags
            cti_ext = extract_text_from_html(item.get('cti_ext', ''))
            name = extract_text_from_html(item.get('name', ''))
            cnk_p = extract_text_from_html(item.get('cnk_p', ''))
            price_pub = extract_text_from_html(item.get('price_pub', ''))
            price_exf = extract_text_from_html(item.get('price_exf', ''))
            company = extract_text_from_html(item.get('company', ''))

            # Construct row
            row = [
                cti_ext,
                name,
                cnk_p,
                price_pub,
                price_exf,
                company
            ]

            # Write the row to CSV only if at least one field has data
            if any(field for field in row):
                csv_writer.writerow(row)

        print(f"Data successfully written to output.csv. Total rows: {len(all_data)}")
else:
    print("No data retrieved from the API.")
Larz60+ write Jul-23-2024, 08:54 AM:
Please post all code, output and errors (it it's entirety) between their respective tags. Refer to BBCode help topic on how to post. Use the "Preview Post" button to make sure the code is presented as you expect before hitting the "Post Reply/Thread" button.
Tags have been added for you this time. Please use BBCode tags on future posts.
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Intro to WebScraping d1rjr03 3 5,843 Dec-16-2024, 02:50 AM
Last Post: bobprogrammer
  Webscraping news articles by using selenium cate16 7 6,286 Aug-28-2023, 09:58 AM
Last Post: snippsat
  Webscraping with beautifulsoup cormanstan 3 8,717 Aug-24-2023, 11:57 AM
Last Post: snippsat
  Webscraping returning empty table Buuuwq 0 2,612 Dec-09-2022, 10:41 AM
Last Post: Buuuwq
  WebScraping using Selenium library Korgik 0 1,682 Dec-09-2022, 09:51 AM
Last Post: Korgik
  How to get rid of numerical tokens in output (webscraping issue)? jps2020 0 2,560 Oct-26-2020, 05:37 PM
Last Post: jps2020
  Python Webscraping with a Login Website warriordazza 0 3,429 Jun-07-2020, 07:04 AM
Last Post: warriordazza
  use Xpath in Python :: libxml2 for a page-to-page skip-setting apollo 2 4,837 Mar-19-2020, 06:13 PM
Last Post: apollo
  Help with basic webscraping Captain_Snuggle 2 5,448 Nov-07-2019, 08:07 PM
Last Post: kozaizsvemira
  Can't Resolve Webscraping AttributeError Hass 1 3,111 Jan-15-2019, 09:36 PM
Last Post: nilamo

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020