Python Forum
Web Scraping Project
#1
Hello,

I am pretty new to Python, and for our last semester project we are to develop a web scraper. We are to go to the iTunes charts and pull the top 100 for the user-selected category. Once the information is pulled, the rankings are to be stored in a txt file. The user then inputs which ranking they want more information on, and the program pulls that entry's data from the txt file and displays it. The program is to continue until the user wants to exit.

I am able to pull the chart information and write the txt file. However, when I enter a ranking that I want more details on, I run into issues: I can only pull the data for the first 67 entries in the txt file, and any number above that seems to crash the kernel. I am stuck and can't figure out what I am doing wrong. I would appreciate any insight into the code I have included below.
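
For context, each line of rankings.txt is written as rank,title,artist. The lookup step is meant to work roughly like the sketch below (a simplified illustration of the idea, not my actual code, which is further down):

#simplified sketch of the intended lookup, assuming each line of
#rankings.txt looks like "1,Some Title,Some Artist"
def lookup_rank(wanted_rank, filename='rankings.txt'):
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            rank, title, artist = line.rstrip('\n').split(',', 2)
            if int(rank) == wanted_rank:
                return title, artist
    return None   #rank not found in the file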

Thanks.



import sys
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

#this is a collection of the nine urls needed to access the charts. Since only one category can be accessed at a time,
#I assumed only the info from the selected category would be written into the text file.

song_url ='https://www.apple.com/itunes/charts/songs'
albums_url ='https://www.apple.com/itunes/charts/albums/'
fapps_url ='https://www.apple.com/itunes/charts/free-apps/'
papps_url = 'https://www.apple.com/itunes/charts/paid-apps/'
tapps_url = 'https://www.apple.com/itunes/charts/top-grossing-apps/'
books_url = 'https://www.apple.com/itunes/charts/paid-books/'
movies_url = 'https://www.apple.com/itunes/charts/movies/'
shows_url = 'https://www.apple.com/itunes/charts/tv-shows/'
videos_url = 'https://www.apple.com/itunes/charts/music-videos/'

#User instructions for program

def user_inst():
    print('This is a program that scrapes data from https://www.apple.com/itunes/charts for a class project.\n')
    print('What category would you like to know more about?\n')
    print(' (1) - Songs')
    print(' (2) - Albums')
    print(' (3) - Free Apps')
    print(' (4) - Paid Apps')
    print(' (5) - Top Grossing Apps')
    print(' (6) - Books')
    print(' (7) - Movies')
    print(' (8) - TV Shows')
    print(' (9) - Music Videos' )
    print(' (10) - Exit Program')

#asks user for category selection and error checks input

def user_select():
    while True:
        try:
            selection = int(input('What category would you like to know more about?\n'))
        except ValueError:
            print('Please enter a valid number')
            continue
        if 1 <= selection <= 10:
            return selection
        print('Please select a valid number')

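#maps the user's category selection to the matching chart url
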
def url_select(selection):
    if selection == 1:
        return song_url
    elif selection == 2:
        return albums_url
    elif selection == 3:
        return fapps_url
    elif selection == 4:
        return papps_url
    elif selection == 5:
        return tapps_url
    elif selection == 6:
        return books_url
    elif selection == 7:
        return movies_url
    elif selection == 8:
        return shows_url
    else:
        return videos_url
        

#Attempts to make a connection to the iTunes site

def make_connection(url):
    try:
        uClient = uReq(url)
        html = uClient.read()
        uClient.close()
        return html
    except Exception:
        print('Could not connect to the site.\n')
        sys.exit(1)

#uses BeautifulSoup library to clean up HTML and make it more manageable.

def make_soup(html):
    soup_bowl = soup(html, 'html.parser')
    match = soup_bowl.find_all('div', class_='section-content')
    content = match[1].ul
    return content

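#scrapes the selected chart and writes rank,title,artist lines to rankings.txt, or exits if the user chose 10
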
def file_create(pick):
    if pick == 10:
        print('Goodbye!')
        sys.exit()
    url = url_select(pick)
    page_html = make_connection(url)
    page_soup = make_soup(page_html)
    filename = 'rankings.txt'
    cat_list = []
    with open(filename, 'w', encoding='utf-8') as f:
        for li in page_soup.findAll('li'):
            for strong in li.findAll('strong'):
                rank = strong.text
                rank = rank[:-1]   #strip the trailing '.' from the rank, e.g. '1.' -> '1'
            for h3 in li.findAll('h3'):
                title = h3.text
            for h4 in li.findAll('h4'):
                artist = h4.text
            cat_list.append(rank)
            cat_list.append(title)
            cat_list.append(artist)
            f.write(rank + ',' + title + ',' + artist + '\n')
    print('Rankings saved in file named rankings.txt.\n')
    return


#asks user for rank they would like more information on and error checks the input

def user_rank():
    while True:
        try:
            selection = int(input('What ranking would you like to know more about?\n'))
        except ValueError:
            print('Please enter a valid number')
            continue
        if 1 <= selection <= 100:
            return selection
        print('Please select a valid number')

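#drives the program: shows the menu, scrapes the chosen chart, then looks up the requested rank
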
def main():
    user_inst()
    choice = user_select()
    file_create(choice)
    u_rank = user_rank()
    with open('rankings.txt', 'r', encoding='utf-8') as fhand:
        for line in fhand:
            if str(u_rank) in line:
                splitlines = line.split(',')
                u_title = splitlines[1]
                u_artist = splitlines[2]
                print('The information for rank ', u_rank, 'is ', u_title, '-', u_artist)
                break
    return

main()