I'm probably jumping the gun here as my code is still pretty crappy, but I'm trying to develop my first working application. I've set up version control locally on my hard drive (no remotes yet) and started commenting my working code the way I've seen code commented in other people's Git repositories. Could you have a look at this and comment on the style as much as on the (admittedly inexpert) code?
obtainer.py
"""
Obtains the first page of Google search results for the specified search
term in html format and writes it to an html file in the working
directory. Imports the core module (pasted below).
"""

from string import ascii_letters, digits

from core import *


def user_input():
    """Obtains the user input as a string representing the search term.

    Verifies that all characters in the search term string are legal.

    :returns: The search term string.
    """
    screen = digits + ascii_letters + " "
    print(screen)
    while True:
        search_term = input("Input the search term: ")
        scan = [char in screen for char in search_term]
        print(scan)
        if False not in scan:
            return search_term
        else:
            print("Invalid character--try again.")


def parse_url(search_term):
    """Generates a Google search url from the search term.

    If the search term is more than one word, adds '+' between words.
    This function is called by get_and_write_page() below.

    :search_term: The search term input by the user.
    :returns: The url for the search term.
    """
    # Base URL so the += below has something to build on.
    url = "https://www.google.com/search?q="
    search_term_list = search_term.split()
    if len(search_term_list) > 1:
        for index in range(len(search_term_list) - 1):
            url += search_term_list[index] + "+"
        url += search_term_list[-1]
    else:
        url += search_term
    print("{} parsed to {}.".format(search_term, url))
    return url


def get_and_write_page():
    """Calls parse_url() above to obtain url from search term input by
    the user in user_input() above. Calls the imported check_and_get()
    function to download the Google search page in html format. Writes
    downloaded data to disk in binary format as an html file in the
    working directory.

    :returns: None
    """
    search_term = user_input()
    url = parse_url(search_term)
    raw_html = check_and_get(url)
    print("{} obtained from {}.".format(search_term, url))
    if raw_html is not None:
        filename = "Google search results for " + search_term + ".html"
        with open(filename, "bw") as f:
            f.write(raw_html)
        print("File {} written.".format(filename))
    else:
        print("No data obtained.")


if __name__ == "__main__":
    get_and_write_page()
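One part I suspect is clumsy: building the query string by hand in parse_url(). If I understand the standard library correctly, urllib.parse.quote_plus already does the space-to-'+' conversion (and percent-escapes other unsafe characters too), so the whole function could shrink to something like this sketch (the base URL is just my guess at what Google expects):

from urllib.parse import quote_plus

def parse_url(search_term):
    # quote_plus turns "foo bar" into "foo+bar" and percent-escapes
    # anything else that isn't URL-safe
    return "https://www.google.com/search?q=" + quote_plus(search_term)

Would that be considered more idiomatic, or is the explicit loop acceptable?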
core.py
"""
This module was obtained, with minor modifications, from the article
'Practical Introduction to Web Scraping in Python' by Colin O'Keefe.
It makes a URL request to a webpage and verifies that the page contains
HTML/XML data. If it doesn't, it displays an error message and writes an
error log to the working directory. If there is HTML/XML in the webpage,
the main function of the module extracts the data and returns it.
"""

from contextlib import closing
from datetime import datetime

from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException


def is_good_response(resp):
    """Returns True if the response seems to be HTML/XML, False otherwise.

    Called by check_and_get() below.

    :resp: Response from the closing() context in check_and_get() below.
    :returns: boolean value representing whether there is HTML/XML in
        the webpage at the URL
    """
    content_type = resp.headers["Content-Type"].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find("html") > -1)


def log_error(e):
    """In case of error, writes log file to working directory and prints
    error to console.

    :e: Error message string passed in by check_and_get() below.
    :returns: None
    """
    # str() is needed here: datetime.today() returns a datetime object,
    # which cannot be concatenated with a string directly.
    logstamp = str(datetime.today()) + "-error.log"
    with open(logstamp, "w") as f:
        f.write(e)
    print(e)


def check_and_get(url):
    """Attempts to get the contents of a webpage by making an HTTP GET
    request. If the content-type response is HTML/XML, returns the text
    content, else returns None.

    :url: Full website URL
    :returns: HTML/XML content of webpage or None
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        log_error("Error during request to {0} : {1}".format(url, str(e)))
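In case it helps the review, here's a minimal stub I sketched (FakeResponse is my own made-up class, not part of requests) to sanity-check is_good_response() without touching the network:

class FakeResponse:
    # Mimics only the two attributes is_good_response() inspects.
    def __init__(self, status_code, content_type):
        self.status_code = status_code
        self.headers = {"Content-Type": content_type}

assert is_good_response(FakeResponse(200, "text/html; charset=utf-8"))
assert not is_good_response(FakeResponse(404, "text/html"))
assert not is_good_response(FakeResponse(200, "application/json"))

If the logic is right, all three assertions should pass, but I'd welcome advice on whether this is a sensible way to test it.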