Oct-12-2017, 11:41 PM
Ok, since DeaD_EyE introduced me to pathlib last night, I've been playing with it to get familiar with
its capabilities.
I wanted to get a bunch of files (3220 to be exact) from the US census Tiger files.
So, if you run this code, you'll probably want to stop it after a bit as delays that I've
inserted (so as not to abuse downloads) alone are almost an hour long.
I inserted a stopafter variable, set to 10 to stop after 10 files. Set to None if you actually have
a use for the data, and it will get all.
its capabilities.
I wanted to get a bunch of files (3220 to be exact) from the US census Tiger files.
So, if you run this code, you'll probably want to stop it after a bit as delays that I've
inserted (so as not to abuse downloads) alone are almost an hour long.
I inserted a stopafter variable, set to 10 to stop after 10 files. Set to None if you actually have
a use for the data, and it will get all.
"""Download and unpack US Census TIGER 2017 address zip files.

Scrapes the TIGER/ADDR index page for tl_* zip links, downloads each
one (sleeping between requests so as not to abuse the server), and
unpacks it into ./data, printing the extracted *.txt contents.
"""

from pathlib import Path
from shutil import unpack_archive
from time import sleep

import requests
from bs4 import BeautifulSoup


class TryThis:
    def __init__(self):
        self.debug = False
        # Stop after this many downloads; set to None to fetch everything (3220 files).
        self.stop_after = 10
        self.data_main_url = 'https://www2.census.gov/geo/tiger/TIGER2017/ADDR/'
        self.data_url = 'https://www2.census.gov/geo/tiger/TIGER2017/ADDR/tl_2017_01001_addr.zip'
        self.filelist = None          # populated by get_main_page()
        self.homepath = Path('.')
        self.data_dir = self.homepath / 'data'
        self.data_dir.mkdir(exist_ok=True, parents=True)
        self.soup_index_fname = self.data_dir / 'index.html'
        self.resp = None              # most recent requests.Response
        self.zip_fname = None         # path of the most recently saved zip
        self.get_main_page()
        self.get_files()

    # save to a file so we're not banging on website
    def get_main_page(self):
        """
        Extract filenames from download page

        :return: None
        """
        self.resp = requests.get(self.data_main_url)
        soup = BeautifulSoup(self.resp.content, 'lxml')
        # BUG FIX: the original called link.strip() and discarded the result
        # (str.strip returns a new string); strip before filtering.
        links = [anchor.get_text().strip() for anchor in soup.select('a')]
        self.filelist = [link for link in links if link.startswith('tl_')]
        print(f'{len(self.filelist)} files to download')

    def get_files(self):
        """
        Get all zip files from filelist, and extract on the fly

        :return: None
        """
        # BUG FIX: the counter was reset to 0 on every loop iteration, so the
        # stop_after check could never trigger; initialize it once, before the loop.
        filesdownloaded = 0
        for filename in self.filelist:
            # BUG FIX: build the URL from the current filename (the original
            # interpolated a broken placeholder, never using `filename`, and
            # doubled the slash since data_main_url already ends in '/').
            self.data_url = f'{self.data_main_url}{filename}'
            print(self.data_url)
            self.resp = requests.get(self.data_url)
            self.save(filename)
            self.zip_fname = self.data_dir.joinpath(Path(self.data_url).name)
            print(f'self.zip_fname: {self.zip_fname}')
            self.unpack()
            filesdownloaded += 1
            if self.stop_after and filesdownloaded >= self.stop_after:
                break
            sleep(1)  # be polite: pause between downloads

    def save(self, filename):
        """
        Saves each zip file

        :param filename: name of the zip file to write under data_dir
        :return: None
        """
        self.zip_fname = self.data_dir / filename
        self.zip_fname.write_bytes(self.resp.content)

    def unpack(self):
        """
        Unpack current file

        :return: None
        """
        unpack_archive(str(self.zip_fname), extract_dir=str(self.data_dir))
        # Show what was extracted (TIGER ADDR zips contain text members).
        for fpath in self.data_dir.glob('*.txt'):
            print(fpath, fpath.read_text())


if __name__ == '__main__':
    tt = TryThis()