Oct-12-2017, 11:41 PM
Ok, since DeaD_EyE introduced me to pathlib last night, I've been playing with it to get familiar with
its capabilities.
I wanted to get a bunch of files (3220 to be exact) from the US census Tiger files.
So, if you run this code, you'll probably want to stop it after a bit as delays that I've
inserted (so as not to abuse downloads) alone are almost an hour long.
I inserted a stopafter variable, set to 10 to stop after 10 files. Set to None if you actually have
a use for the data, and it will get all.
its capabilities.
I wanted to get a bunch of files (3220 to be exact) from the US census Tiger files.
So, if you run this code, you'll probably want to stop it after a bit as delays that I've
inserted (so as not to abuse downloads) alone are almost an hour long.
I inserted a stopafter variable, set to 10 to stop after 10 files. Set to None if you actually have
a use for the data, and it will get all.
"""Download and unpack US Census TIGER 2017 address zip files.

Scrapes the TIGER/ADDR index page for tl_* zip links, downloads each
one (sleeping between requests so as not to abuse the server), and
unpacks it into ./data, printing the extracted *.txt contents.
"""

from pathlib import Path
from shutil import unpack_archive
from time import sleep

import requests
from bs4 import BeautifulSoup


class TryThis:
    def __init__(self):
        self.debug = False
        # Stop after this many downloads; set to None to fetch everything (3220 files).
        self.stop_after = 10
        self.data_main_url = 'https://www2.census.gov/geo/tiger/TIGER2017/ADDR/'
        self.data_url = 'https://www2.census.gov/geo/tiger/TIGER2017/ADDR/tl_2017_01001_addr.zip'
        self.filelist = None          # populated by get_main_page()
        self.homepath = Path('.')
        self.data_dir = self.homepath / 'data'
        self.data_dir.mkdir(exist_ok=True, parents=True)
        self.soup_index_fname = self.data_dir / 'index.html'
        self.resp = None              # most recent requests.Response
        self.zip_fname = None         # path of the most recently saved zip
        self.get_main_page()
        self.get_files()

    # save to a file so we're not banging on website
    def get_main_page(self):
        """
        Extract filenames from download page

        :return: None
        """
        self.resp = requests.get(self.data_main_url)
        soup = BeautifulSoup(self.resp.content, 'lxml')
        # BUG FIX: the original called link.strip() and discarded the result
        # (str.strip returns a new string); strip before filtering.
        links = [anchor.get_text().strip() for anchor in soup.select('a')]
        self.filelist = [link for link in links if link.startswith('tl_')]
        print(f'{len(self.filelist)} files to download')

    def get_files(self):
        """
        Get all zip files from filelist, and extract on the fly

        :return: None
        """
        # BUG FIX: the counter was reset to 0 on every loop iteration, so the
        # stop_after check could never trigger; initialize it once, before the loop.
        filesdownloaded = 0
        for filename in self.filelist:
            # BUG FIX: build the URL from the current filename (the original
            # interpolated a broken placeholder, never using `filename`, and
            # doubled the slash since data_main_url already ends in '/').
            self.data_url = f'{self.data_main_url}{filename}'
            print(self.data_url)
            self.resp = requests.get(self.data_url)
            self.save(filename)
            self.zip_fname = self.data_dir.joinpath(Path(self.data_url).name)
            print(f'self.zip_fname: {self.zip_fname}')
            self.unpack()
            filesdownloaded += 1
            if self.stop_after and filesdownloaded >= self.stop_after:
                break
            sleep(1)  # be polite: pause between downloads

    def save(self, filename):
        """
        Saves each zip file

        :param filename: name of the zip file to write under data_dir
        :return: None
        """
        self.zip_fname = self.data_dir / filename
        self.zip_fname.write_bytes(self.resp.content)

    def unpack(self):
        """
        Unpack current file

        :return: None
        """
        unpack_archive(str(self.zip_fname), extract_dir=str(self.data_dir))
        # Show what was extracted (TIGER ADDR zips contain text members).
        for fpath in self.data_dir.glob('*.txt'):
            print(fpath, fpath.read_text())


if __name__ == '__main__':
    tt = TryThis()