Insert results API in database - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: Insert results API in database (/thread-8920.html) |
Insert results API in database - daryl - Mar-13-2018 Hello, I am new to programming and to Python. I am beginning to play with the requests lib, which is quite impressive. My first tests were to retrieve some data through some websites: import requests r = requests.get("http://somewebsite") print(r.headers) Really simple, really smart. I would like to know if I can insert the results in a database to store these data and query them. If you have some tutorials or examples I'll be very glad. Regards, RE: Insert results API in database - Larz60+ - Mar-13-2018 I use a method that saves the request to a file, and checks the created date on the file each time it's run. If the file is newer than the refresh time, the file will be loaded rather than fetching a new copy; otherwise it will get a new one. This may be a bit more than you want, but you can look at the code to get an idea. There are three modules involved: CheckInternet.py - checks that there is a network connection available GetUrl.py - does the actual fetching of a URL GetPage.py - takes care of caching; calls GetUrl, which does the request I modified the GetPage.py testit routine on the fly for this post without testing, so if it doesn't work, let me know. This code was part of a presentation I gave for a makeit meeting, and has a full Jupyter notebook document with the full code on GitHub here: https://github.com/Larz60p/MakerProject if you have any interest in how the above code was used in a full package. 
prog1 CheckInternet.py: import socket class CheckInternet: def __init__(self): self.internet_available = False def check_availability(self): self.internet_available = False if socket.gethostbyname(socket.gethostname()) != '127.0.0.1': self.internet_available = True return self.internet_available def testit(): ci = CheckInternet() print('Please turn internet OFF, then press Enter') input() ci.check_availability() print(f'ci.internet_available: {ci.internet_available}') if not ci.internet_available: print(' Off test successful') else: print(' Off test failed') print('Please turn internet ON, then press Enter') input() ci.check_availability() print(f'ci.internet_available: {ci.internet_available}') if ci.internet_available: print(' On test successful') else: print(' On test failed') if __name__ == '__main__': testit()prog2 GetUrl.py: import requests import CheckInternet import sys class GetUrl: def __init__(self): self.ci = CheckInternet.CheckInternet() self.ok_status = 200 self.r = None def fetch_url(self, url): self.r = None if self.ci.check_availability(): self.r = requests.get(url, allow_redirects=False) return self.r def testit(): gu = GetUrl() page = gu.fetch_url('https://www.google.com/') count = 0 maxcount = 20 try: if page.status_code == 200: ptext = page.text.split('/n') for line in ptext: print(f'{line}\n') count += 1 if count > maxcount: break else: print(f'Error retreving file status code: {page.status_code}') except AttributeError: print('Please enable internet and try again') if __name__ == '__main__': testit()prog3 GetPage: import GetUrl import time import sys class GetPage: def __init__(self): """ Initalize - Instantiate imported modules, initialize class variables """ self.elapsed_hours = 0 self.gu = GetUrl.GetUrl() self.savefile = None def get_page(self, url, savefile=None, refresh_hours_every=48): self.url = url self.savefile = savefile self.refresh_hours_every = refresh_hours_every self.page = None if self.savefile: if self.savefile.exists(): 
lstats = savefile.lstat() self.elapsed_hours = (time.time() - lstats.st_mtime) / 3600 if lstats.st_size == 0 or (self.elapsed_hours > self.refresh_hours_every): self.page = self.download_new_file() else: with self.savefile.open('r') as f: self.page = f.read() else: self.page = self.download_new_file() else: self.page = self.download_new_file() return self.page def download_new_file(self): page = None try: page = self.gu.fetch_url(self.url) if page.status_code == 200: with self.savefile.open('wb') as f: f.write(page.content) else: print(f'Invalid status code: {page.st}') except AttributeError: print('Please enable internet and try again') return page def testit(): from pathlib import Path homepath = Path('.') datapath = self.homepath / 'data' datapath.mkdir(exist_ok=True) htmlpath = self.datapath / 'html' htmlpath.mkdir(exist_ok=True) rfc_index_html = self.htmlpath / 'rfc_index.html' # Test url = rfc index download page, save to data/html/rfc_index.html, refresh always gp = GetPage() page = gp.get_page(url='https://www.rfc-editor.org/rfc/', savefile=rfc_index_html, refresh_hours_every=0) if page: if page.status_code == 200: print(f'Page contents: {page.text}') else: print('Page is empty or in') if __name__ == '__main__': testit() RE: Insert results API in database - thomasp - Mar-16-2018 Try to use sqlalchemy when working with databases, eg: from sqlalchemy.ext.declarative import declarative_base Base = declarative_base() from sqlalchemy import create_engine engine = create_engine('sqlite:////tmp/db.sqlite') # from sqlalchemy import Column, Text, String class Header(Base): __tablename__ = 'headers' url = Column(String(500), unique=True, nullable=False, primary_key=True) data = Column(Text) Base.metadata.create_all(engine) from sqlalchemy.orm import sessionmaker Session = sessionmaker(bind=engine) session = Session() import requests url = "https://python-forum.io" r = requests.get(url) h = Header(url=url, data = str(r.headers)) print(h.data) session.add(h) 
session.commit()then... t@tbox:~$ sqlite3 /tmp/db.sqlite "select * from headers" https://python-forum.io|{'Server': 'Apache', 'Cache-Control': 'no-store, ... |