Python Forum - I Need Help Editing My Code

Pages: 1 2

Hi there,

I have the following Python Code, which is run in Jupyter Notebook :-

import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import datetime as dt
class work:
    def __init__(self,link):
        self.link=link
        self.res=requests.get(self.link)
        self.soup=BeautifulSoup(self.res.content, "lxml")
        self.table = self.soup.find_all('table')[0]
        self.l = pd.read_html(str(self.table))

        
    def create(self):
        self.ll=[]
        for i in range(0,6):
            l1=self.l[1][0:1][i]
            l1=list(l1)
            self.ll.extend(l1)
        l2=self.l[1][2:]
        self.date=list(l2[0])
        self.location=list(l2[1])
        self.lancaster=list(l2[2])
        self.spitfire=list(l2[3])
        self.hurricane=list(l2[4])
        self.dakota=list(l2[5])
        
    def month(self):
        mm=self.l[1][1][1]
        
        if mm=='May':
            x=5
        elif mm=='June':
            x=6
        elif mm=='July':
            x=7
        elif mm=='August':
            x=8
        elif mm=='September':
            x=9
        else:
            x=0
        return x
            

        
        
        
    def refine(self):
        self.create()
        arr=np.asarray(self.date)
        temp=arr[0]
        for i in range(0,len(arr)):
            if arr[i]=='nan':
                arr[i]=temp
        
            else:
                temp=arr[i]
        self.y=list(arr)
        return self.y
    def convert(self):
        lx=[]
        x=self.refine()
        y=self.month()
        for i in range(0,len(x)):
            lx.append((dt.datetime(2006, y, int(x[i]))).strftime('%d-%b-%Y'))
        return lx
    
    def post(self):
        date=self.convert()
        dff = pd.DataFrame(list(zip(date,self.location,self.lancaster,self.spitfire,self.hurricane,self.dakota)), 
               columns =self.ll)
        return dff
        
        
        
#a=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/may05.html')
#b=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/june05.html')
#c=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/july05.html')
#d=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/august05.html')
#e=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/september05.html')  

a=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/may06.html')
b=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/june06.html')
c=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/july06.html')
d=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/august06.html')
e=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/september06.html')  

#a=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/may07.html')
#b=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/june07.html')
#c=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/july07.html')
#d=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/august07.html')
#e=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/september07.html')  

#a=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/may08.html')
#b=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/june08.html')
#c=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/july08.html')
#d=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/august08.html')
#e=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/september08.html')  

dff1=a.post()
dff2=b.post()
dff3=c.post()
dff4=d.post()
dff5=e.post()

X = pd.concat([dff1, dff2], axis=0)
Y = pd.concat([X, dff3], axis=0)
Z =  pd.concat([Y, dff4], axis=0)
F =  pd.concat([Z, dff5], axis=0)
F=pd.DataFrame(F)
#display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  

#display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Date'].str.contains('Jul')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  

#Use the above Line of Code when filtering DataFrame by Month
                                               
#Months = May Jun Jul Aug Sep                   #('Jun')) For Multiple Months use ('Jun|Jul')) For example
#Months = -05- -06- -07- -08- -09-              

display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Date'].str.contains('10$|20$')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  

#df3['DATE'].str.contains('-6$')) or ('-6$|-8$')) For more than one Day. Use minus sign in front of the number when filtering the DataFrame by Days of Month.

                          #('-6$'))                 #('-6$|-8$')) For example    

pd.options.display.max_rows = 1000   
pd.options.display.max_columns = 1000
display.drop('Lancaster', axis=1, inplace=True)
display=display.dropna(subset=['Spitfire', 'Hurricane'], how='all')
#display=display[['Date','Location','Dakota','Hurricane','Spitfire']]
display=display[['Location','Date','Dakota','Hurricane','Spitfire']]
display=display.fillna('--')
#display.reset_index(drop=True, inplace=True)
display.to_csv(r'C:\Users\Edward\Desktop\BBMF Schedules And Master Forum Thread Texts\BBMF-2006-Code (Dakota With Fighters).csv')

display['Date'] = pd.to_datetime(display['Date'])

display = display.sort_values(by='Date', key=lambda col: 100 * col.dt.day + col.dt.month)

display['Date']= pd.to_datetime(display['Date']).dt.strftime('%d-%b-%Y')

display.reset_index(drop=True, inplace=True)

display

Which I have adapted from another of my Codes, but when I run the Code, only the Columns and no Rows are shown in the DataFrame output.

I think the issue here, is in some of the Lines of Code, 'display' should be 'F' and or vice versa.

Could someone tell me where I need to make those changes, the aim is to get a DataFrame Output, like I aimed to get in my previous str.endswith Thread, which is in this Forum section, posted a few days ago.

Any help would be much appreciated.

Best Regards

Eddie Winch

Can anyone help me ?

Eddie

The following code will help you parse these pages.
This only displays the data, but you can easily get the contents from each of the tr's

It will create a new directory tree immediately below youe python code directory.
It is non destructive, so if you already have directory named deltadata, it will use that,
if not, it will create it.
the new directory tree is shown below:

Output:├── delatdata
    ├── data
    └── html
    └── html
        ├── august06_cache.html
        ├── july06_cache.html
        ├── june06_cache.html
        ├── may06_cache.html
        └── september06_cache.html

This file has a sub directory named html, where downloaded files will be cached.
This will allow you to play wiothout having to download the file each time.
With this, you can also run offline once you download the files the first time.

I added enumeration to help with identifying which tr's hold header verses actual data
for example, you can see that tr[0] contains the header td's

suggest running once without redirect to establish the directory tree
then run again (offline ok now as cache will exist)
python GetDeltaWebInfo.py > .../delatdata/data/trdata.html, replacing ... with your souce code directory

import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import datetime as dt
from pathlib import Path
import os
import sys
from PrettifyPage import PrettifyPage

class GetDeltaWebInfo:
    def __init__(self):
        os.chdir(os.path.abspath(os.path.dirname(__file__)))
        homepath = Path('.')
        self.deltapath = homepath / 'delatdata'        
        self.deltapath.mkdir(exist_ok=True)

        self.htmlpath = self.deltapath / 'html'
        self.htmlpath.mkdir(exist_ok=True)

        self.datapath = self.deltapath / 'data'
        self.datapath.mkdir(exist_ok=True)


    def make_soup(self, URL):
        fname = URL.split('/')[-1].split('.')[0]
        cachefile = self.htmlpath / f"{fname}_cache.html"
        if cachefile.exists():
            with cachefile.open('rb') as fp:
                page = fp.read()
        else:
            response = requests.get(URL)
            if response.status_code == 200:
                page = response.content
                with cachefile.open('wb') as fp:
                    fp.write(page)
            else:
                print(f"Cannot retreive {URU}\nStatus code: {response.status_code}")
                return None
        soup = BeautifulSoup(page, 'lxml')
        return soup


def main():
    pp = PrettifyPage()
    gdwi = GetDeltaWebInfo()
    urls = [
        'http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/may06.html',
        'http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/june06.html',
        'http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/july06.html',
        'http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/august06.html',
        'http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/september06.html'
    ]

    for URL in urls:
        soup = gdwi.make_soup(URL)
        table = soup.select('body > table:nth-child(6) > tbody:nth-child(1) > tr:nth-child(3) > td:nth-child(2) > table:nth-child(4)')       
        print(f"Data for ")
        ttable = soup.find_all('table')
        tdata = ttable[1]
        trs = tdata.find_all('tr')
        for n, tr in enumerate(trs):
            tds = tr.find_all('td')
            for n1, td in enumerate(tds):
                print(f"\n---------------------------- tr{n} td{n1} ----------------------------")
                print(f"{pp.prettify(td,2)}")


if __name__ == '__main__':
    main()

small sample of output

Output:---------------------------- tr0 td0 ----------------------------
<td align="center" valign="top" width="66">
  <h3>
    Date
  </h3>
</td>
 


---------------------------- tr0 td1 ----------------------------
<td align="center" valign="top" width="189">
  <h3>
    Location
  </h3>
</td>
 


---------------------------- tr0 td2 ----------------------------
<td align="center" valign="top" width="92">
  <h3>
    Lancaster
  </h3>
</td>
 


---------------------------- tr0 td3 ----------------------------
<td align="center" valign="top" width="78">
  <h3>
    Spitfire
  </h3>
</td>
 


---------------------------- tr0 td4 ----------------------------
<td align="center" valign="top" width="85">
  <h3>
    Hurricane
  </h3>
</td>
 


---------------------------- tr0 td5 ----------------------------
<td align="center" valign="top" width="76">
  <h3>
    Dakota
  </h3>
</td>
 


---------------------------- tr1 td0 ----------------------------
<td align="CENTER" colspan="6" valign="TOP">
  <p>
  </p>
  <h3>
    May
  </h3>
</td>
 


---------------------------- tr2 td0 ----------------------------
<td align="CENTER" valign="TOP" width="66">
  <p>
    1
  </p>
</td>
 


---------------------------- tr2 td1 ----------------------------
<td align="CENTER" valign="TOP" width="189">
  <p>
    <a href="http://web.archive.org/web/20060811233101/http://www.rotarywakefield.org.uk/page.php?domain_name=rotarywakefield.org.uk&amp;viewpage=gala%20%26%20car%20boot" target="new">
      Wakefield May Gala
    </a>
  </p>
</td>
 


---------------------------- tr2 td2 ----------------------------
<td align="CENTER" valign="TOP" width="92">
  <p>
  </p>
</td>
 


---------------------------- tr2 td3 ----------------------------
<td align="CENTER" valign="TOP" width="78">
  <p>
    S
  </p>
</td>
 


---------------------------- tr2 td4 ----------------------------
<td align="CENTER" valign="TOP" width="85">
  <p>
  </p>
</td>
 


---------------------------- tr2 td5 ----------------------------
<td align="CENTER" valign="TOP" width="76">
  <p>
  </p>
</td>

EDITED 7:08PM DST Fixed output which was incorrect

you will also need:
PrettifyPage.py -- use this name, put in same directory as above program:

from bs4 import BeautifulSoup
import requests
import pathlib


class PrettifyPage:
    def __init__(self):
        pass

    def prettify(self, soup, indent):
        pretty_soup = str()
        previous_indent = 0
        for line in soup.prettify().split("\n"):
            current_indent = str(line).find("<")
            if current_indent == -1 or current_indent > previous_indent + 2:
                current_indent = previous_indent + 1
            previous_indent = current_indent
            pretty_soup += self.write_new_line(line, current_indent, indent)
        return pretty_soup

    def write_new_line(self, line, current_indent, desired_indent):
        new_line = ""
        spaces_to_add = (current_indent * desired_indent) - current_indent
        if spaces_to_add > 0:
            for i in range(spaces_to_add):
                new_line += " "		
        new_line += str(line) + "\n"
        return new_line

Many thanks for taking the time, to write those Codes for me Larz60+, its very much appreciated.

Would you or can anyone else, answer the question I originally asked, about what changes I need to make to my latest Code, to get an DataFrame Row Output ? As I am only getting the Columns showing currently, when I run the Code. There wouldn't be an Output of Rows, if those days were not present, but Rows with days 10 and 20, do show in the full DataFrame Output for 2006. So not being present, isn't the issue here.

Best Regards

Eddie Winch Smile

Hi there,

I have sorted out the issue, I was having with my Code :-

Instead of using the following line of Code,
for filtering the Rows, for the 6th and 13th of a Month ones for example :-

(df3['DATE'].str.contains('-6$|13$'))

Which is from another Code of mine, I need to use the following instead :-

(F['Date'].str.contains('06-|13-'))

And now I get the filtering of Rows I want, from any combination of days.

Eddie Smile

Glad to hear it. I don't use pandas enough, so couldn't help on that end.

Thanks for sharing.

Hi there,

I am doing a search for Displays only now for the Output DataFrame, so in the filtering of Rows, I use the following line of Code :-

display = F[(F['Location'].str.contains('- Display'))

And I also changed a Row, with a Location saying Windermere Air Show to Windermere - Display for that Row,

using the following line of Code :-

display.loc[86,'Location']='Windermere - Display'

However in the Output when I run my Code, all the - Display Rows only show which is correct, but

The Windermere - Display Row shows as :-

Output:
Windermere - Display	NaN	NaN	NaN	NaN

Do I need, to put inplace=True as part of the display.loc line of Code, for the Data in the Row to show ? And if so what should the line read, when that is incorporated ? Or if not what change do I need to make ?

The other lines at the bottom of my Code, with the .loc line are :-

pd.options.display.max_rows = 1000   
pd.options.display.max_columns = 1000
display.drop('Lancaster', axis=1, inplace=True)
display=display.dropna(subset=['Spitfire', 'Hurricane'], how='all')
#display=display[['Date','Location','Dakota','Hurricane','Spitfire']]
display=display[['Location','Date','Dakota','Hurricane','Spitfire']]
display=display.fillna('--')
display.loc[86,'Location']='Windermere - Display'
display.reset_index(drop=True, inplace=True)
display.to_csv(r'C:\Users\Edward\Desktop\BBMF Schedules And Master Forum Thread Texts\BBMF-2006-Code (Dakota With Fighters).csv')
display

I tried moving the position, of that .loc Code line in the full Code, to other positions, but that made no difference, and I still get the Column values as NaN's in my Output. The Index position number '86' is correct, so an incorrect number for that, isn't the issue.

Update :-

If I use the following Line of Code :-

F[(F['Location'].str.contains('- Display|Win'))

I get the correct DataFrame Output, with the Windermere - Display Row, properly showing in the
correct position. But I would like to get the DataFrame Output I want, without including the |Win in that Line of Code, if possible. If someone could direct me, to what change(s) I need to make to achieve that, I would be very grateful.

Any help would be much appreciated

Regards

Eddie Winch

(Aug-26-2020, 01:17 PM)eddywinch82 Wrote: [ -> ]
display.loc[86,'Location']='Windermere - Display'
However in the Output when I run my Code, all the - Display Rows only show which is correct, but

The Windermere - Display Row shows as :-
Output:
Windermere - Display	NaN	NaN	NaN	NaN

I had to comment out the date filter ( & (F['Date'].str.contains('10$|20$'))), because otherwise there was no data in the dataframe. And because there was no data, display.loc[86,'Location']='Windermere - Display' created an item, that only had location data (that's why the other columns were NaN).

So without that filter, and with adding a new filter for & F['Location'].str.contains(' - Display'), I was able to get the following results:

                            Location         Date Dakota Hurricane Spitfire
0         Woodspring Wings - Display  01-Jul-2006      D         H        S
1   Duxford Flying Legends - Display  08-Jul-2006      D         H        S
2      Campbeltown Airshow - Display  23-Jul-2006      D       NaN      NaN
3               RAF Odiham - Display  27-Jul-2006      D       NaN        S
4             East Fortune - Display  29-Jul-2006      D         H        S
5          Whitby Carnival - Display  12-Aug-2006      D       NaN        S
6        Weymouth Carnival - Display  16-Aug-2006      D         H        S
7         Dawlish Carnival - Display  17-Aug-2006      D         H        S
8                Elvington - Display  19-Aug-2006      D         H        S
9                Elvington - Display  20-Aug-2006      D         H        S
10  Islay Airport Open Day - Display  20-Aug-2006      D       NaN      NaN
11               Twinwoods - Display  27-Aug-2006      D       NaN        S
12      Bodelwyddan Castle - Display  28-Aug-2006      D       NaN        S

I also commented out a lot of stuff at the end of the script, to try to simplify the issue. Full code here:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import datetime as dt
class work:
    def __init__(self,link):
        self.link=link
        self.res=requests.get(self.link)
        self.soup=BeautifulSoup(self.res.content, "lxml")
        self.table = self.soup.find_all('table')[0]
        self.l = pd.read_html(str(self.table))
 
         
    def create(self):
        self.ll=[]
        for i in range(0,6):
            l1=self.l[1][0:1][i]
            l1=list(l1)
            self.ll.extend(l1)
        l2=self.l[1][2:]
        self.date=list(l2[0])
        self.location=list(l2[1])
        self.lancaster=list(l2[2])
        self.spitfire=list(l2[3])
        self.hurricane=list(l2[4])
        self.dakota=list(l2[5])
         
    def month(self):
        mm=self.l[1][1][1]
         
        if mm=='May':
            x=5
        elif mm=='June':
            x=6
        elif mm=='July':
            x=7
        elif mm=='August':
            x=8
        elif mm=='September':
            x=9
        else:
            x=0
        return x
             
 
         
         
         
    def refine(self):
        self.create()
        arr=np.asarray(self.date)
        temp=arr[0]
        for i in range(0,len(arr)):
            if arr[i]=='nan':
                arr[i]=temp
         
            else:
                temp=arr[i]
        self.y=list(arr)
        return self.y
    def convert(self):
        lx=[]
        x=self.refine()
        y=self.month()
        for i in range(0,len(x)):
            lx.append((dt.datetime(2006, y, int(x[i]))).strftime('%d-%b-%Y'))
        return lx
     
    def post(self):
        date=self.convert()
        dff = pd.DataFrame(list(zip(date,self.location,self.lancaster,self.spitfire,self.hurricane,self.dakota)), 
               columns =self.ll)
        return dff
         
         
         
#a=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/may05.html')
#b=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/june05.html')
#c=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/july05.html')
#d=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/august05.html')
#e=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/september05.html')  
 
a=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/may06.html')
b=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/june06.html')
c=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/july06.html')
d=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/august06.html')
e=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/september06.html')  
 
#a=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/may07.html')
#b=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/june07.html')
#c=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/july07.html')
#d=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/august07.html')
#e=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/september07.html')  
 
#a=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/may08.html')
#b=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/june08.html')
#c=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/july08.html')
#d=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/august08.html')
#e=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/september08.html')  
 
dff1=a.post()
dff2=b.post()
dff3=c.post()
dff4=d.post()
dff5=e.post()
 
X = pd.concat([dff1, dff2], axis=0)
Y = pd.concat([X, dff3], axis=0)
Z =  pd.concat([Y, dff4], axis=0)
F =  pd.concat([Z, dff5], axis=0)
F=pd.DataFrame(F)
#display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  
 
#display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Date'].str.contains('Jul')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  
 
#Use the above Line of Code when filtering DataFrame by Month
                                                
#Months = May Jun Jul Aug Sep                   #('Jun')) For Multiple Months use ('Jun|Jul')) For example
#Months = -05- -06- -07- -08- -09-              
 
display = F[
    (F['Location'].str.contains('[a-zA-Z]'))
    & F['Location'].str.contains(' - Display')
   # & (F['Date'].str.contains('10$|20$'))
    & (F['Dakota'].str.contains('D'))
    & (F['Spitfire'].str.contains('S', na=True))
    & (F['Lancaster'] != 'L')
    ]  
 
#df3['DATE'].str.contains('-6$')) or ('-6$|-8$')) For more than one Day. Use minus sign in front of the number when filtering the DataFrame by Days of Month.
 
                          #('-6$'))                 #('-6$|-8$')) For example    
 
#pd.options.display.max_rows = 1000   
#pd.options.display.max_columns = 1000
#display.drop('Lancaster', axis=1, inplace=True)
#display=display.dropna(subset=['Spitfire', 'Hurricane'], how='all')
#display=display[['Date','Location','Dakota','Hurricane','Spitfire']]
display=display[['Location','Date','Dakota','Hurricane','Spitfire']]
#display=display.fillna('--')
#display.loc[86,'Location']='Windermere - Display'
display.reset_index(drop=True, inplace=True)

print(display)

That all said, I don't know anything about pandas, I'm just bashing things together to try to get it to work :/

Hi nilamo,

Sorry I should have said, I have updated my Full Code, since the one you show there, I will post the

Full Code I am using, relevant to the issue I am currently having, when I get home. Sorry I didn't make that clear in my latest post.

Many thanks for taking the time, to sort out the issue I am having.

Best Regards

Eddie Winch

Hi nilamo,

Here is the full Code I am using now :-

import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import datetime as dt
class work:
    def __init__(self,link):
        self.link=link
        self.res=requests.get(self.link)
        self.soup=BeautifulSoup(self.res.content, "lxml")
        self.table = self.soup.find_all('table')[0]
        self.l = pd.read_html(str(self.table))

        
    def create(self):
        self.ll=[]
        for i in range(0,6):
            l1=self.l[1][0:1][i]
            l1=list(l1)
            self.ll.extend(l1)
        l2=self.l[1][2:]
        self.date=list(l2[0])
        self.location=list(l2[1])
        self.lancaster=list(l2[2])
        self.spitfire=list(l2[3])
        self.hurricane=list(l2[4])
        self.dakota=list(l2[5])
        
    def month(self):
        mm=self.l[1][1][1]
        
        if mm=='May':
            x=5
        elif mm=='June':
            x=6
        elif mm=='July':
            x=7
        elif mm=='August':
            x=8
        elif mm=='September':
            x=9
        else:
            x=0
        return x
            

        
        
        
    def refine(self):
        self.create()
        arr=np.asarray(self.date)
        temp=arr[0]
        for i in range(0,len(arr)):
            if arr[i]=='nan':
                arr[i]=temp
        
            else:
                temp=arr[i]
        self.y=list(arr)
        return self.y
    def convert(self):
        lx=[]
        x=self.refine()
        y=self.month()
        for i in range(0,len(x)):
            lx.append((dt.datetime(2006, y, int(x[i]))).strftime('%d-%b-%Y'))
        return lx
    
    def post(self):
        date=self.convert()
        dff = pd.DataFrame(list(zip(date,self.location,self.lancaster,self.spitfire,self.hurricane,self.dakota)), 
               columns =self.ll)
        return dff
        
        
        
#a=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/may05.html')
#b=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/june05.html')
#c=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/july05.html')
#d=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/august05.html')
#e=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/september05.html')  

a=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/may06.html')
b=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/june06.html')
c=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/july06.html')
d=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/august06.html')
e=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/september06.html')  

#a=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/may07.html')
#b=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/june07.html')
#c=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/july07.html')
#d=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/august07.html')
#e=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/september07.html')  

#a=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/may08.html')
#b=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/june08.html')
#c=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/july08.html')
#d=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/august08.html')
#e=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/september08.html')  

dff1=a.post()
dff2=b.post()
dff3=c.post()
dff4=d.post()
dff5=e.post()

X = pd.concat([dff1, dff2], axis=0)
Y = pd.concat([X, dff3], axis=0)
Z =  pd.concat([Y, dff4], axis=0)
F =  pd.concat([Z, dff5], axis=0)
F=pd.DataFrame(F)
display = F[(F['Location'].str.contains('- Display')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  

#Months = May Jun Jul Aug Sep
#Months = -05- -06- -07- -08- -09-   #('[a-zA-Z]')) or #('- Display')) or  #('- Display|Win'))

#display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Date'].str.contains('Jul')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  

pd.options.display.max_rows = 1000   
pd.options.display.max_columns = 1000
display.drop('Lancaster', axis=1, inplace=True)
display=display.dropna(subset=['Spitfire', 'Hurricane'], how='all')
#display=display[['Date','Location','Dakota','Hurricane','Spitfire']]
display=display[['Location','Date','Dakota','Hurricane','Spitfire']]
display=display.fillna('--')
display.loc[86,'Location']='Windermere - Display'   #'Windermere Air Show'
display.reset_index(drop=True, inplace=True)
display.to_csv(r'C:\Users\Edward\Desktop\BBMF Schedules And Master Forum Thread Texts\BBMF-2006-Code (Dakota With Fighters).csv')
display

If I use the line of Code in my full Code :-

F[(F['Location'].str.contains('- Display'))

I get the Windermere - Display Row, shown at the bottom, of the Output DataFrame,
with all the values NaN.

Output:
Windermere - Display	NaN	NaN	NaN	NaN

If I use this line of Code instead :-

F[(F['Location'].str.contains('- Display|Win'))

I get the correct DataFrame Output, with the Windermere - Display Row displaying in the correct position,
and with all Data showing.

Do I need, to put inplace=True as part of the display.loc line of Code, for the Data in the Row to show ? And if so what should the line read, when that is incorporated ? Or if not what change do I need to make ?

I tried moving the position, of that .loc Code line in the full Code, to other positions, but that made no difference, and I still get the Column values as NaN's in my Output. The Index position number '86' is correct, so an incorrect number for that, isn't the issue.

I would like to get the DataFrame Output I want, without including the |Win in that Line of Code I mentioned before, if possible. Do you know how to do that nilamo ?

Your help is very much appreciated.

Regards

Eddie Winch Smile

Pages: 1 2