Hi snippsat,
Many thanks for your reply to my Thread.
Here is the full Python Code :-
import pandas as pd
import requests
from bs4 import BeautifulSoup
import xlrd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
res1 = requests.get("https://www.air-shows.org.uk/2022/03/airshow-news-battle-of-britain-memorial-flight-display-dates-2022/")
soup1 = BeautifulSoup(res1.content,'lxml')
table1 = soup1.find_all('div',{'class' : 'post-content entry-content'})
lis1,lis2=[],[]
for table in table1:
lis1 = table.find_all('ul')
lis2 = table.find_all('h4')
new_lis1=[]
for l in lis1:
temp=l.find_all('li')
temp_x=[]
for t in temp:
temp_x.append(t.text)
new_lis1.append(temp_x)
new_lis2=[]
for l in lis2:
new_lis2.append(l.text)
all_new_list = []
for lis in new_lis1:
temp_x=[]
for item in lis:
item=item.replace('\xa0'," ")
item=item.replace('\xa0'," ")
item=item.replace('Flypast – ',"")
item=item.replace('Flypasts – ',"")
item=item.replace('Lancaster – ',"")
item=item.replace('17th- ',"17th – ")
item=item.replace('26th- ',"26th – ")
temp_x.append(item.split(" – "))
all_new_list.append(temp_x)
temp_df = pd.DataFrame(columns=['LOCATION','DATE','MONTH','LANCASTER','SPITFIRE','HURRICANE','DAKOTA'])
j=0
date,loc,month=[],[],[]
for i in range(1, len(new_lis2)):
for item in all_new_list[i]:
loc.append(item[1])
date.append(item[0])
month.append(new_lis2[i])
temp_df['LOCATION']=loc
temp_df['DATE'] = date
temp_df['MONTH'] = month
for i,row in temp_df.iterrows():
text = row[0]
if 'Lancaster Only' or 'Duxford' in text:
temp_df.iat[i,3] = 'L'
temp_df.iat[i,4] = '--'
temp_df.iat[i,5] = '--'
temp_df.iat[i,6] = '--'
for i,row in temp_df.iterrows():
text = row[0]
if 'Spitfire Only' in text:
temp_df.iat[i,3] = '--'
temp_df.iat[i,4] = 'S'
temp_df.iat[i,5] = '--'
temp_df.iat[i,6] = '--'
for i,row in temp_df.iterrows():
text = row[0]
if 'Lancaster Only' not in text and 'Spitfire Only' not in text and 'Duxford' not in text:
temp_df.iat[i,3] = 'L'
temp_df.iat[i,4] = 'S'
temp_df.iat[i,5] = 'H'
temp_df.iat[i,6] = '--'
for i,row in temp_df.iterrows():
text = row[1]
text=text.replace('th','')
text=text.replace('st','')
text=text.replace('nd','')
text=text.replace('rd','')
temp_df.iat[i,1] = text
for i,row in temp_df.iterrows():
date = row[1]
month = row[2]
if '-' in date:
if month == 'May' and 'June' not in date:
date_list = date.split("-")
for j in range(int(date_list[0]),int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'May' and 'June' in date:
date=date.replace(" June","")
date_list = date.split("-")
for j in range(int(date_list[0]),32):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
for j in range(1,int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":'June',"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'June' and 'July' not in date and 'SAT ONLY' not in row[0]:
date_list = date.split("-")
for j in range(int(date_list[0]),int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'June' and 'July' not in date and 'SAT ONLY' in row[0]:
date_list = date.split("-")
dic = {"LOCATION":row[0],"DATE":str(date_list[0]),"MONTH":row[2],"LANCASTER":'L',"SPITFIRE":'--','HURRICANE':'--','DAKOTA':'--'}
temp_df=temp_df.append(dic, ignore_index = True)
dic = {"LOCATION":row[0],"DATE":str(date_list[1]),"MONTH":row[2],"LANCASTER":'--',"SPITFIRE":'S','HURRICANE':'H','DAKOTA':'--'}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'June' and 'July' in date:
date=date.replace(" July","")
date_list = date.split("-")
for j in range(int(date_list[0]),31):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
for j in range(1,int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":'July',"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'July' and 'August' not in date:
date_list = date.split("-")
for j in range(int(date_list[0]),int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'July' and 'August' in date:
date=date.replace(" August","")
date_list = date.split("-")
for j in range(int(date_list[0]),32):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
for j in range(1,int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":'August',"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'August' and 'September' not in date:
date_list = date.split("-")
for j in range(int(date_list[0]),int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'August' and 'September' in date:
date=date.replace(" September","")
date_list = date.split("-")
for j in range(int(date_list[0]),32):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
for j in range(1,int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":'September',"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'September' and 'October' not in date:
date_list = date.split("-")
for j in range(int(date_list[0]),int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
if month == 'September' and 'October' in date:
date=date.replace(" October","")
date_list = date.split("-")
for j in range(int(date_list[0]),31):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":row[2],"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
for j in range(1,int(date_list[1])+1):
dic = {"LOCATION":row[0],"DATE":str(j),"MONTH":'October',"LANCASTER":row[3],"SPITFIRE":row[4],'HURRICANE':row[5],'DAKOTA':row[6]}
temp_df=temp_df.append(dic, ignore_index = True)
temp_df=temp_df.drop(temp_df[ temp_df['DATE'] == row[1] ].index)
temp_df.reset_index(drop=True, inplace=True)
for i,row in temp_df.iterrows():
date = row[1]
date=date.replace(' (30 Reserve)',"")
temp_df.iat[i,1]=date
for i,row in temp_df.iterrows():
month = row[2]
day = row[1]
if month == 'June':
date = '2022-'+str(5)+"-"+str(day)
elif month == 'July':
date = '2022-'+str(6)+"-"+str(day)
elif month == 'August':
date = '2022-'+str(7)+"-"+str(day)
elif month == 'September':
date = '2022-'+str(8)+"-"+str(day)
temp_df.iat[i,1] = date
temp_df = temp_df.drop('MONTH',axis=1)
#for i,row in temp_df.iterrows():
#loc = row[0]
#if 'All available BBMF Aircraft' in loc:
#temp_df.iat[i,2] = 'L'
#temp_df.iat[i,3] = 'S'
#temp_df.iat[i,4] = 'H'
#temp_df.iat[i,5] = 'D'
for i,row in temp_df.iterrows():
loc = row[0]
day = row[1]
if '24' in day and 'Headcorn' in loc or '25' in day and 'Headcorn' in loc:
temp_df.iat[i,2] = '--'
temp_df.iat[i,3] = 'S'
temp_df.iat[i,4] = '--'
for i,row in temp_df.iterrows():
loc = row[0]
day = row[1]
if '26' in day and 'Headcorn' in loc:
temp_df.iat[i,2] = 'L'
temp_df.iat[i,3] = 'S'
temp_df.iat[i,4] = '--'
for i,row in temp_df.iterrows():
loc = row[0]
loc = loc.replace("Lancaster","")
loc = loc.replace("Spitfire","")
loc = loc.replace("Hurricane","")
loc = loc.replace("Dakota","")
loc = loc.replace("S x 2","")
loc = loc.replace("S x 4","")
loc = loc.replace("H x 2","")
loc = loc.replace(" (, , , '","")
loc = loc.replace(" (, , , )","")
loc = loc.replace(" ()","")
loc = loc.replace(" (, , )","")
loc = loc.replace(" ()","")
loc = loc.replace("(, , , '","")
loc = loc.replace(" (, , , '","")
loc = loc.replace(" (, )","")
loc = loc.replace(" (SAT ONLY, , )","")
loc = loc.replace(" (, , , ","")
loc = loc.replace(" ()","")
loc = loc.replace("All available BBMF Aircraft","")
temp_df.iat[i,0]=loc
new_df = temp_df
#display = temp_df [a-zA-Z]
#temp_df.head()
#display = new_df[(new_df['LOCATION'].str.contains('[a-zA-Z]')) & (new_df['LANCASTER'].str.contains('L')) & (new_df['SPITFIRE'].str.contains('S', na=True))]
display = new_df[(new_df['LOCATION'].str.contains('[a-zA-Z]'))]
#display = new_df[(new_df['DATE'].str.contains('Sep'))]
#display = df3[(df3['LOCATION'].str.contains('[a-zA-Z]')) & (df3['DATE'].str.contains('-6-|-8-')) & (df3['LANCASTER'].str.contains('L')) & (df3['SPITFIRE'].str.contains('S', na=True)) & (df3['DAKOTA'].str.contains('D', na=True))]
#Use the above Line of Code when filtering DataFrame by Month
#Months = May Jun Jul Aug Sep #('-6-')) For Multiple Months use ('-6-|-8-'))
#Months = -5- -6- -7- -8- -9-
#display = df3[(df3['LOCATION'].str.contains('[a-zA-Z]')) & (df3['DATE'].str.contains('-6$|-8$')) & (df3['LANCASTER'].str.contains('L')) & (df3['SPITFIRE'].str.contains('S', na=True)) & (df3['DAKOTA'].str.contains('D', na=True))]
#df3['DATE'].str.contains('-6$')) or ('-6$|-8$')) For more than one Day. Use minus sign in front of the number when filtering the DataFrame by Days of Month.
#('-6$')) #('-6$|-8$'))
#print(display)
#display['DATE']= pd.to_datetime(display['DATE'],format='%Y-%m-%d')
#display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%m-%d-%Y')
#display=display.sort_values(by=['DATE'])
##added two lines above to convert date format
#display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%d-%b-%Y')
display['DATE']= pd.to_datetime(display['DATE'],format='%Y-%m-%d')
#display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%m-%d-%Y')
#display.drop([303], inplace=True)
#display.rename(index={16: '314', 17: '315'}, inplace=True)
display=display.rename_axis('MyIdx')
display=display.sort_values(by=['DATE','MyIdx'])
#display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%d-%b-%Y')
#display.drop_duplicates(subset=['LOCATION', 'DATE'], keep='last', inplace=True)
#display.drop('DAKOTA', axis=1, inplace=True)
#display.drop('Date', axis=1, inplace=True)
#display=display.dropna(subset=['SPITFIRE', 'HURRICANE'], how='all')
display=display[['LOCATION','DATE','LANCASTER','HURRICANE','SPITFIRE','DAKOTA']]
#display=display.fillna('--')
#display = display.sort_values(by=['DATE'])
display = display.sort_values(by=['DATE','LOCATION'])
display.reset_index(drop=True, inplace=True)
display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%d-%b-%Y')
display.to_csv("BBMF DISPLAY 2022.csv")
display
The lines of Code :-
item=item.replace('17th- ',"17th – ")
item=item.replace('26th- ',"26th – ")
sorted the initial problem I mentioned.
But now have a new problem, where the September Rows are not showing in the Output DataFrame, I have tried doing inspect Element, for September, on the Web page of which the Url is at the beginning of the Code. To see if anything is different for September, and i notice l, for Bournemouth Air Festiva whereas for others in the 'strong' part it starts with a single , or nothing. I tried replacing l, with a , or nothing and the September Rows still don't show.
With the following part of the Code :-
j=0
date,loc,month=[],[],[]
for i in range(1, len(new_lis2)):
for item in all_new_list[i]:
loc.append(item[1])
date.append(item[0])
month.append(new_lis2[i])
if I # out those two item.replace lines I posted before, and run the code :-
I get an
Error:
IndexError Traceback (most recent call last)
<ipython-input-40-f67479ed6fd4> in <module>
49 for i in range(1, len(new_lis2)):
50 for item in all_new_list[i]:
---> 51 loc.append(item[1])
52 date.append(item[0])
53 month.append(new_lis2[i])
IndexError: list index out of range
And if I use them, I get a correct DataFrame Output, with rows from June to August, but not September.
And if I change to :-
j=0
date,loc,month=[],[],[]
for i in range(len(new_lis2)):
for item in all_new_list[i]:
loc.append(item[1])
date.append(item[0])
month.append(new_lis2[i])
Which was what I used initially, and run the code, I get the same error, when I use those two lines of item.replace Code.
What makes me think the September Rows are not showing because of an anomoly in the text, is that hashing out those two lines of Code
, produces an list index out of range error. I run the full Code in Jupyter Notebook.
Any help anyone could give me, would be very much appreciated.
Best Regards
Eddie Winch
