Click here to Skip to main content
15,888,527 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
Hi there,

I have the following Python Code, which is run in Jupyter Notebook :-

<pre>import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import datetime as dt
class work:
    def __init__(self,link):
        self.link=link
        self.res=requests.get(self.link)
        self.soup=BeautifulSoup(self.res.content, "lxml")
        self.table = self.soup.find_all('table')[0]
        self.l = pd.read_html(str(self.table))
 
         
    def create(self):
        self.ll=[]
        for i in range(0,6):
            l1=self.l[1][0:1][i]
            l1=list(l1)
            self.ll.extend(l1)
        l2=self.l[1][2:]
        self.date=list(l2[0])
        self.location=list(l2[1])
        self.lancaster=list(l2[2])
        self.spitfire=list(l2[3])
        self.hurricane=list(l2[4])
        self.dakota=list(l2[5])
         
    def month(self):
        mm=self.l[1][1][1]
         
        if mm=='May':
            x=5
        elif mm=='June':
            x=6
        elif mm=='July':
            x=7
        elif mm=='August':
            x=8
        elif mm=='September':
            x=9
        else:
            x=0
        return x
             
 
         
         
         
    def refine(self):
        self.create()
        arr=np.asarray(self.date)
        temp=arr[0]
        for i in range(0,len(arr)):
            if arr[i]=='nan':
                arr[i]=temp
         
            else:
                temp=arr[i]
        self.y=list(arr)
        return self.y
    def convert(self):
        lx=[]
        x=self.refine()
        y=self.month()
        for i in range(0,len(x)):
            lx.append((dt.datetime(2006, y, int(x[i]))).strftime('%d-%b-%Y'))
        return lx
     
    def post(self):
        date=self.convert()
        dff = pd.DataFrame(list(zip(date,self.location,self.lancaster,self.spitfire,self.hurricane,self.dakota)), 
               columns =self.ll)
        return dff
         
         
         
#a=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/may05.html')
#b=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/june05.html')
#c=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/july05.html')
#d=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/august05.html')
#e=work('http://web.archive.org/web/20050726230748/http://www.raf.mod.uk/bbmf/september05.html')  
 
a=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/may06.html')
b=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/june06.html')
c=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/july06.html')
d=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/august06.html')
e=work('http://web.archive.org/web/20060811232523/http://www.deltaweb.co.uk/bbmf/september06.html')  
 
#a=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/may07.html')
#b=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/june07.html')
#c=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/july07.html')
#d=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/august07.html')
#e=work('http://web.archive.org/web/20070701133815/http://www.bbmf.co.uk/september07.html')  
 
#a=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/may08.html')
#b=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/june08.html')
#c=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/july08.html')
#d=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/august08.html')
#e=work('http://web.archive.org/web/20081116021904/http://www.bbmf.co.uk/september08.html')  
 
dff1=a.post()
dff2=b.post()
dff3=c.post()
dff4=d.post()
dff5=e.post()
 
X = pd.concat([dff1, dff2], axis=0)
Y = pd.concat([X, dff3], axis=0)
Z =  pd.concat([Y, dff4], axis=0)
F =  pd.concat([Z, dff5], axis=0)
F=pd.DataFrame(F)
#display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  
 
#display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Date'].str.contains('Jul')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  
 
#Use the above Line of Code when filtering DataFrame by Month
                                                
#Months = May Jun Jul Aug Sep                   #('Jun')) For Multiple Months use ('Jun|Jul')) For example
#Months = -05- -06- -07- -08- -09-              
 
display = F[(F['Location'].str.contains('[a-zA-Z]')) & (F['Date'].str.contains('10$|20$')) & (F['Dakota'].str.contains('D')) & (F['Spitfire'].str.contains('S', na=True)) & (F['Lancaster'] != 'L')]  
 
#display['DATE'].str.contains('-6$')) or ('-6$|-8$')) For more than one Day. Use minus sign in front of the number when filtering the DataFrame by Days of the Month, which are below 10.
 
                          #('-6$'))                 #('-6$|-8$')) For example    
 
pd.options.display.max_rows = 1000   
pd.options.display.max_columns = 1000
display.drop('Lancaster', axis=1, inplace=True)
display=display.dropna(subset=['Spitfire', 'Hurricane'], how='all')
#display=display[['Date','Location','Dakota','Hurricane','Spitfire']]
display=display[['Location','Date','Dakota','Hurricane','Spitfire']]
display=display.fillna('--')
#display.reset_index(drop=True, inplace=True)
display.to_csv(r'C:\Users\Edward\Desktop\BBMF Schedules And Master Forum Thread Texts\BBMF-2006-Code (Dakota With Fighters).csv')
 
display['Date'] = pd.to_datetime(display['Date'])
 
display = display.sort_values(by='Date', key=lambda col: 100 * col.dt.day + col.dt.month)
 
display['Date']= pd.to_datetime(display['Date']).dt.strftime('%d-%b-%Y')
 
display.reset_index(drop=True, inplace=True)
 
display


Which I have adapted from another of my Codes, but when I run the Code, only the Columns and no Rows are shown in the DataFrame Output.

Here is the other Code, I adapted the above one from :-

<pre>import pandas as pd
import requests
from bs4 import BeautifulSoup
from contextlib import suppress  

res1 = requests.get("http://web.archive.org/web/20020602133812/http://www.raf.mod.uk/bbmf/calendar.html")     
res2 = requests.get("http://web.archive.org/web/20020803081304/http://www.raf.mod.uk/bbmf/calendar.html")
soup1 = BeautifulSoup(res1.content,'lxml')
soup2 = BeautifulSoup(res2.content,'lxml')
table1 = soup1.find_all('table', align="CENTER")[0]
table2 = soup2.find_all('table', align="CENTER")[0]
df1 = pd.read_html(str(table1))
df2 = pd.read_html(str(table2))


   
df1 = df1[0]
df2 = df2[0]
df = pd.concat([df2, df1], axis=0)
  
  
##################
##################
##################
  
  
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
 
 
#make df[0] to list
list=[]
for i in df[0]:
    list.append(i)
  
#reverse the list to make split to sublist easier
list.reverse()
  
#split list to sublist using condition len(val)> 2 
size = len(list) 
idx_list = [idx + 1 for idx, val in
            enumerate(list) if len(val) > 2] 
res = [list[i: j] for i, j in
        zip([0] + idx_list, idx_list + 
        ([size] if idx_list[-1] != size else []))] 
  
#make monthname to numbers and print
for i in res:
    for j in range(len(i)):
        if i[j].upper()=='JUNE':
            i[j]='6'
        elif i[j].upper() =='MAY':
            i[j]='5'
        elif i[j].upper() == 'APRIL':
            i[j]='4'
        elif i[j].upper() =='JANUARY':
            i[j]='1'
        elif i[j].upper() == 'FEBRUARY':
            i[j]='2'
        elif i[j].upper() =='MARCH':
            i[j]='3'
        elif i[j].upper() == 'JULY':
            i[j]='7'        
        elif i[j].upper() =='AUGUST':
            i[j]='8'
        elif i[j].upper() == 'SEPTEMBER':
            i[j]='9'
        elif i[j].upper() =='OCTOBER':
            i[j]='10'
        elif i[j].upper() == 'NOVEMBER':
            i[j]='11'
        elif i[j].upper() =='DECEMBER':
            i[j]='12'       
  
  
#append string and append to new list
finallist=[]
for i in res:
    for j in range(len(i)):
        if j < len(i) - 1:
            #print(f'2004-{i[-1]}-{i[j]}')
            finallist.append(f'2002-{i[-1]}-{i[j]}')
#print(finallist)
finallist.reverse()
  
  
  
  
#print("\n=== ORIGINAL DF ===\n")
#print(df)
  
  
  
#convert dataframe to list
listtemp1=df.values.tolist()
  
#replace found below values with 0000_removable
removelist=['LOCATION','LANCASTER','SPITFIRE','HURRICANE','DAKOTA','DATE','JUNE','JANUARY','FEBRUARY','MARCH','MAY','JULY','AUGUST','SEPTEMBER','OCTOBER','NOVEMBER','DECEMBER','APRIL']
for i in listtemp1:
    for j in range(len(i)):
        for place in removelist:
            if str(i[j]).upper()==place:
                i[j]='0000_removable'
            else:
                pass
  
                  
#remove sublists with the replaced values we redirected
dellist=['0000_removable', '0000_removable', '0000_removable', '0000_removable', '0000_removable', '0000_removable']
res = [i for i in listtemp1 if i != dellist]
  
#assign back to dataframe DF3
df3=pd.DataFrame()
df3=pd.DataFrame(res, columns=['Date','LOCATION','LANCASTER','SPITFIRE','HURRICANE','DAKOTA'])
#print("\n=== AFTER REMOVE month and column names from DF, assigned to new as DF3 ===\n")
#print(df3)
  
  
#now assign that sorted date list to dataframe DF3
idx = 0
df3.insert(loc=idx, column='DATE', value=finallist)
pd.options.display.max_rows = 500
  
#print("\n=== FINAL DF3 after joining the edited date format column list ===\n")
#print(df3)
  
  
#validation logic if needed compare processed date from new joined "edited_Date_format" column with already existing "Date" column
#df3['ED1']=  pd.to_datetime(df3['EDITED_DATE_FORMAT'],format='%Y-%m-%d').dt.day
#df3['validation of date'] = df3.apply(lambda x: str(x['ED1']) == x['Date'], axis=1)
  
  
#convert df3['EDITED_DATE_FORMAT'] column from object to datetime64 foramt
#df3['EDITED_DATE_FORMAT']= pd.to_datetime(df3['EDITED_DATE_FORMAT'],format='%Y-%m-%d')
  
  
  
##################
##################
##################
  
  
 
 
  
 
#df3 = df3.rename(columns=df.iloc[0])
#df3 = df.iloc[2:]
#df3.head(15)
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

df3['LANCASTER'] = df3['LANCASTER'].replace({'X': 'L'})
df3['HURRICANE'] = df3['HURRICANE'].replace({'X': 'H'})
df3['SPITFIRE'] = df3['SPITFIRE'].replace({'X': 'S'})
df3['SPITFIRE'] = df3['SPITFIRE'].replace({'X x 2': 'SS'})
df3['DAKOTA'] = df3['DAKOTA'].replace({'X': 'D'})


#display = df3[(df3['LOCATION'].str.contains('[a-zA-Z]')) & (df3['DAKOTA'].str.contains('X')) & (df3['SPITFIRE'].str.contains('X', na=True)) & (df3['LANCASTER'] != 'X')]    
#display = df3[(df3['LOCATION'].str.contains('[a-zA-Z]')) & (df3['DAKOTA'].str.contains('D')) & (df3['SPITFIRE'].str.contains('S', na=True)) & (df3['LANCASTER'] != 'L')]    

#display = df3[(df3['LOCATION'].str.contains('[a-zA-Z]')) & (df3['DATE'].str.contains('-6-')) & (df3['DAKOTA'].str.contains('D')) & (df3['SPITFIRE'].str.contains('S', na=True)) & (df3['LANCASTER'] != 'L')]    

#Use the above Line of Code when filtering DataFrame by Month

#Months = May Jun Jul Aug Sep                #('-6-')) For Multiple Months use ('-6-|-8-')) For example
#Months = -5- -6- -7- -8- -9-

display = df3[(df3['LOCATION'].str.contains('[a-zA-Z]')) & (df3['DATE'].str.contains('-6$|13$')) & (df3['DAKOTA'].str.contains('D')) & (df3['SPITFIRE'].str.contains('S', na=True)) & (df3['LANCASTER'] != 'L')]    

#df3['DATE'].str.contains('-6$')) or ('-6$|-8$')) For more than one Day. Use minus sign in front of the number when filtering the DataFrame by Days of Month.

                          #('-6$'))                 #('-6$|-8$')) For example

#print(display) 

#display['DATE']= pd.to_datetime(display['DATE'],format='%Y-%m-%d')
#display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%m-%d-%Y')
#display=display.sort_values(by=['DATE'])
##added two lines above to convert date format
#display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%d-%b-%Y')

display['DATE']= pd.to_datetime(display['DATE'],format='%Y-%m-%d')
display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%m-%d-%Y')

with suppress(Exception):
 display.drop([303], inplace=True) 
#display.rename(index={6: '303', 20: '36'}, inplace=True)
display.rename(index={6: '19'}, inplace=True)
display=display.rename_axis('MyIdx')
display=display.sort_values(by=['DATE','MyIdx'])
display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%d-%b-%Y')
display.drop_duplicates(subset=['LOCATION', 'DATE'], keep='last', inplace=True)

#display.drop('LANCASTER', axis=1, inplace=True)
#display.drop('Date', axis=1, inplace=True)
display=display.dropna(subset=['SPITFIRE', 'HURRICANE'], how='all')
display=display[['LOCATION','DATE','DAKOTA','HURRICANE','SPITFIRE']]
display=display.fillna('--')
#display.reset_index(drop=True, inplace=True)
display.to_csv(r'C:\Users\Edward\Desktop\BBMF Schedules And Master Forum Thread Texts\BBMF-2002-Code (Dakota With Fighters).csv')

display['DATE'] = pd.to_datetime(display['DATE'])

display = display.sort_values(by='DATE', key=lambda col: 100 * col.dt.day + col.dt.month)

display['DATE']= pd.to_datetime(display['DATE']).dt.strftime('%d-%b-%Y')

display.reset_index(drop=True, inplace=True)

display

#print(display)


I want the earliest Day Rows showing descending, i.e. 10's first with the corresponding earliest Month i.e. 10th June Rows, then the 10th of July Rows, 15th of June then 15th July, if present in the DataFrame etc, that's what the line of Code with the key=lambda in is for.

So for example the Rows Output, if all these month Rows were present, they would be in the following order :-

10-Jun-2006
10-Jul-2006
20-May-2006
20-Jul-2006
20-Aug-2006


The issue I have, is that when I run my Code, no DataFrame Rows Output, only the Column Names with no Rows showing.

I think maybe what is causing the issue I am having, is that in some of the Lines of Code, 'display' should be 'F' and or vice versa, I may be wrong though.

Could someone tell me where I need to make those changes in the first Code. The aim is to get a DataFrame Output, like I aimed to get in my previous Filtering rows in Pandas dataframe considering day and month Thread, which is in this Forum section, posted a few days ago.

Any help would be much appreciated.

Best Regards

Eddie Winch

What I have tried:

The above Code, but when I run it, no DataFrame Rows Output, only the Column Names.
Posted
Updated 23-Aug-20 11:18am
v5
Comments
Richard MacCutchan 23-Aug-20 3:40am    
Please use the Improve question link above and add some actual detail of what the problem is and where it occurs.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900