Mar-22-2021, 10:10 AM
The problem is that when I scraping data from multiple .xlsm file,in ("DataSource": fn) it is taking the whole path of the file location "C:\Users\ShantanuGupta\Desktop\Test Python\202009 - September - Diamond Plod Day & Night MKY025.xlsm". But I wanted only the last portion "202009 - September - Diamond Plod Day & Night MKY025.xlsm" not the whole file location. Also attach picture.I have mentioned in the code with comment where the problem.
Any help????
![[Image: view?usp=sharing]](https://drive.google.com/file/d/1eRpqlHeJagq-_2ss6xNsDQ3lAxCZNY62/view?usp=sharing)
Image
Any help????
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
from pathlib import Path import time import parser import argparse import pandas as pd import numpy as np import warnings warnings.filterwarnings( "ignore" ) parser = argparse.ArgumentParser(description = "Process some integers." ) parser.add_argument( "path" , help = "define the directory to folder/file" ) parser.add_argument( "--verbose" , help = "display processing information" ) start = time.time() def main(path_xlsm, verbose): if ( ".xlsm" in str (path_xlsm).lower()) and path_xlsm.is_file(): xlsm_files = [Path(path_xlsm)] else : xlsm_files = list (Path(path_xlsm).glob( "*.xlsm" )) df = pd.DataFrame() for fn in xlsm_files: all_dfs = pd.read_excel(fn, sheet_name = None , header = None , engine = "openpyxl" ) list_data = all_dfs.keys() all_dfs.pop( "Date" , None ) all_dfs.pop( "Ops Report" , None ) all_dfs.pop( "Fuel Report" , None ) all_dfs.pop( "Bit Report" , None ) all_dfs.pop( "Plod Example" , None ) all_dfs.pop( "Plod Definitions" , None ) all_dfs.pop( "Consumables" , None ) all_dfs.pop( "Tables" , None ) for ws in list_data: # Looping for excel sheet df1 = all_dfs[ws] df2 = pd.DataFrame() if df1.iloc[ 41 , 1 ] = = "Drillers Comments" : row = 42 elif df1.iloc[ 44 , 1 ] = = "Drillers Comments" : row = 45 new_row = { "PlodDate" : df1.iloc[ 4 , 3 ], "PlodShift" : df1.iloc[ 5 , 3 ], "RigNo" : df1.iloc[ 2 , 9 ], "Location" : df1.iloc[ 3 , 3 ], "DrillersComments" : df1.iloc[row, 1 ], "GeologistComments" : df1.iloc[row, 14 ], "MaintenanceComments" : df1.iloc[row, 26 ], "TravelInName" : df1.iloc[ 2 , 36 ], "TravelInHours" : df1.iloc[ 2 , 45 ], "TravelOutName" : df1.iloc[ 3 , 36 ], "TravelOutHours" : df1.iloc[ 3 , 45 ], "DataSource" : fn, #Problem Problem Problem } df2 = df2.append(new_row, ignore_index = True ) df2 = df2[ [ "PlodDate" , "PlodShift" , "RigNo" , "Location" , "DrillersComments" , "GeologistComments" , "MaintenanceComments" , "TravelInName" , "TravelInHours" , "TravelOutName" , "TravelOutHours" , "DataSource" , ] ] cols = [ "Location" , "DrillersComments" , "GeologistComments" , "MaintenanceComments" , "TravelInName" , "TravelInHours" , "TravelOutName" , "TravelOutHours" , ] df2[cols].replace("", np.nan, inplace = True ) df2 = df2.dropna(subset = cols, how = "all" ) df2 = df2.replace( "," , ";" , regex = True ) df2 = df2.replace( "\n" , " " , regex = True ) df2 = df2.replace( "\r" , " " , regex = True ) df2 = df2.replace( "\t" , " " , regex = True ) df = df.append(df2) df.to_csv( "McKayPlod-1.csv" , index = False ) if __name__ = = "__main__" : start = time.time() args = parser.parse_args() path = Path(args.path) verbose = args.verbose main(path, verbose) #Calling Main Function print ( "Processed time:" , time.time() - start) #Total Time |