Mar-22-2021, 10:10 AM
The problem is that when I scraping data from multiple .xlsm file,in ("DataSource": fn) it is taking the whole path of the file location "C:\Users\ShantanuGupta\Desktop\Test Python\202009 - September - Diamond Plod Day & Night MKY025.xlsm". But I wanted only the last portion "202009 - September - Diamond Plod Day & Night MKY025.xlsm" not the whole file location. Also attach picture.I have mentioned in the code with comment where the problem.
Any help????
Any help????
from pathlib import Path import time import parser import argparse import pandas as pd import numpy as np import warnings warnings.filterwarnings("ignore") parser = argparse.ArgumentParser(description="Process some integers.") parser.add_argument("path", help="define the directory to folder/file") parser.add_argument("--verbose", help="display processing information") start = time.time() def main(path_xlsm, verbose): if (".xlsm" in str(path_xlsm).lower()) and path_xlsm.is_file(): xlsm_files = [Path(path_xlsm)] else: xlsm_files = list(Path(path_xlsm).glob("*.xlsm")) df = pd.DataFrame() for fn in xlsm_files: all_dfs = pd.read_excel(fn, sheet_name=None, header=None, engine="openpyxl") list_data = all_dfs.keys() all_dfs.pop("Date", None) all_dfs.pop("Ops Report", None) all_dfs.pop("Fuel Report", None) all_dfs.pop("Bit Report", None) all_dfs.pop("Plod Example", None) all_dfs.pop("Plod Definitions", None) all_dfs.pop("Consumables", None) all_dfs.pop("Tables", None) for ws in list_data: # Looping for excel sheet df1 = all_dfs[ws] df2 = pd.DataFrame() if df1.iloc[41, 1] == "Drillers Comments": row = 42 elif df1.iloc[44, 1] == "Drillers Comments": row = 45 new_row = { "PlodDate": df1.iloc[4, 3], "PlodShift": df1.iloc[5, 3], "RigNo": df1.iloc[2, 9], "Location": df1.iloc[3, 3], "DrillersComments": df1.iloc[row, 1], "GeologistComments": df1.iloc[row, 14], "MaintenanceComments": df1.iloc[row, 26], "TravelInName": df1.iloc[2, 36], "TravelInHours": df1.iloc[2, 45], "TravelOutName": df1.iloc[3, 36], "TravelOutHours": df1.iloc[3, 45], "DataSource": fn, #Problem Problem Problem } df2 = df2.append(new_row, ignore_index=True) df2 = df2[ [ "PlodDate", "PlodShift", "RigNo", "Location", "DrillersComments", "GeologistComments", "MaintenanceComments", "TravelInName", "TravelInHours", "TravelOutName", "TravelOutHours", "DataSource", ] ] cols = [ "Location", "DrillersComments", "GeologistComments", "MaintenanceComments", "TravelInName", "TravelInHours", "TravelOutName", "TravelOutHours", ] df2[cols].replace("", np.nan, inplace=True) df2 = df2.dropna(subset=cols, how="all") df2 = df2.replace(",", ";", regex=True) df2 = df2.replace("\n", " ", regex=True) df2 = df2.replace("\r", " ", regex=True) df2 = df2.replace("\t", " ", regex=True) df = df.append(df2) df.to_csv("McKayPlod-1.csv", index=False) if __name__ == "__main__": start = time.time() args = parser.parse_args() path = Path(args.path) verbose = args.verbose main(path, verbose) #Calling Main Function print("Processed time:", time.time() - start) #Total TimeImage