![]() |
Read TXT file in Pandas and save to Parquet - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: General Coding Help (https://python-forum.io/forum-8.html) +--- Thread: Read TXT file in Pandas and save to Parquet (/thread-43214.html) |
Read TXT file in Pandas and save to Parquet - zinho - Sep-14-2024 Hi. I would like import a TXT file, change types(object to [date, str, float etc]), save data to parquet. When a use pandas for this I get error about types. This file is dirt, like skip extra header data and footer data. The columns is separeted by | (pipe), but content extra space, look example below in column-1. Col-1 | Col-2 | Col-3 | Col-4 | Col-4 B |ES|0000192806|01206820002060|BLABLABLA|0000181882| Link for the file, the file has 900k https://drive.google.com/file/d/175JjBY1xQPwoNrDlX_6icFYi1h6LiXlZ/view?usp=drive_link Thanks RE: Read TXT file in Pandas and save to Parquet - Pedroski55 - Sep-15-2024 A useful link for you. Just tell Pandas that the separator is | (default separator is: , ) import pandas as pd path2csv = 'csv/csv_files/pipe_separated.csv' df = pd.read_csv(path2csv, sep="|") df Oh, I just realised, you want to tidy up the column names:# Rename columns for name in df.columns: df.rename(columns={name: name.strip()}, inplace=True) print(df["Col-1"]) RE: Read TXT file in Pandas and save to Parquet - zinho - Sep-15-2024 I solved. I need to convert all columns that has object to spscific types. mport pandas as pd import time # ==============Time execution ================= # record start time start = time.time() # Define columns colunas = [ "Tipo_Lanc","UF","Fis/Jur","CNPJ","RAZAO_SOCIAL","NF","CHAVE_NFE", "DT_Emissão","DT_Fiscal","Data_Lan_amento","PRODUTO","DESCRICAO", "NUM_ITEM","Unid","LISTA","NCM","Monit/Liber","EAN","MVA_Original", "MVA","CFOP","CST","QUANTIDADE","PF_UNIT","PF_TOTAL","VLR_LIQ_UNIT", "VLR_LIQ_ITEM","VL_UNIT_NF","TOTAL_NF","VL_UNIT_LIQ_NF","TOTAL_LIQ_NF", "DESC._TOT","DESCONTO","REPASSE","VC","BC","ICMS","BC_N_Escriturado", "ICMS_N_Escriturado","ALIQ_ICMS","APROPRIA","BC_ICMS_ST","ICMS_ST", "ALIQ_INTERNA","DEB_ICMS","IPI","CAT_ANVISA","TIPO_PRODUTO", "TIPO_DESCONTO","%_DESCONTO","PMC","PMC_FCIA_POP","%_REDUTOR_ANVISA", "CROSS","INDICADOR_ICMSS","BC_ICMS_ST_REC","ICMS_ST_REC","MCANCER","CBASICA","-no-"] # Define columns to convert to numeric numeric_columns = ["MVA_Original","MVA","CFOP","CST","QUANTIDADE","PF_UNIT", "PF_TOTAL","VLR_LIQ_UNIT","VLR_LIQ_ITEM","VL_UNIT_NF", "TOTAL_NF","VL_UNIT_LIQ_NF","TOTAL_LIQ_NF","DESC._TOT", "DESCONTO","REPASSE","VC","BC","ICMS","BC_N_Escriturado", "ICMS_N_Escriturado","ALIQ_ICMS","APROPRIA","BC_ICMS_ST", "ICMS_ST","ALIQ_INTERNA DEB_ICMS","IPI","%_DESCONTO","PMC", "PMC_FCIA_POP","%_REDUTOR_ANVISA","INDICADOR_ICMSS","BC_ICMS_ST_REC","ICMS_ST_REC"] # Define columns to convert to string string_columns = ["Tipo_Lanc", "UF", "Fis/Jur", "CNPJ", "RAZAO_SOCIAL", "NF", "CHAVE_NFE", "PRODUTO", "NUM_ITEM", "Unid", "LISTA", "NCM", "Monit/Liber", "EAN", "CAT_ANVISA", "TIPO_PRODUTO", "TIPO_DESCONTO", "CROSS", "MCANCER","CBASICA"] # Define columns to convert to date date_columns = ["DT_Fiscal", "DT_Emissão","Data_Lan_amento"] def convert_brazilian_numeric(value): """Convert a Brazilian formatted numeric string to a float.""" if pd.isna(value): return pd.NA value = str(value).replace('.', '').replace(',', '.') try: return float(value) except ValueError: return pd.NA def read_csv(file_name, chunksize): for chunk in pd.read_csv(file_name, encoding='ISO-8859-1', sep='|', on_bad_lines='skip', skiprows=9, chunksize=chunksize, header=None): chunk.columns = colunas yield chunk chunk_size = 5000 file_name = "Fat.TXT" #"10Linhas_ms_file.TXT" master_df = pd.concat(read_csv(file_name, chunksize=chunk_size), ignore_index=True) master_df.pop("-no-") master_df = master_df.applymap(lambda x: x.strip() if isinstance(x, str) else x) # Convert specified columns to string for col in string_columns: if col in master_df.columns: master_df[col] = master_df[col].astype(str) # Convert specified columns to numeric for col in numeric_columns: if col in master_df.columns: master_df[col] = master_df[col].apply(convert_brazilian_numeric) master_df[col] = master_df[col].fillna(0).astype(float) # Convert specified columns to date using pd.to_datetime directly for col in date_columns: if col in master_df.columns: master_df[col] = pd.to_datetime(master_df[col], format="%d/%m/%Y", errors='coerce') # Verify the DataFrame print(master_df.head(10)) # Print the first few rows to check the data print(master_df.dtypes) # Check the types of columns # Save to Excel or Partquet #master_df.to_excel("rl_fiscal.xlsx", sheet_name="teste_fiscal", index=False) master_df.to_parquet("fat.parquet") # ==============Time execution ================= # record end time end = time.time() #print("_Total of execution...: ", (end-start) * 10**3, "ms") print("Fim...") |