I solved.
I need to convert all columns that has object to spscific types.
mport pandas as pd
import time
# ==============Time execution =================
# record start time
start = time.time()
# Define columns
colunas = [
"Tipo_Lanc","UF","Fis/Jur","CNPJ","RAZAO_SOCIAL","NF","CHAVE_NFE",
"DT_Emissão","DT_Fiscal","Data_Lan_amento","PRODUTO","DESCRICAO",
"NUM_ITEM","Unid","LISTA","NCM","Monit/Liber","EAN","MVA_Original",
"MVA","CFOP","CST","QUANTIDADE","PF_UNIT","PF_TOTAL","VLR_LIQ_UNIT",
"VLR_LIQ_ITEM","VL_UNIT_NF","TOTAL_NF","VL_UNIT_LIQ_NF","TOTAL_LIQ_NF",
"DESC._TOT","DESCONTO","REPASSE","VC","BC","ICMS","BC_N_Escriturado",
"ICMS_N_Escriturado","ALIQ_ICMS","APROPRIA","BC_ICMS_ST","ICMS_ST",
"ALIQ_INTERNA","DEB_ICMS","IPI","CAT_ANVISA","TIPO_PRODUTO",
"TIPO_DESCONTO","%_DESCONTO","PMC","PMC_FCIA_POP","%_REDUTOR_ANVISA",
"CROSS","INDICADOR_ICMSS","BC_ICMS_ST_REC","ICMS_ST_REC","MCANCER","CBASICA","-no-"]
# Define columns to convert to numeric
numeric_columns = ["MVA_Original","MVA","CFOP","CST","QUANTIDADE","PF_UNIT",
"PF_TOTAL","VLR_LIQ_UNIT","VLR_LIQ_ITEM","VL_UNIT_NF",
"TOTAL_NF","VL_UNIT_LIQ_NF","TOTAL_LIQ_NF","DESC._TOT",
"DESCONTO","REPASSE","VC","BC","ICMS","BC_N_Escriturado",
"ICMS_N_Escriturado","ALIQ_ICMS","APROPRIA","BC_ICMS_ST",
"ICMS_ST","ALIQ_INTERNA DEB_ICMS","IPI","%_DESCONTO","PMC",
"PMC_FCIA_POP","%_REDUTOR_ANVISA","INDICADOR_ICMSS","BC_ICMS_ST_REC","ICMS_ST_REC"]
# Define columns to convert to string
string_columns = ["Tipo_Lanc", "UF", "Fis/Jur", "CNPJ",
"RAZAO_SOCIAL", "NF", "CHAVE_NFE", "PRODUTO",
"NUM_ITEM", "Unid", "LISTA", "NCM", "Monit/Liber",
"EAN", "CAT_ANVISA", "TIPO_PRODUTO", "TIPO_DESCONTO",
"CROSS", "MCANCER","CBASICA"]
# Define columns to convert to date
date_columns = ["DT_Fiscal", "DT_Emissão","Data_Lan_amento"]
def convert_brazilian_numeric(value):
"""Convert a Brazilian formatted numeric string to a float."""
if pd.isna(value):
return pd.NA
value = str(value).replace('.', '').replace(',', '.')
try:
return float(value)
except ValueError:
return pd.NA
def read_csv(file_name, chunksize):
for chunk in pd.read_csv(file_name, encoding='ISO-8859-1', sep='|', on_bad_lines='skip', skiprows=9, chunksize=chunksize, header=None):
chunk.columns = colunas
yield chunk
chunk_size = 5000
file_name = "Fat.TXT" #"10Linhas_ms_file.TXT"
master_df = pd.concat(read_csv(file_name, chunksize=chunk_size), ignore_index=True)
master_df.pop("-no-")
master_df = master_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Convert specified columns to string
for col in string_columns:
if col in master_df.columns:
master_df[col] = master_df[col].astype(str)
# Convert specified columns to numeric
for col in numeric_columns:
if col in master_df.columns:
master_df[col] = master_df[col].apply(convert_brazilian_numeric)
master_df[col] = master_df[col].fillna(0).astype(float)
# Convert specified columns to date using pd.to_datetime directly
for col in date_columns:
if col in master_df.columns:
master_df[col] = pd.to_datetime(master_df[col], format="%d/%m/%Y", errors='coerce')
# Verify the DataFrame
print(master_df.head(10)) # Print the first few rows to check the data
print(master_df.dtypes) # Check the types of columns
# Save to Excel or Partquet
#master_df.to_excel("rl_fiscal.xlsx", sheet_name="teste_fiscal", index=False)
master_df.to_parquet("fat.parquet")
# ==============Time execution =================
# record end time
end = time.time()
#print("_Total of execution...: ", (end-start) * 10**3, "ms")
print("Fim...")