The main issue with the provided output CSV file is that the CREDIT and DEBIT columns are not being populated correctly. The values in these columns should reflect the actual credit and debit amounts for each transaction, but the current output shows all zeros.
This indicates that the script is not able to correctly identify and assign the credit and debit values based on the information in the "Particulars" (Transaction Reference) column of the bank statement.
"""Convert a PDF bank statement into a CSV of dated credit/debit rows."""

import os
import re
from datetime import datetime

import pandas as pd

# PyPDF2 is only needed for reading the PDF itself. Importing it defensively
# keeps the parsing helpers usable without it and lets the script report the
# missing dependency instead of dying before any message can be shown.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

print("Script started")

# One transaction row: date (dd-mm-yy), free-text particulars, then a credit
# and a debit cell. Each amount cell is either '-' or a number with two
# decimals; thousands separators (e.g. "1,250.00", "1,00,000.00") are allowed.
_AMOUNT = r'\d[\d,]*\.\d{2}|-'
_TRANSACTION_RE = re.compile(
    r'(\d{2}-\d{2}-\d{2})\s+(.*?)\s+(' + _AMOUNT + r')\s+(' + _AMOUNT + r')'
)


def _parse_amount(raw):
    """Convert an amount cell to a float; '-' means no amount (0.0)."""
    if raw == '-':
        return 0.0
    # Drop thousands separators before converting.
    return float(raw.replace(',', ''))


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in *pdf_path*.

    Returns None (after printing the reason) if the file cannot be read or
    PyPDF2 is not installed.
    """
    print(f"Attempting to read PDF: {pdf_path}")
    if PyPDF2 is None:
        print("Error reading PDF: the PyPDF2 package is not installed")
        return None
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ''` guards against a page that yields no text.
            text = ''.join(page.extract_text() or '' for page in reader.pages)
        print(f"Successfully extracted {len(text)} characters from PDF")
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None


def extract_transaction_data(pdf_content):
    """Parse statement text into a DataFrame of transactions.

    Args:
        pdf_content: Raw text extracted from the statement.

    Returns:
        A DataFrame with columns Date, Particulars, CREDIT and DEBIT, or
        None when *pdf_content* is empty.
    """
    print("Extracting transaction data...")
    if not pdf_content:
        print("No PDF content to process")
        return None

    matches = _TRANSACTION_RE.findall(pdf_content)
    print(f"Found {len(matches)} transaction entries")

    data = []
    for date, particulars, raw_credit, raw_debit in matches:
        credit = _parse_amount(raw_credit)
        debit = _parse_amount(raw_debit)

        # The reference prefix decides which side the row belongs to: a UPI
        # debit never carries a credit, and a CR_DR row never carries a debit.
        if particulars.startswith('UPI/DR/'):
            credit = 0.0
        elif particulars.startswith('CR_DR'):
            debit = 0.0

        data.append([date, particulars, credit, debit])

    df = pd.DataFrame(data, columns=['Date', 'Particulars', 'CREDIT', 'DEBIT'])
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%y')
    print(f"Created DataFrame with {len(df)} rows")
    return df


def process_bank_statement(input_path):
    """Read the PDF at *input_path* and write the parsed rows as a CSV.

    The CSV is saved next to the input file as processed_bank_statement.csv.
    """
    print(f"Processing PDF: {input_path}")
    pdf_content = extract_text_from_pdf(input_path)
    if not pdf_content:
        return

    # Show the start of the raw text to help diagnose pattern mismatches.
    print("First 500 characters of PDF content:")
    print(pdf_content[:500])

    df = extract_transaction_data(pdf_content)
    if df is None or df.empty:
        print("No transaction data found in the PDF.")
        return

    output_folder = os.path.dirname(input_path)
    output_path = os.path.join(output_folder, 'processed_bank_statement.csv')
    df.to_csv(output_path, index=False)
    print(f"Processed data has been saved to: {output_path}")
    print("\nFirst few rows of the processed data:")
    print(df.head().to_string(index=False))


def main():
    """Locate the statement PDF (default path or user prompt) and process it."""
    print("Bank Statement Processor")
    print("========================")

    default_path = r"C:\Users\csc\Desktop\Ashok SBI\apr23.pdf"
    if os.path.exists(default_path):
        print(f"Found default PDF file: {default_path}")
        input_path = default_path
    else:
        print("Default PDF file not found.")
        # Pasted paths often carry surrounding whitespace and/or quotes.
        input_path = input("Enter the full path to your PDF file: ").strip().strip('"')

    if not os.path.exists(input_path):
        print(f"Error: File not found - {input_path}")
        input("Press Enter to exit...")
        return

    process_bank_statement(input_path)
    print("Processing complete.")
    input("Press Enter to exit...")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: report the failure and keep the console open.
        print(f"An error occurred: {e}")
        import traceback
        print(traceback.format_exc())
        input("Press Enter to exit...")

print("Script ended")
buran wrote Sep-10-2024, 06:41 AM:
New thread merged into old thread
Attached Files

