Extracting data from bank statement PDFs (Accountant)

a4avinash · (This post was last modified: Sep-10-2024, 06:41 AM by buran.)

The main issue with the provided output CSV file is that the CREDIT and DEBIT columns are not being populated correctly. The values in these columns should reflect the actual credit and debit amounts for each transaction, but the current output shows all zeros.
This indicates that the script is not able to correctly identify and assign the credit and debit values based on the information in the "Particulars" (Transaction Reference) column of the bank statement.

import pandas as pd
import PyPDF2
import re
import os
from datetime import datetime

# Start-of-run marker. It sits at module level (outside any function), so it
# prints whenever this file is executed or imported.
print("Script started")

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Page texts are appended back to back with no separator. Any exception
    raised while opening or reading the file is reported on stdout and
    ``None`` is returned instead of propagating the error.
    """
    print(f"Attempting to read PDF: {pdf_path}")
    try:
        with open(pdf_path, 'rb') as pdf_file:
            extracted = ''
            for pdf_page in PyPDF2.PdfReader(pdf_file).pages:
                extracted = extracted + pdf_page.extract_text()
        print(f"Successfully extracted {len(extracted)} characters from PDF")
    except Exception as err:
        # Deliberately broad: an unreadable PDF is reported, not raised.
        print(f"Error reading PDF: {err}")
        return None
    return extracted

def extract_transaction_data(pdf_content):
    """Parse transaction rows out of the raw statement text.

    A row is recognised as a ``DD-MM-YY`` date, free-form particulars, and
    then two amount fields, read as credit first and debit second. An amount
    is either a lone ``-`` (treated as 0.0) or a number with two decimals
    that may contain comma group separators, e.g. ``1,250.00`` or
    ``1,00,000.00``.

    Args:
        pdf_content: Text extracted from the statement PDF.

    Returns:
        ``None`` when ``pdf_content`` is empty or ``None``; otherwise a
        DataFrame with columns ``Date`` (parsed with ``%d-%m-%y``),
        ``Particulars``, ``CREDIT`` and ``DEBIT`` (floats), one row per
        matched entry.
    """
    print("Extracting transaction data...")
    if not pdf_content:
        print("No PDF content to process")
        return None

    # Amounts may carry thousands/lakh separators ("1,250.00", "1,00,000.00").
    # A bare \d+\.\d{2} cannot match those, which makes the whole row fail to
    # match (or match with shifted columns).
    amount = r'\d[\d,]*\.\d{2}|-'
    pattern = re.compile(
        r'(\d{2}-\d{2}-\d{2})\s+(.*?)\s+(' + amount + r')\s+(' + amount + r')'
    )
    matches = pattern.findall(pdf_content)
    print(f"Found {len(matches)} transaction entries")

    def to_amount(field):
        # '-' marks an empty column; commas are only digit grouping.
        return 0.0 if field == '-' else float(field.replace(',', ''))

    data = []
    for date, particulars, credit_text, debit_text in matches:
        credit = to_amount(credit_text)
        debit = to_amount(debit_text)

        # Force the opposite column to zero for the two recognised prefixes;
        # every other row keeps both parsed values as they are.
        # NOTE(review): 'CR_DR' is kept verbatim from the original script --
        # confirm it is the prefix the statement really uses for credits.
        if particulars.startswith('UPI/DR/'):
            credit = 0.0
        elif particulars.startswith('CR_DR'):
            debit = 0.0

        data.append([date, particulars, credit, debit])

    df = pd.DataFrame(data, columns=['Date', 'Particulars', 'CREDIT', 'DEBIT'])
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%y')
    print(f"Created DataFrame with {len(df)} rows")
    return df

def process_bank_statement(input_path):
    """Convert the statement PDF at *input_path* into a CSV beside it.

    Extracts the PDF text, echoes its first 500 characters, parses the
    transactions, and writes them to ``processed_bank_statement.csv`` in the
    same folder as the PDF. Returns early (without writing anything) when
    the text cannot be extracted or no transactions are found.
    """
    print(f"Processing PDF: {input_path}")

    raw_text = extract_text_from_pdf(input_path)
    if not raw_text:
        return

    # Preview of the raw text, useful for checking what the parser sees.
    print("First 500 characters of PDF content:")
    print(raw_text[:500])

    transactions = extract_transaction_data(raw_text)
    has_rows = transactions is not None and not transactions.empty
    if not has_rows:
        print("No transaction data found in the PDF.")
        return

    csv_path = os.path.join(
        os.path.dirname(input_path), 'processed_bank_statement.csv'
    )
    transactions.to_csv(csv_path, index=False)
    print(f"Processed data has been saved to: {csv_path}")

    print("\nFirst few rows of the processed data:")
    preview = transactions.head()
    print(preview.to_string(index=False))

def main():
    """Interactive entry point: choose a PDF and process it.

    Uses the hard-coded default path when that file exists; otherwise asks
    the user for a path (surrounding double quotes are stripped). Whether or
    not the chosen file exists, the function waits for Enter before
    returning.
    """
    print("Bank Statement Processor")
    print("========================")

    default_path = r"C:\Users\csc\Desktop\Ashok SBI\apr23.pdf"

    if not os.path.exists(default_path):
        print("Default PDF file not found.")
        input_path = input("Enter the full path to your PDF file: ").strip('"')
    else:
        print(f"Found default PDF file: {default_path}")
        input_path = default_path

    if os.path.exists(input_path):
        process_bank_statement(input_path)
        print("Processing complete.")
    else:
        print(f"Error: File not found - {input_path}")

    # Both outcomes end by holding the console open.
    input("Press Enter to exit...")

# Run the interactive entry point only when executed as a script. Any
# uncaught error is printed with its traceback and the console is held open
# until the user presses Enter.
if __name__ == "__main__":
    try:
        main()
    except Exception as err:
        import traceback

        print(f"An error occurred: {err}")
        print(traceback.format_exc())
        input("Press Enter to exit...")

# Outside the guard, so this also prints when the module is imported.
print("Script ended")

buran write Sep-10-2024, 06:41 AM:
New thread merged into old thread

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Confused by the different ways of extracting data in DataFrame	leea2024	1	765	Aug-17-2024, 01:34 PM Last Post: deanhystad
	Extracting the correct data from a CSV file	S2G	6	2,036	Jun-03-2024, 04:50 PM Last Post: snippsat
	Comparing PDFs	CaseCRS	5	3,876	Apr-01-2023, 05:46 AM Last Post: DPaul
	Extracting Data into Columns using pdfplumber	arvin	17	19,751	Dec-17-2022, 11:59 AM Last Post: arvin
	Extracting Data from tables	DataExtrator	0	1,690	Nov-02-2021, 12:24 PM Last Post: DataExtrator
	extracting data	ajitnayak1987	1	2,180	Jul-29-2021, 06:13 AM Last Post: bowlofred
	Extracting and printing data	ajitnayak1987	0	1,885	Jul-28-2021, 09:30 AM Last Post: ajitnayak1987
	Extracting unique pairs from a data set based on another value	rybina	2	3,124	Feb-12-2021, 08:36 AM Last Post: rybina
	extracting data/strings from Word doc	mikkelibsen	1	2,605	Feb-10-2021, 11:06 AM Last Post: Larz60+
	Extracting data without showing dtype, name etc.	tgottsc1	3	9,227	Jan-10-2021, 02:15 PM Last Post: buran

Extracting data from bank statement PDFs (Accountant)

User Panel Messages

Announcements