(Aug-05-2024, 07:52 AM)Gromila131 Wrote: (Aug-04-2024, 09:15 AM)Pedroski55 Wrote: This is a tricky one! Please let me know if you figure it out!
I thought maybe the table lines confused the text reader, so I got each row from the table, but that did not help!
I thought ftfy might be the answer, but the text comes out the same:
import pymupdf
from pathlib import Path
import ftfy
path2pdf = '/home/pedro/Downloads/a.pdf'
path2text = '/home/pedro/Downloads/a.pdf.txt'  # NOTE: not used by the code below

# For every page: take the first detected table, print each row, replace
# blank cells with the placeholder 'empty', flatten the rows into a single
# string and run it through ftfy.
# The context manager closes the document even if extraction fails.
with pymupdf.open(path2pdf) as doc:
    for page in doc:
        tabs = page.find_tables()
        if not tabs.tables:
            # No table was detected on this page; tabs[0] would raise IndexError.
            continue
        # there is only 1 table
        tab = tabs[0]
        data = []
        for line in tab.extract(encoding=pymupdf.TEXT_ENCODING_CYRILLIC):
            # print cell text for each row (before blank cells are patched)
            print(f'This line has {len(line)} cells')
            print(line)
            # Both '' and None mark a blank cell; str.join below needs strings.
            data.append([cell if cell else 'empty' for cell in line])
        row_strings = [''.join(row) for row in data]
        text = ''.join(row_strings)
        # Holds the ftfy-processed text of the page handled most recently.
        page_text = ftfy.fix_text(text)
I also tried an online OCR but that came back with gibberish too!
Since the pdf displays correctly, the correct information must be in there!
For now I convert the file to a readable format with online services: PDF → DOCX → TXT.
In that case the text comes out line by line and is readable.
But I haven't managed to build such a converter in Python.
I didn't tell you why I'm doing this.
I'm currently creating a Python program that will analyze apartment sales. In my country, developers post a project declaration on the website every month, which indicates the number of apartments sold (it's in PDF format).
I need to take the information from there and put it into an Excel table. Maybe I can take it directly from the PDF file? For now, I can only extract it from the TXT version. Here's my code:
import openpyxl #работа эксель
from openpyxl import Workbook, load_workbook
import numpy as np
import pandas as pd
import openpyxl as xl
import re
from openpyxl.styles import Alignment, Font
import sys
def _line_after(lines, marker, offset):
    """Return the line ``offset`` positions below the last line containing ``marker``.

    Args:
        lines: All lines of the declaration text, as returned by ``readlines()``.
        marker: Substring that identifies the anchor line.
        offset: Number of lines between the anchor line and the wanted line.

    Raises:
        ValueError: If no line contains ``marker`` or the wanted line lies
            past the end of the file.
    """
    anchor = None
    for index, line in enumerate(lines):
        if marker in line:
            anchor = index  # keep scanning: the last occurrence wins
    if anchor is None:
        raise ValueError(f"marker {marker!r} not found in the declaration text")
    target = anchor + offset
    if target >= len(lines):
        raise ValueError(
            f"expected a line {offset} below marker {marker!r}, "
            "but the file ends before it"
        )
    return lines[target]


def main(pd_txt='obj59216-pd30-000303 (2).txt', existing_file='excel1.xlsx',
         encoding=None):
    """Extract sales figures from a declaration text file and append them to Excel.

    The text file is read once. Each value is taken from a line located at a
    fixed distance below a marker line, printed, and finally appended as one
    row to the worksheet of ``existing_file`` whose title equals the
    declaration number (characters 2-10 of the line following the
    'ПРОЕКТНАЯ ДЕКЛАРАЦИЯ' heading). Values are stored exactly as sliced from
    the file, so whole-line values keep their trailing newline.

    Args:
        pd_txt: Path of the text version of the project declaration.
        existing_file: Path of the workbook to update; it is saved in place.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Raises:
        ValueError: If a marker or the line expected below it is missing, or
            the sold-area line has no ': ' separator.
        KeyError: Raised by openpyxl when the workbook has no sheet named
            after the declaration number.
    """
    with open(pd_txt, encoding=encoding) as text_file:
        lines = text_file.readlines()

    # Residential complex name
    name_of_the_building = _line_after(lines, '10.6.1', 2)
    print("Название ЖК:")
    print(name_of_the_building)

    # Completion deadline
    deadline = _line_after(lines, '17.1 (5)', 9)
    print("Срок сдачи:")
    print(deadline)

    # Total number of apartments
    total_apartments = _line_after(lines, 'Количество жилых помещений:', 1)
    print("Квартир всего:")
    print(total_apartments)

    # Apartments sold
    sold_apartments = _line_after(lines, '19.7.1.1.1.1', 2)
    print("Квартир продано:")
    print(sold_apartments)

    # Square metres sold: the value follows the first ': ' on the line and
    # its last three characters are dropped.
    sold_meters_parts = _line_after(lines, '19.7.2.1.1.1', 1).split(": ")
    if len(sold_meters_parts) < 2:
        raise ValueError("sold-area line has no ': ' separator")
    sold_meters = sold_meters_parts[1][:-3]
    print("Продано М2:")
    print(sold_meters)

    # Money received (RUB): the last five characters of the line are dropped.
    money_received = _line_after(lines, '19.7.3.1.1.1', 2)[:-5]
    print("Заработано ₽:")
    print(money_received)

    # The declaration date and number both come from the line following the
    # 'ПРОЕКТНАЯ ДЕКЛАРАЦИЯ' heading.
    declaration_line = _line_after(lines, 'ПРОЕКТНАЯ ДЕКЛАРАЦИЯ', 1)

    # Declaration upload date: last eight characters of that line.
    project_declaration_date = declaration_line[-8:]
    print("Дата загрузки ПД")
    print(project_declaration_date)

    # Declaration number: characters 2-10 of that line.
    project_declaration_num = declaration_line[2:11]
    print("Номер ПД")
    print(project_declaration_num)

    # Append the row to the sheet named after the declaration number.
    new_row = [
        project_declaration_num,
        project_declaration_date,
        name_of_the_building,
        deadline,
        total_apartments,
        sold_apartments,
        sold_meters,
        money_received,
    ]
    wb = load_workbook(existing_file)
    ws = wb[str(project_declaration_num)]
    ws.append(new_row)
    wb.save(existing_file)


if __name__ == "__main__":
    main()
Maybe someone will find it useful.