Hiya!
My German visitors left on Sunday, so I finally had time to look at your Excel properly.
I extracted the data to another Excel file with 3 sheets: deposit, withdraw and order.
I only used openpyxl, I am not very familiar with pandas.
Anyway, this may give you some ideas. Just change the paths for the sourceFile and the targetFile, then you can try myApp() in your IDE.
I use Python's Idle shell. This worked for me, but I would like to try it with a longer sourceFile, see it there are any problems.
I don't like the idea of adding all the comments, so I just overwrote with the comment from the latest row.
def myApp():
import openpyxl
# the 5 imports below can help you format cells in openpyxl for this I only need get_column_letter
# check the documentation online
#from openpyxl.styles import PatternFill
#from openpyxl.styles import Font
from openpyxl.utils import get_column_letter
#from openpyxl.styles import Alignment
#from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder
# set the paths
# openpyxl.load_workbook(filename = path2file, data_only=True)
path2file = '/home/pedro/myPython/openpyxl/xlsx_files/Beispieltabelle.xlsx'
savepath = '/home/pedro/myPython/openpyxl/xlsx_files/condensed_Beispieltabelle.xlsx'
dtypes = ['deposit', 'withdraw', 'order']
for d in dtypes:
print('These are the data types we are looking at:', d)
# setColumn_width(afile, asheet, path)
# setColumn_width(targetFile, dtype, savepath)
# a function to set the sizes the cells of your worksheet to the length of the max cell size + 6
# asheet = your current worksheet
# dims is a dictionary
# do this last when everything is done
def setColumn_width(afile, asheet, path):
print(f'Setting the column widths for: {path} and sheet: {asheet} ... ')
# set the active sheet, in case there is more than 1 sheet
afile.active = afile[asheet]
dims = {}
for row in afile.active.rows:
for cell in row:
if cell.value:
# dims.get(cell.column_letter, 0) returns 0 if there is nothing in cell.column_letter
dims[cell.column_letter] = max((dims.get(cell.column_letter, 0), len(str(cell.value))))
# in case of empty cells
else:
dims[cell.column_letter] = 6
# set the width a little wider than the content
for col, value in dims.items():
afile.active.column_dimensions[col].width = value + 6
afile.save(path)
# the function criterion1(rowNum) is True if column R, i.e. column 18 is False
# some problem with openpyxl and the Excel formula =FALSE() workaround by using the string '=FALSE()'
# using: sourceFile = openpyxl.load_workbook(filename = path2file, data_only=True) didn't seem to help
# first row is headers so start at 2
# maxRow + 1 or you won't get the last row
def criterion1(rowNum, type_wanted):
if sourceFile[mysheet].cell(row=rowNum, column=2).value == type_wanted and sourceFile[mysheet].cell(row=rowNum, column=18).value == '=FALSE()':
return True
# get the source XL
sourceFile = openpyxl.load_workbook(path2file)
#sourceFile = openpyxl.load_workbook(filename = path2file, data_only=True)
sourceFilesheets = sourceFile.sheetnames
# show the available sheets
for sheet in sourceFilesheets:
print('These are the sheets:', sheet)
# get the sheet you want to process
mysheet = input('What sheet do you want to process? Copy and paste the sheet name ... ')
# set the active sheet, in case there is more than 1 sheet
sourceFile.active = sourceFile[mysheet]
#ss = sourceFile.active
# get the maximum row and column numbers for later loops
smaxRow = sourceFile[mysheet].max_row
smaxCol = sourceFile[mysheet].max_column
print(f'Sheet {mysheet} has {smaxRow} rows and {smaxCol} columns')
# make a file to take the data you extract
# targetFile = openpyxl.load_workbook(savepath)
targetFile = openpyxl.Workbook()
# make a sheet for each dtype
for dt in dtypes:
targetFile.create_sheet(dt)
# a new workbook starts with a sheet called Sheet, remove it
targetFile.remove(targetFile['Sheet'])
targetFilesheets = targetFile.sheetnames
# just checking that everything is OK
for t in targetFile.sheetnames:
print('The target file has the following sheets:', t)
# make a dictionary to take the column numbers and names
# this may be useful later to remind yourself of what is happening
column_numbers_names = {dtype: [] for dtype in dtypes}
# need a loop here to go through the dtypes and put the relevant headers in the correct targetFile sheets
for dtype in dtypes:
count = 0
for rowNum in range(2, smaxRow + 1):
if sourceFile[mysheet].cell(row=rowNum, column=2).value == dtype:
count +=1
print(f'The type {dtype} is present {count} times in {path2file}')
# get a list of tuples of format (column number, column name) from the sourceFile
# need an if clause here for withdrawals and orders: later, do deposits first
if dtype == 'deposit':
for colNum in [1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17]:
# write the column headers to the target file
data1 = sourceFile[mysheet].cell(row=1, column=colNum).value
# save the column_numbers_names for later
column_numbers_names[dtype].append((colNum, data1))
# put the headers in the target file
if colNum > 5:
targetFile[dtype].cell(row=1, column=colNum - 6).value=data1
else:
targetFile[dtype].cell(row=1, column=colNum).value=data1
elif dtype == 'withdraw':
for colNum in [1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]:
# write the column headers to the target file
data1 = sourceFile[mysheet].cell(row=1, column=colNum).value
# save the column_numbers_names for later
column_numbers_names[dtype].append((colNum, data1))
# put the headers in the target file
if colNum > 2:
targetFile[dtype].cell(row=1, column=colNum - 3).value=data1
else:
targetFile[dtype].cell(row=1, column=colNum).value=data1
elif dtype == 'order':
for colNum in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]:
# write the column headers to the target file
data1 = sourceFile[mysheet].cell(row=1, column=colNum).value
# save the column_numbers_names for later
column_numbers_names[dtype].append((colNum, data1))
# put the headers in the target file
if colNum > 11:
targetFile[dtype].cell(row=1, column=colNum - 1).value=data1
else:
targetFile[dtype].cell(row=1, column=colNum).value=data1
# have a look at column_numbers_names any time you need to
# just run showColumn_data()
def showColumn_data(sheet):
print(f'These are the column numbers and headers from the source file copied to the target file sheet: {sheet}')
for tup in column_numbers_names[sheet]:
print(tup)
# remind yourself of the column numbers and names
for dt in dtypes:
showColumn_data(dt)
# save the basic file and have a look
targetFile.save(savepath)
# for each type in dtypesget a list of unique patterns
# boughtCurrency, boughtCurrencyId, or soldCurrency, soldCurrencyId, feeCurrency, feeCurrencyId, classification, location, area as lists
type_dict = {dt: [] for dt in dtypes}
for dt in dtypes:
# column 4 = boughtCurrency, col 5 = boughtCurrencyId, col 12 = classification, col 13 = location, col 14 = area
# sourceFile columns 3, 6 and 9 are the numerical values we are interested in. They should be added later
if dt == 'deposit':
for rowNum in range(2, smaxRow + 1):
# if column 18, isIgnored is not FALSE go to the next row
if not criterion1(rowNum, dt):
continue
mylist = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [4, 5, 12, 13, 14]]
# check that this list is not already in deposit_list
if not mylist in type_dict[dt]:
type_dict[dt].append(mylist)
if dt == 'withdraw':
for rowNum in range(2, smaxRow + 1):
# if column 18, isIgnored is not FALSE go to the next row
if not criterion1(rowNum, dt):
continue
mylist = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [7, 8, 10, 11, 12, 13, 14]]
# check that this list is not already in deposit_list
if not mylist in type_dict[dt]:
type_dict[dt].append(mylist)
if dt == 'order':
for rowNum in range(2, smaxRow + 1):
# if column 18, isIgnored is not FALSE go to the next row
if not criterion1(rowNum, dt):
continue
mylist = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [4, 5, 7, 8, 10, 11, 13, 14]]
# check that this list is not already in deposit_list
if not mylist in type_dict[dt]:
type_dict[dt].append(mylist)
# these are the different permutations of the types deposit, withdraw, order
for key in type_dict.keys():
# get the count for each type from deposit, withdraw and order
count = 0
for rowNum in range(2, smaxRow + 1):
if sourceFile[mysheet].cell(row=rowNum, column=2).value == key:
count +=1
print(f'The type {key} is present {count} times in {path2file}')
print(f'The {key} list of lists is {len(type_dict[key])} long. If {count} > {len(type_dict[key])} some {key} have the same pattern.')
for alist in type_dict[key]:
print(alist)
"""
Using the small sample Excel file, these values were extracted for type = deposit
['Stone', 1, 'mined', 'quarry', 'Kingstown']
['Copper', 2, 'add', 'quarry', 'Kingstown']
['Iron', 3, None, 'quarry', 'Kingstown']
['Iron', 3, 'mined', 'quarry', 'Kingstown']
['Iron', 1, 'mined', 'quarry', 'Kingstown']
['Stone', 1, 'internal', 'quarry', 'Kingstown']
['Stone', 1, 'internal', 'warehouse', 'Kingstown']
['wood', 5, None, 'warehouse', 'Kingstown']
['gold', 4, 'rent_income', 'tenement houses', 'Belize']
['gold', 4, 'rent_income', 'tenement houses', 'Kingstown']
"""
# The above are the different combinations of deposit, boughtCurrency or boughtCurrencyId, classification, location and area,
# of course there may be more patterns when the XL file is much longer
# at the moment, with the example Excel I have, you need 10 amalgamated rows for deposit
# make a new XL to take the results of processing
# put the non-numerical values from deposit_list in the targetFile
# for deposit each list d is 5 long and we start at row 2 so d[rowNum -2]
# for withdraw each list d is 7 long and we start at row 2 so d[rowNum -2]
# for order each list d is 8 long and we start at row 2 so d[rowNum -2]
# columns 1,2, 9,10,11 are all variable but not numerical. The value from the last row will be inserted
# columns 3, 6, 9 are numerical and should be added
def writeRow(alist, rowNr, dt):
if dt == 'deposit':
print(f'Type is deposit. Writing these values to row {rowNr}', alist)
# these are all dt rows anyway each type on a separate sheet
targetFile[dt].cell(row=rowNr, column=2, value=dt)
# set col 3 boughtQuantity to zero to start with
targetFile[dt].cell(row=rowNr, column=3, value=0)
count = 0
for colNum in [4, 5, 6, 7, 8]:
targetFile[dt].cell(row=rowNr, column=colNum, value=alist[count])
count +=1
if dt == 'withdraw':
print(f'Type is withdraw. Writing these values to row {rowNr}', alist)
# these are all dt rows anyway each type on a separate sheet
targetFile[dt].cell(row=rowNr, column=2, value=dt)
# set targetFile col 3, soldQuantity and col 6, feeQuantity to zero to start with
targetFile[dt].cell(row=rowNr, column=3, value=0)
targetFile[dt].cell(row=rowNr, column=6, value=0)
count = 0
for colNum in [4, 5, 7, 8, 9, 10, 11]:
targetFile[dt].cell(row=rowNr, column=colNum, value=alist[count])
count +=1
if dt == 'order':
print(f'Type is order. Writing these values to row {rowNr}', alist)
# these are all dtype rows anyway each type on a separate sheet
targetFile[dt].cell(row=rowNr, column=2, value=dt)
# set targetFile col 3 boughtQuantity, col 6 soldQuantity and col 9 feeQuantity to zero to start with
targetFile[dt].cell(row=rowNr, column=3, value=0)
targetFile[dt].cell(row=rowNr, column=6, value=0)
targetFile[dt].cell(row=rowNr, column=9, value=0)
count = 0
for colNum in [4, 5, 7, 8, 10, 11, 12, 13]:
targetFile[dt].cell(row=rowNr, column=colNum, value=alist[count])
count +=1
for key in type_dict.keys():
if key == 'deposit':
# put the non-numerical values in the rows
for rowNum in range(2, len(type_dict[key]) + 2):
d = type_dict[key][rowNum-2]
writeRow(d, rowNum, key)
if key == 'withdraw':
# put the non-numerical values in the rows
for rowNum in range(2, len(type_dict[key]) + 2):
d = type_dict[key][rowNum-2]
writeRow(d, rowNum, key)
if key == 'order':
# put the non-numerical values in the rows
for rowNum in range(2, len(type_dict[key]) + 2):
d = type_dict[key][rowNum-2]
writeRow(d, rowNum, key)
# save the basic file and go have a look
targetFile.save(savepath)
# a function to find the row number of the corresponding list pattern for deposit
def findRowDeposit(alist, tsheet):
tmaxRow = targetFile[key].max_row
for rowNum in range(2, tmaxRow + 1):
mylistT = [targetFile[tsheet].cell(row=rowNum, column=t).value for t in [4, 5, 6, 7, 8]]
if mylistT == alist:
return rowNum
# a function to find the row number of the corresponding list pattern for withdraw
def findRowWithdraw(alist, tsheet):
tmaxRow = targetFile[key].max_row
for rowNum in range(2, tmaxRow + 1):
mylistT = [targetFile[tsheet].cell(row=rowNum, column=t).value for t in [4, 5, 7, 8, 9, 10, 11]]
if mylistT == alist:
return rowNum
# a function to find the row number of the corresponding list pattern for order
def findRowOrder(alist, tsheet):
tmaxRow = targetFile[key].max_row
for rowNum in range(2, tmaxRow + 1):
mylistT = [targetFile[tsheet].cell(row=rowNum, column=t).value for t in [4, 5, 7, 8, 10, 11, 12, 13]]
if mylistT == alist:
return rowNum
# test the findRow() functions above to make sure they work
# find each row number corresponding to the pattern
for key in type_dict.keys():
# test the above findRow() functions for finding the row number of each pattern
# seems to work for my short source Excel with 105 rows
for t in type_dict[key]:
if key == 'deposit':
print('row number is:', findRowDeposit(t, key))
if key == 'withdraw':
print('row number is:', findRowWithdraw(t, key))
if key == 'order':
print('row number is:', findRowOrder(t, key))
# the test the above seems to work for my short source Excel
# so now we can add the numerical values in the correct rows
# find the rows with deposit in column 2
# mylistS must correspond with 1 of the rows in targetFile[sheet]
# when we have the row number
def writeNumsDeposit(rn, alist, dt):
targetFile[dt].cell(row=rn, column=1).value = data[0] # the timestamp
bQ = targetFile[dt].cell(row=rn, column=3).value # boughtQuantitiy add
targetFile[dt].cell(row=rn, column=3).value = bQ + data[1] # boughtQuantitiy add
targetFile[dt].cell(row=rn, column=9).value = data[2] # txID overwrite
targetFile[dt].cell(row=rn, column=10).value = data[3] # id overwrite
targetFile[dt].cell(row=rn, column=11).value = data[4] # comments overwrite
print(f'data written to row number {rn} of sheet {dt}.')
def writeNumsWithdraw(rn, alist, dt):
targetFile[dt].cell(row=rn, column=1).value = data[0] # the timestamp overwrite
sQ = targetFile[dt].cell(row=rn, column=3).value # soldQuantity add
targetFile[dt].cell(row=rn, column=3).value = sQ + data[1] # soldQuantity add
fQ = targetFile[dt].cell(row=rn, column=6).value # feeQuantity add
targetFile[dt].cell(row=rn, column=6).value = fQ + data[2] # feeQuantity add
targetFile[dt].cell(row=rn, column=12).value = data[3] # txID overwrite
targetFile[dt].cell(row=rn, column=13).value = data[4] # id overwrite
targetFile[dt].cell(row=rn, column=14).value = data[5] # comments overwrite
print(f'data written to row number {rn} of sheet {dt}.')
def writeNumsOrder(rn, alist, dt):
targetFile[dt].cell(row=rn, column=1).value = data[0] # the timestamp overwrite
bQ = targetFile[dt].cell(row=rn, column=3).value # boughtQuantity add
targetFile[dt].cell(row=rn, column=3).value = bQ + data[1] # boughtQuantity add
sQ = targetFile[dt].cell(row=rn, column=6).value # soldQuantity add
targetFile[dt].cell(row=rn, column=6).value = sQ + data[2] # soldQuantity add
fQ = targetFile[dt].cell(row=rn, column=9).value # feeQuantity add
targetFile[dt].cell(row=rn, column=9).value = fQ + data[3] # feeQuantity add
targetFile[dt].cell(row=rn, column=14).value = data[4] # txID overwrite
targetFile[dt].cell(row=rn, column=15).value = data[5] # id overwrite
targetFile[dt].cell(row=rn, column=16).value = data[6] # comments overwrite
print(f'data written to row number {rn} of sheet {dt}.')
# now get each row in sourceFile and look for a pattern matching each list in type_dict
# type_dict has 3 keys: deposit, withdraw and order
for key in type_dict.keys():
if key == 'deposit':
for rowNum in range(2, smaxRow + 1):
if sourceFile[mysheet].cell(row=rowNum, column=2).value == key:
mylistS = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [4, 5, 12, 13, 14]]
# find which row of the targetFile corresponds with this sourceFile pattern
rownr = findRowDeposit(mylistS, key)
print(f'targetFile row number is {rownr}')
# col 1 is timestamp, col 3 is boughtQuantity, 15 is txId, 16 is Id, 17 is comments
# for targetFile sheet deposit only column 3 is added, other columns are overwritten
data = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [1, 3, 15, 16, 17]]
writeNumsDeposit(rownr, data, key)
if key == 'withdraw':
for rowNum in range(2, smaxRow + 1):
if sourceFile[mysheet].cell(row=rowNum, column=2).value == key:
mylistS = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [7, 8, 10, 11, 12, 13, 14]]
# find which row of the targetFile corresponds with this sourceFile pattern
rownr = findRowWithdraw(mylistS, key)
print(f'targetFile row number is {rownr}')
# col 1 is timestamp, col 3 is boughtQuantity, 15 is txId, 16 is Id, 17 is comments
# for targetFile sheet: withdraw column 3 soldQuantity and column 6 feeQuantity are added, other columns are overwritten
data = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [1, 6, 9, 15, 16, 17]]
writeNumsWithdraw(rownr, data, key)
if key == 'order':
for rowNum in range(2, smaxRow + 1):
if sourceFile[mysheet].cell(row=rowNum, column=2).value == key:
mylistS = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [4, 5, 7, 8, 10, 11, 13, 14]]
# find which row of the targetFile corresponds with this sourceFile pattern
rownr = findRowOrder(mylistS, key)
print(f'targetFile row number is {rownr}')
# col 1 is timestamp, col 3 is boughtQuantity, col 6 is soldQuantity, col 9 is feeQuantity, 15 is txId, 16 is Id, 17 is comments
# for dtype withdraw column 3 soldQuantity and column 6 feeQuantity are added, other columns are overwritten
data = [sourceFile[mysheet].cell(row=rowNum, column=t).value for t in [1, 3, 6, 9, 15, 16, 17]]
writeNumsOrder(rownr, data, key)
for dtype in dtypes:
setColumn_width(targetFile, dtype, savepath)
targetFile.save(savepath)
print('All done!')
Actually, I think you should sort the data as it arrives, not later. But I don't really know what you are doing!
Auf dass es klappt!