If your input is in csv-format, you could use the csv module.
The main point about grouping is that you have to sort the data by the grouping key first.
The functions
sorted
and
itertools.groupby
both accept a key function, which is called on every element to compute the value that is used for sorting or grouping.
A useful helper is
operator.itemgetter
which builds such a key function: it fetches one or more items from each element by index or by key.
from operator import itemgetter

# Each tuple is (first, second). itemgetter(1) builds the key function
# that picks the second item of every tuple.
data = [(1, 10), (2, 0), (3, -10), (4, -20)]
by_second = itemgetter(1)
print(sorted(data, key=by_second))  # sort by second item
Output:
[(4, -20), (3, -10), (2, 0), (1, 10)]
Now let's do it with data for grouping:
from operator import itemgetter
from itertools import groupby

data = [
    (10, 20, 'foo'),
    (10, 30, 'bar'),
    (10, 10, 'fizz'),
    (1, 3, 'bar'),
    (1, 3, 'foo'),
]

# Key function returning the third field of a tuple.
getter = itemgetter(2)

# Sort a copy in place so that `data` keeps its original order.
sorted_list = list(data)
sorted_list.sort(key=getter)
print(sorted_list)
Output:
[(10, 30, 'bar'), (1, 3, 'bar'), (10, 10, 'fizz'), (10, 20, 'foo'), (1, 3, 'foo')]
Next step is the grouping:
from operator import itemgetter
from itertools import groupby

data = [
    (10, 20, 'foo'),
    (10, 30, 'bar'),
    (10, 10, 'fizz'),
    (1, 3, 'bar'),
    (1, 3, 'foo'),
]

getter = itemgetter(2)  # getter for third field

# Sort by the same key that is used for grouping: groupby only merges
# consecutive elements whose keys are equal.
sorted_list = sorted(data, key=getter)
grouped = groupby(sorted_list, getter)

# Each iteration yields the key and an iterator over that group's elements.
for group, elements in grouped:
    print('Group ->', group)
    for element in elements:
        print(element)
Output:
Group -> bar
(10, 30, 'bar')
(1, 3, 'bar')
Group -> fizz
(10, 10, 'fizz')
Group -> foo
(10, 20, 'foo')
(1, 3, 'foo')
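If you don't sort first, the following happens. groupby starts a new group every time the key changes, so keys that are not adjacent in the input ('foo' and 'bar' here) show up as separate groups more than once: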
from operator import itemgetter
from itertools import groupby

data = [
    (10, 20, 'foo'),
    (10, 30, 'bar'),
    (10, 10, 'fizz'),
    (1, 3, 'bar'),
    (1, 3, 'foo'),
]

getter = itemgetter(2)  # getter for third field

# Deliberately NOT sorted: groupby starts a new group whenever the key
# changes, so non-adjacent equal keys end up in separate groups.
grouped = groupby(data, getter)

for group, elements in grouped:
    print('Group ->', group)
    for element in elements:
        print(element)
Output:
Group -> foo
(10, 20, 'foo')
Group -> bar
(10, 30, 'bar')
Group -> fizz
(10, 10, 'fizz')
Group -> bar
(1, 3, 'bar')
Group -> foo
(1, 3, 'foo')
Now the whole implementation with some tricks:
#!/usr/bin/env python3
"""
Group a csv file by a given field.
"""
import sys
from argparse import ArgumentParser
from itertools import groupby
from operator import itemgetter
from pathlib import Path
from csv import reader, writer
def group_csv_reader(file, fields):
    """Read a CSV file and return its rows grouped by the given fields.

    The header is validated and the whole file is read and sorted as soon
    as this function is called (not lazily on first iteration), so a bad
    field name is reported right at the call site.

    Args:
        file: A ``pathlib.Path``-like object (anything with ``.open()``)
            pointing to a CSV file whose first row is the header.
        fields: Sequence of one or more header names to group by.

    Returns:
        An iterator that yields the header row first and then every data
        row, ordered so that rows with equal values in ``fields`` are
        adjacent. Values are the strings produced by ``csv.reader``, so
        the ordering is lexicographic. The sort is stable: rows within one
        group keep their order from the file.

    Raises:
        ValueError: If a requested field is not in the header (this also
            covers an empty file, which has no header at all).
    """
    # newline='' is what the csv module asks for, so that newlines embedded
    # in quoted fields are handed to the parser untouched.
    with file.open(newline='') as fd:
        csv = reader(fd)
        # An empty file has no header row; treating it as an empty header
        # lets the field check below report it as a ValueError.
        header = next(csv, [])
        for field in fields:
            if field not in header:
                raise ValueError(f'Only following fields are valid to group: {header}')
        indices = [header.index(field) for field in fields]
        # the itemgetter can get one or more items from a sequence or mapping
        getter = itemgetter(*indices)
        # sorted() consumes the whole file, so afterwards all rows are in
        # memory. Blank lines come out of csv.reader as empty lists; they
        # carry no data and would break the key lookup, so skip them.
        rows = sorted((row for row in csv if row), key=getter)
    # The file is closed here; `rows` still holds the sorted content.
    return _iter_grouped(header, rows, getter)


def _iter_grouped(header, rows, getter):
    """Yield the header, then the pre-sorted rows group by group."""
    yield header
    # groupby itself does not sort, it only bundles consecutive rows that
    # share a key -- which is why `rows` must already be sorted by `getter`.
    for _group, members in groupby(rows, key=getter):
        yield from members
def main():
    """Parse the command line, group the CSV file and write the result.

    The grouped rows go to the file given with ``-o`` or, without it, to
    standard output.

    Returns:
        0 on success, 1 if a requested field does not exist in the CSV
        header (the message is printed to stderr).
    """
    parser = ArgumentParser(description=__doc__)
    # type=Path: argparse converts the argument into a Path object
    parser.add_argument('file', type=Path, help='CSV input file to process')
    # nargs='+': one or more field names
    parser.add_argument('fields', nargs='+', help='Fieldname(s) to group by')
    # args.o is a Path if -o was given, otherwise None
    parser.add_argument('-o', type=Path, help='Output to a file')
    args = parser.parse_args()

    # Catch the error raised when a given field does not exist. The field
    # check may only run once the first row is requested (a generator body
    # does not start executing at call time), so fetch the header inside
    # the try block as well. Doing this before the output is opened also
    # keeps an existing output file from being truncated on bad input.
    try:
        rows = iter(group_csv_reader(args.file, args.fields))
        header = next(rows, None)
    except ValueError as e:
        print(e, file=sys.stderr)
        return 1

    # newline='' is what the csv module asks for on files it writes to,
    # because the writer emits its own line terminators.
    fout = args.o.open('w', newline='') if args.o else sys.stdout
    try:
        csv_writer = writer(fout)
        if header is not None:
            csv_writer.writerow(header)
        csv_writer.writerows(rows)
    finally:
        # Close only what was opened here; leave sys.stdout alone.
        if fout is not sys.stdout:
            fout.close()
    return 0


if __name__ == '__main__':
    sys.exit(main())