import io  # fake file
import sys
import csv
from contextlib import ExitStack
from itertools import (
    chain,
    compress,
    repeat,
)


def combine_rows(rows, masks):
    """Flatten several rows into one tuple, keeping only masked-in columns.

    ``rows`` and ``masks`` are walked in lockstep; within each pair a column
    is kept when the matching mask entry is truthy.
    """
    return tuple(
        chain.from_iterable(
            compress(row, mask) for row, mask in zip(rows, masks)
        )
    )


def automatic_mask_gen(headers):
    """Build one keep/drop mask per header so each column name is kept once.

    The first header is kept completely.  In every following header a column
    gets ``1`` the first time its name is seen and ``0`` when the name was
    already provided by an earlier column.

    Returns a list of tuples of ``0``/``1``; an empty ``headers`` gives ``[]``.
    """
    seen = set()
    masks = []
    for position, header in enumerate(headers):
        if position == 0:
            first = list(header)
            # One flag per *column* of the first header (not per character
            # of its first column name).
            masks.append(tuple(repeat(1, len(first))))
            seen.update(first)
            continue
        mask = []
        for topic in header:
            if topic in seen:
                mask.append(0)
            else:
                mask.append(1)
                seen.add(topic)
        masks.append(tuple(mask))
    return masks


def automatic_sort_gen(flat_cols):
    """Return the index permutation that puts ``flat_cols`` in sorted order.

    Sorting the positions (instead of looking names up with ``.index``) keeps
    every column exactly once even when two columns share the same name.
    """
    return tuple(sorted(range(len(flat_cols)), key=lambda idx: flat_cols[idx]))


def sort_with_mask(cols, mask):
    """Reorder ``cols`` by picking ``cols[idx]`` for each ``idx`` in ``mask``."""
    return tuple(cols[idx] for idx in mask)


def strip_whitespace(rows):
    """Return ``rows`` as a list of lists with every cell ``str.strip``-ped."""
    return [[col.strip() for col in cols] for cols in rows]


def combine_same_fields(csv_files, masks=None, delimiter=',', strip=True,
                        autosort=True):
    """Yield the rows of several csv sources merged side by side.

    The first yielded tuple is the combined header, every following tuple is
    one combined data row.  Data rows are paired with ``zip``, so iteration
    stops as soon as the shortest source is exhausted.

    Args:
        csv_files: sized collection of sources.  Already-open file objects
            (any ``io.IOBase``, e.g. ``io.StringIO``) are used as they are
            and are not closed here; everything else is passed to ``open``
            and closed when the generator finishes or is closed.
        masks: one keep/drop mask per source, or ``None`` to derive the
            masks from the headers with ``automatic_mask_gen``.
        delimiter: field delimiter handed to ``csv.reader``.
        strip: strip surrounding whitespace from every cell when true.
        autosort: order the combined columns by sorted header name when true.

    Raises:
        ValueError: if ``masks`` is given with a different length than
            ``csv_files``, or if one of the sources has no header row.
            Because this is a generator, both are raised on first iteration.
    """
    if masks is not None and len(csv_files) != len(masks):
        raise ValueError('csv_files must be the same amout as masks.')
    with ExitStack() as stack:
        fds = [
            file if isinstance(file, io.IOBase)
            # newline='' is what the csv module expects for files it reads.
            else stack.enter_context(open(file, newline=''))
            for file in csv_files
        ]
        csv_iterators = [csv.reader(fd, delimiter=delimiter) for fd in fds]
        try:
            headers = [next(iterator) for iterator in csv_iterators]
        except StopIteration:
            # Exceptions take no keyword arguments, so only the message is
            # passed; the StopIteration context adds nothing for the caller.
            raise ValueError(
                'Not able to get the header of one of the csv-files'
            ) from None
        if strip:
            headers = strip_whitespace(headers)
        if masks is None:
            masks = automatic_mask_gen(headers)
        combined_headers = combine_rows(headers, masks)
        if autosort:
            sort_mask = automatic_sort_gen(combined_headers)
            yield sort_with_mask(combined_headers, sort_mask)
        else:
            yield combined_headers
        for rows in zip(*csv_iterators):
            if strip:
                rows = strip_whitespace(rows)
            combined = combine_rows(rows, masks)
            if autosort:
                yield sort_with_mask(combined, sort_mask)
            else:
                yield combined


CSV_1 = io.StringIO("""
DataCol1, DataCol2, DataCol3, P1, P2, P3, P4, P5
data1, data2, data3, 1,2,3,4,5
""".strip())
# fake file 1 is CSV_1; fake file 2 follows
CSV_2 = io.StringIO("""
ColName1, ColName2, ColName3, P1, P2, P3, P4, P5
ea1, ea2, ea3, 3,5,6,2,1
db1, db2, db3, 1,2,3,4,5
""".strip())
# you can have more

# Example of an explicit mask for 2 csv data files: keep every column of the
# first file and only the first three columns of the second one.  The demo
# passes masks=None, so this list is shown for illustration only.
masks = [(1, 1, 1, 1, 1, 1, 1, 1), (1, 1, 1, 0, 0, 0, 0, 0)]


def main():
    """Combine the two fake csv files, store the result and print it back."""
    # this is the resulting generator object
    combined = combine_same_fields([CSV_1, CSV_2], masks=None)

    # Open an output file in write mode, create the csv.writer object and
    # let it consume the generator: every combined row it yields is written
    # as one csv row.  newline='' stops the csv module's own line endings
    # from being translated a second time by the file object.
    with open('output.csv', 'w', newline='') as fd:
        writer = csv.writer(fd, delimiter=',')
        writer.writerows(combined)
    # done

    # show stored data
    with open('output.csv', newline='') as fd:
        print(fd.read())
        fd.seek(0)  # back to start of file
        # small example with dict csv reader
        reader = csv.DictReader(fd)
        print('Fields of reader:', reader.fieldnames)
        for row in reader:
            print(row)


if __name__ == '__main__':
    main()

# Reading my own code days later
Almost dead, but too lazy to die: https://sourceserver.info
All humans together. We don't need politicians!
All humans together. We don't need politicians!