Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
How to match two CSV files
#9
import io # fake file
import sys
import csv
from contextlib import ExitStack
from itertools import (
    chain,
    compress,
    repeat,
)


def combine_rows(rows, masks):
    """Flatten several rows into one tuple, keeping only the masked-in cells.

    Each row is paired positionally with its mask; a cell is kept when the
    mask entry at the same position is truthy. Pairing stops at the shorter
    of ``rows`` and ``masks``, and within a pair at the shorter of the row
    and its mask.
    """
    kept = []
    for row, mask in zip(rows, masks):
        # compress() yields the row items whose selector is truthy.
        kept.extend(compress(row, mask))
    return tuple(kept)


def automatic_mask_gen(headers):
    """Build one keep/drop mask per header so each column name appears once.

    The first header is kept in full (one ``1`` per column). In every later
    header a column gets ``1`` the first time its name is seen across all
    headers and ``0`` when the name was already selected earlier.

    Args:
        headers: iterable of header rows (sequences of hashable column names).

    Returns:
        A list of tuples of 0/1, one tuple per header, each as long as its
        header. An empty ``headers`` yields an empty list.
    """
    iterator = iter(headers)
    try:
        first = list(next(iterator))
    except StopIteration:
        # No headers at all: nothing to mask.
        return []
    # The mask length must be the number of columns in the first header,
    # not the character length of its first column name.
    masks = [tuple(repeat(1, len(first)))]
    seen = set(first)
    for header in iterator:
        mask = []
        for topic in header:
            if topic in seen:
                mask.append(0)
            else:
                mask.append(1)
                seen.add(topic)
        masks.append(tuple(mask))
    return masks


def automatic_sort_gen(flat_cols):
    """Return the index permutation that puts ``flat_cols`` in sorted order.

    The result is a tuple ``p`` such that ``[flat_cols[i] for i in p]`` is
    sorted ascending. Sorting the indices (rather than looking each sorted
    value up with ``.index``) keeps equal column names distinct: duplicates
    map to their own positions, in original order, because the sort is stable.

    Args:
        flat_cols: an indexable sequence of mutually comparable values.

    Returns:
        A tuple of integer positions into ``flat_cols``.
    """
    return tuple(sorted(range(len(flat_cols)), key=flat_cols.__getitem__))


def sort_with_mask(cols, mask):
    """Reorder ``cols`` by picking ``cols[i]`` for each index ``i`` in ``mask``.

    The result is a tuple with one element per entry of ``mask``.
    """
    reordered = []
    for position in mask:
        reordered.append(cols[position])
    return tuple(reordered)


def strip_whitespace(rows):
    """Return a list of lists with ``.strip()`` applied to every cell of ``rows``."""
    cleaned = []
    for cols in rows:
        stripped = []
        for col in cols:
            stripped.append(col.strip())
        cleaned.append(stripped)
    return cleaned


def combine_same_fields(csv_files, masks=None, delimiter=',', strip=True, autosort=True):
    """Lazily merge several CSV sources side by side, one output row per input row.

    This is a generator: nothing (including argument validation) runs until
    the first item is requested. The first yielded tuple is the combined
    header; every following tuple combines the n-th data row of each source.
    Iteration stops as soon as the shortest source is exhausted (``zip``).

    Args:
        csv_files: sequence of sources. An already-open text stream
            (any ``io.TextIOBase``, e.g. ``io.StringIO``) is used as-is and
            is not closed here; anything else is passed to ``open()`` and
            closed when the generator finishes or is closed.
        masks: optional sequence with one 0/1 mask per source selecting the
            columns to keep. When ``None`` the masks are derived from the
            headers with ``automatic_mask_gen``.
        delimiter: field delimiter handed to ``csv.reader``.
        strip: when true, surrounding whitespace is stripped from every cell.
        autosort: when true, output columns are reordered so that the
            combined header is sorted; data rows follow the same order.

    Yields:
        Tuples of cell values: first the header, then the data rows.

    Raises:
        ValueError: if ``masks`` is given with a different length than
            ``csv_files``, or if one of the sources has no header row.
    """
    if masks is not None and len(csv_files) != len(masks):
        raise ValueError('csv_files must be the same amout as masks.')
    with ExitStack() as stack:
        fds = [
            # newline='' is what the csv module expects for files it reads,
            # so that newlines embedded in quoted fields survive intact.
            file if isinstance(file, io.TextIOBase)
            else stack.enter_context(open(file, newline=''))
            for file in csv_files
        ]
        csv_iterators = [csv.reader(fd, delimiter=delimiter) for fd in fds]
        try:
            headers = [next(iterator) for iterator in csv_iterators]
        except StopIteration:
            # Exceptions take no keyword arguments; raise a plain, catchable
            # error and hide the internal StopIteration from the traceback.
            raise ValueError(
                'Not able to get the header of one of the csv-files'
            ) from None
        if strip:
            headers = strip_whitespace(headers)
        if masks is None:
            masks = automatic_mask_gen(headers)
        if autosort:
            combined_headers = combine_rows(headers, masks)
            # The permutation is computed once from the header and reused
            # for every data row so columns stay aligned.
            sort_mask = automatic_sort_gen(combined_headers)
            yield sort_with_mask(combined_headers, sort_mask)
        else:
            yield combine_rows(headers, masks)
        for rows in zip(*csv_iterators):
            if strip:
                rows = strip_whitespace(rows)
            if autosort:
                yield sort_with_mask(combine_rows(rows, masks), sort_mask)
            else:
                yield combine_rows(rows, masks)

# Two in-memory streams stand in for CSV files on disk.
CSV_1 = io.StringIO("""
DataCol1, DataCol2, DataCol3, P1, P2, P3, P4, P5
data1, data2, data3, 1,2,3,4,5
""".strip()) # fake file 1


CSV_2 = io.StringIO("""
ColName1, ColName2, ColName3, P1, P2, P3, P4, P5
ea1, ea2, ea3, 3,5,6,2,1
db1, db2, db3, 1,2,3,4,5
""".strip()) # fake file 2

# you can have more

# Example of a hand-written mask for the 2 csv data files: keep every column
# of the first file and only the first three columns of the second.
# It is NOT used below (masks=None lets the masks be generated from the
# headers); pass masks=masks instead to apply it.
masks = [(1,1,1, 1,1,1,1,1), (1,1,1, 0,0,0,0,0)]
# this is the resulting generator object
iterator = combine_same_fields([CSV_1, CSV_2], masks=None)
# Open an output file in write mode, create the csv.writer object and
# iterate over the generator. Each combined row it yields is written
# with writer.writerow. newline='' stops the platform from translating
# the writer's own line terminator (avoids blank lines on Windows).
with open('output.csv', 'w', newline='') as fd:
    writer = csv.writer(fd, delimiter=',')
    for row in iterator:
        writer.writerow(row)

# done


# show stored data
# small example with dict csv reader
with open('output.csv', newline='') as fd:
    print(fd.read())
    fd.seek(0) # back to start of file
    # example with dict reader
    reader = csv.DictReader(fd)
    print('Fields of reader:', reader.fieldnames)
    for row in reader:
        print(row)
Reading my own code days later Huh Big Grin Big Grin
Almost dead, but too lazy to die: https://sourceserver.info
All humans together. We don't need politicians!
Reply


Messages In This Thread
How to match two CSV files - by timlamont - Sep-29-2019, 08:25 AM
RE: How to match two CSV files - by Axel_Erfurt - Sep-29-2019, 09:24 AM
RE: How to match two CSV files - by timlamont - Sep-29-2019, 09:53 AM
RE: How to match two CSV files - by perfringo - Sep-29-2019, 10:21 AM
RE: How to match two CSV files - by timlamont - Sep-29-2019, 10:39 AM
RE: How to match two CSV files - by perfringo - Sep-30-2019, 05:02 AM
RE: How to match two CSV files - by buran - Sep-30-2019, 05:46 AM
RE: How to match two CSV files - by DeaD_EyE - Sep-30-2019, 07:16 AM
RE: How to match two CSV files - by DeaD_EyE - Oct-01-2019, 05:11 PM
RE: How to match two CSV files - by timlamont - Oct-01-2019, 05:54 PM

Possibly Related Threads…
Thread Author Replies Views Last Post
  Move Files based on partial Match mohamedsalih12 2 858 Sep-20-2023, 07:38 PM
Last Post: snippsat
  Open and read multiple text files and match words kozaizsvemira 3 6,792 Jul-07-2021, 11:27 AM
Last Post: Larz60+
  Look for match in two files and print out in the first file Batistuta 0 1,608 Mar-03-2020, 02:27 PM
Last Post: Batistuta
  Compare two large CSV files for a match Python_Newbie9 3 5,836 Apr-22-2019, 08:49 PM
Last Post: ichabod801
  Match CSV files for difference Cuz 4 3,570 Dec-18-2018, 02:16 PM
Last Post: Cuz

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020