How to match two CSV files

DeaD_EyE · (This post was last modified: Oct-01-2019, 05:11 PM by DeaD_EyE.)

import io # fake file
import sys
import csv
from contextlib import ExitStack
from itertools import (
    chain,
    compress,
    repeat,
)


def combine_rows(rows, masks):
    """Flatten several rows into one tuple, keeping only mask-selected cells.

    ``rows`` and ``masks`` are walked in lockstep; for each pair, the cells
    of the row whose corresponding mask entry is truthy are kept, in order.
    Pairing stops at the shorter of ``rows`` / ``masks``, and within a pair
    at the shorter of the row / mask.

    Returns:
        A single flat tuple of all kept cells.
    """
    kept = []
    for cells, selector in zip(rows, masks):
        # compress() drops every cell whose selector entry is falsy.
        kept.extend(compress(cells, selector))
    return tuple(kept)


def automatic_mask_gen(headers):
    """Build one selection mask per header so each column name is kept once.

    The first header is kept completely (mask of all ``1``). For every
    following header, a column gets ``1`` the first time its name is seen
    and ``0`` if that name already appeared in an earlier header (or earlier
    in the same header).

    Args:
        headers: Iterable of header rows, each an iterable of column names.

    Returns:
        A list of tuples of ``0``/``1``, one tuple per header, each as long
        as its header. An empty ``headers`` yields an empty list.
    """
    iterator = iter(headers)
    try:
        first = list(next(iterator))
    except StopIteration:
        # No headers at all: nothing to mask.
        return []
    selected = list(first)
    # One flag per *column* of the first header. The mask length must be
    # the number of columns, not the length of the first column's name.
    masks = [tuple(repeat(1, len(first)))]
    for header in iterator:
        mask = []
        for topic in header:
            if topic in selected:
                mask.append(0)
            else:
                mask.append(1)
                selected.append(topic)
        masks.append(tuple(mask))
    return masks


def automatic_sort_gen(flat_cols):
    """Return the index permutation that puts ``flat_cols`` in sorted order.

    The result is a tuple of positions into ``flat_cols``; reading
    ``flat_cols`` in that order gives the values in ascending order.
    Equal values keep their original relative order, so every position
    appears exactly once even when column names repeat.

    Args:
        flat_cols: Indexable sequence of mutually comparable values.

    Returns:
        Tuple of integer indices, same length as ``flat_cols``.
    """
    # Sort the positions rather than the values: looking values up again via
    # .index() would map every duplicate to the first occurrence.
    return tuple(sorted(range(len(flat_cols)), key=lambda idx: flat_cols[idx]))


def sort_with_mask(cols, mask):
    """Reorder ``cols`` by picking ``cols[idx]`` for each ``idx`` in ``mask``.

    Returns:
        A tuple with one element per entry of ``mask``, in mask order.
    """
    reordered = []
    for position in mask:
        picked = cols[position]
        reordered.append(picked)
    return tuple(reordered)


def strip_whitespace(rows):
    """Return a copy of ``rows`` with ``.strip()`` applied to every cell.

    Returns:
        A list of lists: one inner list per input row, holding the result
        of calling ``.strip()`` on each of that row's cells.
    """
    cleaned = []
    for cells in rows:
        trimmed = [cell.strip() for cell in cells]
        cleaned.append(trimmed)
    return cleaned


def combine_same_fields(csv_files, masks=None, delimiter=',', strip=True, autosort=True):
    """Yield rows that merge several CSV sources side by side.

    The first yielded row is the combined header; every following row is
    the combination of the next data row of each source. Data rows are
    paired with ``zip``, so output stops as soon as the shortest source is
    exhausted.

    This is a generator: nothing is opened or validated until iteration
    starts, so the errors below are raised on the first ``next()``.

    Args:
        csv_files: Sequence of sources. An object with a ``read`` attribute
            is used as an already-open text file and is not closed here;
            anything else is passed to ``open()`` and closed when the
            generator finishes.
        masks: Optional sequence with one 0/1 mask per source, selecting
            which columns of that source to keep. When ``None`` the masks
            are derived from the headers via ``automatic_mask_gen``.
        delimiter: Field delimiter handed to ``csv.reader``.
        strip: When true, surrounding whitespace is stripped from each cell.
        autosort: When true, output columns are reordered so the combined
            header is in sorted order.

    Raises:
        ValueError: If ``masks`` is given with a different length than
            ``csv_files``, or if a source has no header row.
    """
    if masks is not None and len(csv_files) != len(masks):
        raise ValueError('csv_files must be the same amout as masks.')
    with ExitStack() as stack:
        fds = [
            # File-like objects are used directly; paths are opened with
            # newline='' as the csv module requires, and registered on the
            # stack so they are closed on exit.
            file if hasattr(file, 'read')
            else stack.enter_context(open(file, newline=''))
            for file in csv_files
        ]
        csv_iterators = [csv.reader(fd, delimiter=delimiter) for fd in fds]
        try:
            headers = [next(iterator) for iterator in csv_iterators]
        except StopIteration:
            # Exceptions accept no keyword arguments; raise a plain message.
            raise ValueError(
                'Not able to get the header of one of the csv-files'
            ) from None
        if strip:
            headers = strip_whitespace(headers)
        if masks is None:
            masks = automatic_mask_gen(headers)
        header_row = combine_rows(headers, masks)
        sort_mask = None
        if autosort:
            # The permutation is computed once from the header and reused
            # for every data row so columns stay aligned.
            sort_mask = automatic_sort_gen(header_row)
            header_row = sort_with_mask(header_row, sort_mask)
        yield header_row
        for rows in zip(*csv_iterators):
            if strip:
                rows = strip_whitespace(rows)
            combined = combine_rows(rows, masks)
            if autosort:
                combined = sort_with_mask(combined, sort_mask)
            yield combined

# In-memory stand-ins for two CSV files on disk.
CSV_1 = io.StringIO("""
DataCol1, DataCol2, DataCol3, P1, P2, P3, P4, P5
data1, data2, data3, 1,2,3,4,5
""".strip()) # fake file 1


CSV_2 = io.StringIO("""
ColName1, ColName2, ColName3, P1, P2, P3, P4, P5
ea1, ea2, ea3, 3,5,6,2,1
db1, db2, db3, 1,2,3,4,5
""".strip()) # fake file 2

# you can have more

# Example of explicit masks for the two sources: keep every column of the
# first file, and only the first three columns of the second.
# It is not passed below; masks=None lets the masks be derived automatically.
masks = [(1,1,1, 1,1,1,1,1), (1,1,1, 0,0,0,0,0)]
# This is the resulting generator object; nothing is read until it is iterated.
iterator = combine_same_fields([CSV_1, CSV_2], masks=None)
# Open an output file in write mode, create the csv.writer object and
# iterate over the generator. Each combined row it yields is written
# with writer.writerow.
# newline='' is required for files handed to csv.writer; without it the
# writer's '\r\n' terminator becomes '\r\r\n' on Windows.
with open('output.csv', 'w', newline='') as fd:
    writer = csv.writer(fd, delimiter=',')
    for row in iterator:
        writer.writerow(row)

# done


# Show the stored data, first as raw text and then
# as a small example with the dict csv reader.
with open('output.csv') as fd:
    print(fd.read())
    fd.seek(0) # back to start of file
    # example with dict reader
    reader = csv.DictReader(fd)
    print('Fields of reader:', reader.fieldnames)
    for row in reader:
        print(row)

Reading my own code days later Huh

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Move Files based on partial Match	mohamedsalih12	2	858	Sep-20-2023, 07:38 PM Last Post: snippsat
	Open and read multiple text files and match words	kozaizsvemira	3	6,792	Jul-07-2021, 11:27 AM Last Post: Larz60+
	Look for match in two files and print out in the first file	Batistuta	0	1,608	Mar-03-2020, 02:27 PM Last Post: Batistuta
	Compare two large CSV files for a match	Python_Newbie9	3	5,836	Apr-22-2019, 08:49 PM Last Post: ichabod801
	Match CSV files for difference	Cuz	4	3,570	Dec-18-2018, 02:16 PM Last Post: Cuz

How to match two CSV files

User Panel Messages

Announcements