Aug-12-2020, 03:35 PM
Hi
I have two large input files (>10 GB each, N rows x 4 columns). The task is to sort each file by column 2 as fast as possible. Right now I split each file into chunks, sort each chunk, and merge the sorted chunks into a text file (code below). It works, but I need better speed!
Is there a faster way to do this? I later have to read the sorted files in chunks; how can that be done with the PyTables or h5py module? Or any other suggestions?
import os
import glob
from heapq import merge

filename = ['Input-1.txt', 'Input-2.txt']
savename = ['Sort-1.txt', 'Sort-2.txt']
chunksize = 10_000_000                                   # lines per chunk

this_dir = os.path.dirname(__file__)
path_1 = ["dump/chunk1_{}.tsv", "dump/chunk2_{}.tsv"]    # chunk files in ./dump
path_2 = ["dump/chunk1_*.tsv", "dump/chunk2_*.tsv"]

for findex in range(2):
    path_w = os.path.join(this_dir, path_1[findex])
    path_r = os.path.join(this_dir, path_2[findex])

    fid = 1
    lines = []
    with open(filename[findex], 'r') as f_in:
        # pass 1: split the input into individually sorted chunk files
        f_out = open(path_w.format(fid), 'w')
        for line_num, line in enumerate(f_in, 1):
            lines.append(line)
            # once chunksize lines have accumulated, sort and flush them
            if line_num % chunksize == 0:
                lines.sort(key=lambda k: float(k.split(',')[1]))
                f_out.writelines(lines)
                f_out.close()
                lines = []
                fid += 1
                f_out = open(path_w.format(fid), 'w')    # next chunk file
        # last, possibly partial, chunk
        if lines:
            lines.sort(key=lambda k: float(k.split(',')[1]))
            f_out.writelines(lines)
        f_out.close()

    print(f'==> Writing {savename[findex]}')

    # pass 2: k-way merge of the sorted chunk files
    chunks = [open(chunk_name, 'r') for chunk_name in glob.glob(path_r)]
    with open(savename[findex], 'w') as f_out:
        f_out.writelines(merge(*chunks, key=lambda k: float(k.split(',')[1])))
    for chunk in chunks:
        chunk.close()
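One alternative I've been weighing for the speed problem is handing the whole job to GNU sort, which does its own external merge sort on disk. A minimal sketch, assuming the files are comma-separated and GNU coreutils is available (the memory and thread settings are just examples):

import subprocess

# Sort Input-1.txt numerically on the second comma-separated field and
# write the result to Sort-1.txt. Assumes GNU coreutils sort:
# -t, sets the field separator, -k2,2g sorts the second field as a
# general numeric value, -S caps the memory buffer, and --parallel
# sets the thread count (available in coreutils >= 8.6).
subprocess.run(
    ['sort', '-t,', '-k2,2g', '-S', '2G', '--parallel=4',
     '-o', 'Sort-1.txt', 'Input-1.txt'],
    check=True,
)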
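For the later chunked reading, this is roughly what I have in mind with h5py: convert each sorted text file to HDF5 once, then slice it block by block. A rough sketch, assuming all four columns are floats; the dataset name 'data' and the block size are placeholders:

import numpy as np
import h5py

rows_per_block = 1_000_000

# One-time conversion: stream the sorted CSV into a resizable HDF5 dataset
# without ever holding the whole file in memory.
with h5py.File('Sort-1.h5', 'w') as h5, open('Sort-1.txt') as f:
    dset = h5.create_dataset('data', shape=(0, 4), maxshape=(None, 4),
                             dtype='f8', chunks=(100_000, 4))
    while True:
        # pull at most rows_per_block lines off the file iterator
        block = [line.split(',') for _, line in zip(range(rows_per_block), f)]
        if not block:
            break
        arr = np.asarray(block, dtype='f8')
        dset.resize(dset.shape[0] + len(arr), axis=0)
        dset[-len(arr):] = arr

# Later: read the sorted data back block by block.
with h5py.File('Sort-1.h5', 'r') as h5:
    dset = h5['data']
    for start in range(0, dset.shape[0], rows_per_block):
        block = dset[start:start + rows_per_block]   # plain numpy array
        # ... process block ...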