Hi,
I am a beginner in python. I've read the Fluent in Python book, and I am trying to dive deeper.
Here is a file based merge sort. I have a file that contains around 10 million numbers.Each number is in their own line. The program will read 1 million numbers at a time, sort them, and store them in temporary files. Finally, a merge process will merge all temporary files into one output file.
I am not very confident about what I coded. Can you guys help make this little program more pythonic? Is there anything I missed that could save more memory or cpu time? Any help is appreciated.
I am a beginner in python. I've read the Fluent in Python book, and I am trying to dive deeper.
Here is a file based merge sort. I have a file that contains around 10 million numbers.Each number is in their own line. The program will read 1 million numbers at a time, sort them, and store them in temporary files. Finally, a merge process will merge all temporary files into one output file.
I am not very confident about what I coded. Can you guys help make this little program more pythonic? Is there anything I missed that could save more memory or cpu time? Any help is appreciated.
from itertools import zip_longest import glob def grouper(iterable, n, fillvalue=None): """Collect data into fixed-length chunks or blocks""" # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" args = [iter(iterable)] * n return zip_longest(*args, fillvalue=fillvalue) class MergeSort: # temp folder TEMP_FOLDER = "temp/" # number of lines to read at a time N = 1000000 def __init__(self, input_file_name, output_file_name): self.input_file_name = input_file_name self.output_file_name = output_file_name def __divide(self): """read numbers to sort, divide into chunks, sort each chunk and write to temp files""" with open(self.input_file_name, "r") as fin: i = 1 # only used for logging purpose for lines in grouper(fin, self.N): print("Sorting batch " + str(i) + "...") data = sorted(lines, key=int) with open(self.TEMP_FOLDER + str(i) + ".text", "w+") as temp: print("Writing batch " + str(i) + "...") temp.writelines(data) i += 1 def __merge(self): """merge sorted temp files into one""" files = [] for f in glob.glob(self.TEMP_FOLDER + "*.text"): print(f) fin = open(f, "r") files.append(fin) with open(self.output_file_name, "w+") as fout: values = [] for f in files: val = int(f.readline()) values.append(val) while files: minval = min(values) index = values.index(minval) print(minval) fout.write(str(minval) + "\n") # read next line data = files[index].readline() if data: del values[index] # values.pop(index) values.insert(index, int(data)) else: files[index].close() # close temp file del files[index] # files.pop(index) del values[index] # values.pop(index) def mergesort(self): self.__divide() self.__merge() if __name__ == '__main__': # file to sort INPUT_FILE_NAME = "numbers_to_sort.text" # sorted file OUTPUT_FILE_NAME = "sorted_numbers.text" MergeSort(INPUT_FILE_NAME, OUTPUT_FILE_NAME).mergesort()