Oct-11-2017, 05:01 PM
I had started writing a similar script but never finished it until now; thanks for the reminder.
import argparse
import csv
import hashlib
import os
import time


def find_duplicates(args):
    """Driver function to find duplicate files."""
    all_files = recursive_search(args.path, recurse_flag=args.recursive)
    same_size = same_size_files(all_files)
    duplicates = {}
    for file_list in same_size.values():
        if len(file_list) > 1:
            # Only same-size files can be duplicates, so hash just those.
            duplicates.update(same_hash_dict(file_list))
    action(duplicates, oflag=args.output)


def same_size_files(file_list):
    """:param file_list: paths of the files to group
    :return: duplicates in format {FileSize: [FilePath, ...]}
    """
    duplicates = {}
    for path in file_list:
        size = os.stat(path).st_size
        duplicates.setdefault(size, []).append(path)
    return duplicates


def recursive_search(directory, recurse_flag=False, all_files=None):
    """:param directory: path of the directory to be searched
    :param recurse_flag: if True the subdirectories are searched too
    :param all_files: accumulator for the paths found so far
    :return: path strings of all files in the directory (and subdirectories)
    """
    # Create a fresh list per call: a mutable default argument would be
    # shared between calls and carry over results from previous runs.
    if all_files is None:
        all_files = []
    try:
        for entry in os.scandir(directory):
            if entry.is_dir():
                if recurse_flag:
                    # extend() keeps the recursive results; the original
                    # `all_files + (...)` computed a new list and threw it away.
                    all_files.extend(recursive_search(entry.path,
                                                      recurse_flag=recurse_flag))
            elif entry.is_file():
                all_files.append(entry.path)
    except PermissionError as e:
        print(e)
    return all_files


def same_hash_dict(file_list):
    """:param file_list: paths of same-size files
    :return: duplicates in format {FileHash: [FilePath, ...]}
    """
    duplicates = {}
    for path in file_list:
        file_hash = hashfile(path)
        duplicates.setdefault(file_hash, []).append(path)
    return duplicates


def hashfile(path, blocksize=1048576):
    """Return the MD5 hex digest of a file, read in blocksize chunks."""
    hasher = hashlib.md5()
    # `with` closes the file even if reading raises an exception.
    with open(path, 'rb') as curr_file:
        buf = curr_file.read(blocksize)
        while buf:
            hasher.update(buf)
            buf = curr_file.read(blocksize)
    return hasher.hexdigest()


def action(dup_dict, oflag=False):
    """:param dup_dict: dictionary of all duplicate files
    :param oflag: if True, also write the output to a CSV file
    """
    results = dup_dict.values()
    if results:
        print('Duplicates found (files with the same content):')
        print('\n' + '___' * 40)
        for result in results:
            for path in result:
                print('\t\t' + path)
            print('___' * 40)
    else:
        print('No duplicate files found.')
    if oflag:
        with open('duplicatefiles.csv', 'w', newline='') as csvfile:
            dupwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                   quoting=csv.QUOTE_MINIMAL)
            dupwriter.writerow(['FileName', 'FilePath'])
            for result in results:
                for path in result:
                    dupwriter.writerow([os.path.basename(path), path])
                # Blank row separates each group of duplicates.
                dupwriter.writerow([])


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', help='path to the directory to be scanned',
                        type=str)
    parser.add_argument('-o', '--output', help='write the result to a CSV file',
                        action='store_true')
    parser.add_argument('-r', '--recursive', help='search the path recursively',
                        action='store_true')
    args = parser.parse_args()
    start_time = time.time()
    find_duplicates(args)
    print('total time:', time.time() - start_time)


if __name__ == '__main__':
    main()
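For anyone who wants to try it: the script takes the directory to scan as a positional argument, plus optional flags for recursion and CSV output (both defined in main() above). Assuming it is saved as dupfinder.py (the filename is just for illustration), a run looks like:

python dupfinder.py /path/to/dir -r -o

That scans /path/to/dir and all its subdirectories, prints each group of identical files, and writes the same groups to duplicatefiles.csv in the current directory.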