Oct-11-2017, 05:01 PM
I had started writing a similar script but never finished it until now; thanks for the reminder.
import argparse
import csv
import hashlib
import os
import time


def find_duplicates(args):
    """Driver function to find duplicate files."""
    all_files = recursive_search(args.path, recurse_flag=args.recursive)
    same_size = same_size_files(all_files)
    duplicates = {}
    for file_list in same_size.values():
        if len(file_list) > 1:
            # Only same-size files can be duplicates, so hash just those.
            duplicates.update(same_hash_dict(file_list))
    action(duplicates, oflag=args.output)


def same_size_files(file_list):
    """:param file_list: paths of the files to group
    :return: duplicates in format {FileSize: [FilePath, ...]}
    """
    duplicates = {}
    for path in file_list:
        size = os.stat(path).st_size
        duplicates.setdefault(size, []).append(path)
    return duplicates


def recursive_search(directory, recurse_flag=False, all_files=None):
    """:param directory: path of the directory to be searched
    :param recurse_flag: if True the subdirectories are searched too
    :param all_files: accumulator for the paths found so far
    :return: path strings of all files in the directory (and subdirectories)
    """
    # Create a fresh list per call: a mutable default argument would be
    # shared between calls and carry over results from previous runs.
    if all_files is None:
        all_files = []
    try:
        for entry in os.scandir(directory):
            if entry.is_dir():
                if recurse_flag:
                    # extend() keeps the recursive results; the original
                    # `all_files + (...)` computed a new list and threw it away.
                    all_files.extend(recursive_search(entry.path,
                                                      recurse_flag=recurse_flag))
            elif entry.is_file():
                all_files.append(entry.path)
    except PermissionError as e:
        print(e)
    return all_files


def same_hash_dict(file_list):
    """:param file_list: paths of same-size files
    :return: duplicates in format {FileHash: [FilePath, ...]}
    """
    duplicates = {}
    for path in file_list:
        file_hash = hashfile(path)
        duplicates.setdefault(file_hash, []).append(path)
    return duplicates


def hashfile(path, blocksize=1048576):
    """Return the MD5 hex digest of a file, read in blocksize chunks."""
    hasher = hashlib.md5()
    # `with` closes the file even if reading raises an exception.
    with open(path, 'rb') as curr_file:
        buf = curr_file.read(blocksize)
        while buf:
            hasher.update(buf)
            buf = curr_file.read(blocksize)
    return hasher.hexdigest()


def action(dup_dict, oflag=False):
    """:param dup_dict: dictionary of all duplicate files
    :param oflag: if True, also write the output to a CSV file
    """
    results = dup_dict.values()
    if results:
        print('Duplicates found (files with the same content):')
        print('\n' + '___' * 40)
        for result in results:
            for path in result:
                print('\t\t' + path)
            print('___' * 40)
    else:
        print('No duplicate files found.')
    if oflag:
        with open('duplicatefiles.csv', 'w', newline='') as csvfile:
            dupwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                   quoting=csv.QUOTE_MINIMAL)
            dupwriter.writerow(['FileName', 'FilePath'])
            for result in results:
                for path in result:
                    dupwriter.writerow([os.path.basename(path), path])
                # Blank row separates each group of duplicates.
                dupwriter.writerow([])


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', help='path to the directory to be scanned',
                        type=str)
    parser.add_argument('-o', '--output', help='write the result to a CSV file',
                        action='store_true')
    parser.add_argument('-r', '--recursive', help='search the path recursively',
                        action='store_true')
    args = parser.parse_args()
    start_time = time.time()
    find_duplicates(args)
    print('total time:', time.time() - start_time)


if __name__ == '__main__':
    main()
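For anyone who wants to try it: the script takes the directory to scan as a positional argument, plus optional flags for recursion and CSV output (both defined in main() above). Assuming it is saved as dupfinder.py (the filename is just for illustration), a run looks like:

python dupfinder.py /path/to/dir -r -o

That scans /path/to/dir and all its subdirectories, prints each group of identical files, and writes the same groups to duplicatefiles.csv in the current directory.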