Search for duplicated files
#8
I started a similar script a while ago but never finished it until now; thanks for the reminder.

import argparse
import csv
import hashlib
import os
import time

def find_duplicates(args):
    """Driver function to find duplicate files
    """
    all_files = recursive_search(args.path,recurse_flag=args.recursive)
    same_size = same_size_files(all_files)
    duplicates ={}
    for file_list in same_size.values():
        if len(file_list)>1:
            duplicates.update(same_hash_dict(file_list))

    action(duplicates,oflag=args.output)

def same_size_files(file_list):
    """:param file_list:
       :return: duplicates in format {FileSize:FilePath}
    """
    duplicates = {}
    for path in file_list:
        size = os.stat(path).st_size
        if size in duplicates:
            duplicates[size].append(path)
        else:
            duplicates[size] = [path]
    return duplicates


def recursive_search(directory, recurse_flag=False, all_files=None):
    """:param directory: path of the directory to be searched
       :param recurse_flag: if True, subdirectories are searched too
       :param all_files: list that collects results across recursive calls
       :return: paths of all files in the directory (and its subdirectories)
    """
    if all_files is None:  # avoid sharing a mutable default argument between calls
        all_files = []
    try:
        for entry in os.scandir(directory):
            if entry.is_dir():
                if recurse_flag:
                    recursive_search(entry.path, recurse_flag, all_files)
            elif entry.is_file():
                all_files.append(entry.path)
    except PermissionError as e:
        print(e)
    return all_files


def same_hash_dict(file_list):
    """:param file_list:
       :return: duplicates in format {FileHash:FilePath}
    """
    duplicates = {}
    for path in file_list:
        file_hash = hashfile(path)
        if file_hash in duplicates:
            duplicates[file_hash].append(path)
        else:
            duplicates[file_hash] = [path]
    return duplicates


def hashfile(path, blocksize=1048576):
    """Return the MD5 hex digest of a file, read in blocksize chunks."""
    hasher = hashlib.md5()
    with open(path, 'rb') as curr_file:
        buf = curr_file.read(blocksize)
        while buf:
            hasher.update(buf)
            buf = curr_file.read(blocksize)
    return hasher.hexdigest()


def action(dup_dict, oflag=False):
    """:param dup_dict: dictionary of all duplicate files
       :param oflag: if True, also write the output to a CSV file
    """
    results = dup_dict.values()
    if len(results) > 0:
        print('Duplicates found (files with the same content):')
        print('\n' + '___' * 40)
        for result in results:
            for path in result:
                print('\t\t' + path)
            print('___' * 40)
    else:
        print('No duplicate files found.')
    if oflag:
        with open('duplicatefiles.csv', 'w', newline='') as csvfile:
            dupwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            dupwriter.writerow(['FileName', 'FilePath'])
            for result in results:
                for path in result:
                    dupwriter.writerow([os.path.basename(path), path])
                dupwriter.writerow([])


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', help='path to the directory to be scanned', type=str)
    parser.add_argument('-o', '--output', help='write the result to a CSV file', action='store_true')
    parser.add_argument('-r', '--recursive', help='search the path recursively', action='store_true')
    args = parser.parse_args()
    start_time = time.time()
    find_duplicates(args)
    print("total time: ", time.time() - start_time)


if __name__ == '__main__':
    main()
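
In case anyone wants to reuse the pieces from another script rather than the command line, here is a minimal sketch. It assumes the code above is saved as find_duplicates.py (the filename is my own choice, not stated above) and fakes the parsed arguments with argparse.Namespace:

# command-line usage (assuming the script is saved as find_duplicates.py):
#   python find_duplicates.py /path/to/scan -r -o
#
# programmatic usage:
from argparse import Namespace

import find_duplicates as fd

# mimic the parsed command-line arguments: scan /tmp recursively, no CSV output
args = Namespace(path='/tmp', recursive=True, output=False)
fd.find_duplicates(args)

# the helpers also work on their own, e.g. hashing a single file
print(fd.hashfile('/tmp/example.txt'))  # example path, point it at a file that exists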