finddups.py (1235B)
#!/usr/bin/python3

# TODO: Allow giving multiple directories as arguments and check them all
# TODO: Discover directories with no unique files
# TODO: Make interactive deletion functionality?

import os
from hashlib import sha256

filefilter = ".jpg"  # TODO: Make this an argument, use regex ++

hash_to_file_list = {}

def get_file_hash(fi):
    # Reads the whole file into memory at once; fine for typical photos,
    # but large files may warrant chunked hashing (see the sketch below).
    with open(fi, "rb") as f:
        return sha256(f.read()).hexdigest()

### Generate hash for all files
for rootdir, _, files in os.walk("."):
    try:
        files = [f for f in files if filefilter in f]
        for fi in files:
            fpath = os.path.join(rootdir, fi)
            fhash = get_file_hash(fpath)
            if fhash in hash_to_file_list:
                hash_to_file_list[fhash] += [os.path.abspath(fpath)]
            else:
                hash_to_file_list[fhash] = [os.path.abspath(fpath)]
    except Exception as e:
        print(e)

### Find all lists with two or more elements (i.e. duplicate files)
dls = [copies for copies in hash_to_file_list.values() if len(copies) >= 2]
if len(dls) > 0:
    print("The following files are copies of each other:")
    for duplist in dls:
        print(" = ".join(duplist))
else:
    print("No duplicates were found :)")
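
Note: get_file_hash reads each file into memory in one go. For large files, a chunked variant using hashlib's incremental update keeps memory use constant. A minimal sketch, not part of the original script; the chunksize default is illustrative:

def get_file_hash(fi, chunksize=1 << 20):
    # Hash the file in 1 MiB chunks instead of reading it all at once.
    h = sha256()
    with open(fi, "rb") as f:
        for chunk in iter(lambda: f.read(chunksize), b""):
            h.update(chunk)
    return h.hexdigest()

This would be a drop-in replacement for the function above, since both return the same hex digest for the same file contents.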
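For the first TODO (multiple directories as arguments), one possible direction is to walk each path given on the command line. A sketch only, assuming the hashing loop above is reused; the argument names and the --filter option are illustrative, not part of the original script:

import argparse
import os

parser = argparse.ArgumentParser(description="Find duplicate files by SHA-256 hash.")
parser.add_argument("directories", nargs="*", default=["."],
                    help="directories to scan (default: current directory)")
parser.add_argument("--filter", default=".jpg",
                    help="substring a filename must contain (default: .jpg)")
args = parser.parse_args()

for directory in args.directories:
    for rootdir, _, files in os.walk(directory):
        ...  # same hashing loop as above, with filefilter = args.filter

This would also cover the related TODO of making filefilter an argument rather than a hard-coded constant.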