snippets

More or less useful code snippets
Log | Files | Refs

finddups.py (1235B)


      1 #!/usr/bin/python3
      2 
      3 # TODO: Allow giving multiple directories as arguments and check them all
      4 # TODO: Discover directories with no unique files
      5 # TODO: Make interactive deletion functionality?
      6 
      7 import os
      8 from hashlib import sha256
      9 
# Substring filter: only files whose NAME CONTAINS this string are hashed
# (note: not a suffix match — "x.jpg.bak" also matches).
filefilter = ".jpg" # TODO: Make this an argument, use regex ++

# Maps sha256 hex digest -> list of absolute paths of files with that content;
# buckets with two or more entries are duplicate groups.
hash_to_file_list = {}
     13 
     14 def get_file_hash(fi):
     15     with open(fi, "rb") as f:
     16         return sha256(f.read()).hexdigest()
     17 
     18 ### Generate hash for all files
     19 for rootdir, _, files in os.walk("."):
     20     try:
     21         files = [f for f in files if filefilter in f]
     22         for fi in files:
     23             fpath = os.path.join(rootdir, fi)
     24             fhash = get_file_hash(fpath)
     25             if fhash in hash_to_file_list:
     26                 hash_to_file_list[fhash] += [os.path.abspath(fpath)]
     27             else:
     28                 hash_to_file_list[fhash] = [os.path.abspath(fpath)]
     29     except Exception as e :
     30         print(e)
     31 
     32 ### Find all lists with two or more elements (i.e. duplicate files)
     33 dls = [copies for copies in hash_to_file_list.values() if len(copies) >= 2]
     34 if len(dls) > 0:
     35     print("The following files are copies of each other:")
     36     for duplist in dls:
     37         print(" = ".join(duplist))
     38 else:
     39     print("No duplicates were found :)")