duplicates.py
import os
import hashlib
from collections import defaultdict
import csv

# Root folder to scan for duplicate files (relative to the current working directory)
src_folder = "../../"


def generate_md5(fname, chunk_size=1024):
    """
    Return the MD5 checksum of the file at fname, read in chunk_size blocks.
    """
    md5_hash = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first block of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash each time
        while chunk:
            md5_hash.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex digest of the checksum
    return md5_hash.hexdigest()
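

# Assumption, not part of the original script: on Python 3.11+ the same
# chunked read can be delegated to hashlib.file_digest. The helper below is
# a hypothetical equivalent shown for illustration only; nothing calls it.
def generate_md5_file_digest(fname):
    """
    Sketch of generate_md5 built on hashlib.file_digest (Python 3.11+).
    """
    with open(fname, "rb") as f:
        return hashlib.file_digest(f, "md5").hexdigest()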


if __name__ == "__main__":
    # Entry point of the script
    # The dict maps each MD5 checksum to a list of file paths
    md5_dict = defaultdict(list)
    # File extensions that are in scope for the duplicate search
    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]
    # Walk through all files and folders within the source directory
    for path, dirs, files in os.walk(src_folder):
        print("Analyzing {}".format(path))
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable is updated by os.walk for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files with the same checksum end up in the same list
                md5_dict[generate_md5(file_path)].append(file_path)
    # Keep only the checksums (keys) shared by more than one file path (values)
    duplicate_files = (
        val for key, val in md5_dict.items() if len(val) > 1)
    # Write the groups of duplicate files to a CSV file
    with open("duplicates.csv", "w") as log:
        # lineterminator is set explicitly; on Windows csv otherwise inserts blank rows
        csv_writer = csv.writer(log, quoting=csv.QUOTE_MINIMAL, delimiter=",",
                                lineterminator="\n")
        header = ["File Names"]
        csv_writer.writerow(header)
        # Each row holds every path that shares one checksum
        for file_name in duplicate_files:
            csv_writer.writerow(file_name)
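
A minimal sketch of reading the output back, assuming duplicates.csv was produced by a run of the script above: each row holds every path that shares one checksum, so rows can have different numbers of columns and a plain csv.reader handles them directly.

import csv

with open("duplicates.csv", newline="") as log:
    reader = csv.reader(log)
    next(reader)  # skip the "File Names" header row
    for duplicate_set in reader:
        # Every path in this row hashed to the same MD5 checksum
        print("Duplicate copies:", ", ".join(duplicate_set))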