dedup_image2.py
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
# import the necessary packages
from imutils import paths
import numpy as np
import argparse
import cv2
import os
import csv
csv_columns = ['Hash','Paths']
csv_file = "Duplicates.csv"
# %%
def dhash(image, hashSize=8):
    # convert the image to grayscale and resize the grayscale image,
    # adding a single column (width) so we can compute the horizontal
    # gradient
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (hashSize + 1, hashSize))
    # compute the (relative) horizontal gradient between adjacent
    # column pixels
    diff = resized[:, 1:] > resized[:, :-1]
    # convert the difference image to a hash and return it
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])
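
# %%
# Illustration (not part of the original script): with the default
# hashSize=8 the diff image has 8x8 = 64 bits, so dhash returns a
# 64-bit integer, and two images are grouped as duplicates only when
# their hashes match exactly. A minimal sketch, assuming two
# hypothetical files "a.jpg" and "b.jpg":
#
#   img_a = cv2.imread("a.jpg")
#   img_b = cv2.imread("b.jpg")
#   if dhash(img_a) == dhash(img_b):
#       print("likely duplicates")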
# %%
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True,
    help="path to input dataset")
ap.add_argument("-r", "--remove", type=int, default=-1,
    help="whether or not duplicates should be removed (<= 0 performs a dry run)")
args = vars(ap.parse_args())
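# Example invocations (the dataset directory name is hypothetical, not from the repo):
#   dry run, only report duplicate hashes:
#       python dedup_image2.py --dataset ./images
#   actually delete the duplicate files:
#       python dedup_image2.py --dataset ./images --remove 1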
# %%
# grab the paths to all images in our input dataset directory and
# then initialize our hashes dictionary
print("[INFO] computing image hashes...")
imagePaths = list(paths.list_images(args["dataset"]))
hashes = {}
# loop over our image paths
for imagePath in imagePaths:
    # load the input image and compute the hash; skip files that
    # OpenCV cannot decode (cv2.imread returns None for those)
    image = cv2.imread(imagePath)
    if image is None:
        print("[WARN] could not read %s, skipping" % imagePath)
        continue
    h = dhash(image)
    # grab all image paths with that hash, add the current image
    # path to it, and store the list back in the hashes dictionary
    p = hashes.get(h, [])
    p.append(imagePath)
    hashes[h] = p
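
# After this loop, hashes maps each dhash value to every file that produced it,
# e.g. (paths and values are illustrative, not from the original dataset):
#   {1234567890123456789: ['classA/img_001.jpg', 'classB/img_073.jpg'], ...}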
# %%
# write the hash -> paths mapping to the CSV file; using the csv module
# keeps the output valid CSV even though the path lists contain commas
write_header = not os.path.isfile(csv_file)
try:
    with open(csv_file, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(csv_columns)
        for key in hashes.keys():
            writer.writerow([key, ';'.join(hashes[key])])
except IOError:
    print("[ERROR] could not write %s" % csv_file)
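# Each CSV row then pairs one hash with its ';'-joined paths, e.g.
# (illustrative values only):
#   Hash,Paths
#   1234567890123456789,classA/img_001.jpg;classB/img_073.jpg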
# loop over the image hashes
for (h, hashedPaths) in hashes.items():
    # check to see if there is more than one image with the same hash
    if len(hashedPaths) > 1:
        # check to see if we got duplicates in a single class or in
        # multiple classes; the class is the containing directory, so
        # os.path.dirname is used instead of splitting on backslashes
        # (the original split only worked with Windows-style paths)
        kind = []
        killall = False
        for p in hashedPaths:
            k = os.path.dirname(p)
            if len(kind) == 0:
                kind.append(k)
            elif k not in kind:
                # go for a total kill as we have the same image in multiple classes
                killall = True
        flag = 'TRUE' if killall else 'FALSE'
        if args["remove"] <= 0:
            # dry run: only report what would happen
            print("INFO: hash %s killall flag is %s" % (h, flag))
        else:
            print("INFO: cleaning hash %s with killall flag set to %s" % (h, flag))
            if killall:
                # the image appears in multiple classes, so remove every copy
                for p in hashedPaths:
                    os.remove(p)
            else:
                # keep the first occurrence of this image; its duplicates
                # all live in the same class
                for p in hashedPaths[1:]:
                    os.remove(p)
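
# %%
# Illustrative dry-run output from the prints above (hash values are
# hypothetical, not taken from a real run):
#   [INFO] computing image hashes...
#   INFO: hash 1234567890123456789 killall flag is FALSE
#   INFO: hash 9876543210987654321 killall flag is TRUE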