-
Notifications
You must be signed in to change notification settings - Fork 1
/
pull_all.py
270 lines (223 loc) · 12.6 KB
/
pull_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#!/usr/bin/env python3
"""Image Pulling script for UAF-Plankline
Usage:
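./pull_all.py (-p <probability> | -b) (-d <csv directory> | -c <csv file>) -o <output directory> -r <tar directory> [-t <taxon> ...] [-m <min images>] [-dc] [-s]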
License:
MIT License
Copyright (c) 2023 Thomas Kelly
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import os
import glob
import argparse
import csv # to parse all of the csv files
import tarfile # untar the images
import re
import shutil # for copying files
# arg types
def directory(arg):
    """argparse ``type=`` callback that only accepts existing directories.

    Returns ``arg`` unchanged when ``os.path.isdir`` reports it as a
    directory; otherwise raises ``argparse.ArgumentTypeError``.
    """
    if not os.path.isdir(arg):
        raise argparse.ArgumentTypeError("Not a valid directory path")
    return arg
def csv_type(arg):
    """argparse ``type=`` callback that only accepts existing ``.csv`` files.

    Returns ``arg`` unchanged when it is an existing regular file whose name
    ends in ``.csv``. Raises ``argparse.ArgumentTypeError`` with "Not a file"
    when the path is not a file, or "Not a csv file" when the suffix is wrong.
    """
    if os.path.isfile(arg):
        if arg.endswith(".csv"):
            return arg
        raise argparse.ArgumentTypeError("Not a csv file")
    raise argparse.ArgumentTypeError("Not a file")
def float01(arg):
    """argparse ``type=`` callback for a probability strictly between 0 and 1.

    Args:
        arg: The raw command-line text (anything ``float()`` accepts).

    Returns:
        The parsed value as a ``float``.

    Raises:
        argparse.ArgumentTypeError: If ``arg`` cannot be parsed as a float, or
            if the parsed value is not inside the open interval (0, 1).
    """
    # Parse explicitly instead of testing the truthiness of float(arg): the
    # truthiness test mislabelled "0" as "not a number" (0.0 is falsy) and let
    # a bare ValueError escape for non-numeric text.
    try:
        value = float(arg)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError("Not a floating point number") from None
    # NaN fails both comparisons, so it is rejected here as well.
    if not 0.0 < value < 1.0:
        raise argparse.ArgumentTypeError("Not a floating point value between 0 and 1")
    return value
# check to make sure the image doesn't already exist. Some images in different tars are given the same name
def add_unique_postfix(fn):
    """Return a path based on ``fn`` that does not exist on disk yet.

    If ``fn`` itself is free it is returned unchanged. Otherwise a counter in
    parentheses is inserted before the extension (``name(2).ext``,
    ``name(3).ext``, ...) and the first unused candidate is returned.
    Returns ``None`` when every counter from 2 through 9999 is already taken.
    """
    if not os.path.exists(fn):
        return fn

    parent, filename = os.path.split(fn)
    stem, extension = os.path.splitext(filename)
    for counter in range(2, 10000):
        candidate = os.path.join(parent, '%s(%d)%s' % (stem, counter, extension))
        if not os.path.exists(candidate):
            return candidate
    return None
def get_parser():
    """Build the command-line parser for the script.

    Two mutually exclusive groups are required: the selection mode
    (``-p``/``-b``) and the csv source (``-d``/``-c``). ``-o`` and ``-r`` are
    required as well; the remaining options are optional.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="Tool that was created to help expand the data for the taxon that we do not have many images of by utilizing the data from the images that were already classified."
    )

    # Passing several taxa in one run is encouraged: most of the time is spent
    # unpacking archives, so extra taxa add only a little work.
    parser.add_argument(
        "-t", "--taxon",
        required=False,
        type=str,
        nargs='+',
        help="The taxons that you are looking to find from the csv files. This should be given as a list of taxon separated by a space.",
    )

    # How images are selected: probability threshold or top classification.
    selection = parser.add_mutually_exclusive_group(required=True)
    selection.add_argument(
        "-p", "--probability_taxon",
        type=float01,
        help="The probability that a taxon needs to be over for the program to extract image extract the image, this should be a value between (0,1)",
    )
    selection.add_argument(
        "-b", "--best_taxon",
        action="store_true",
        help="Extracts the images if the image was classified as a certain taxa, that is one of the taxons that you are trying to find.",
    )

    # Where the classification csv data comes from: a directory or one file.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument(
        "-d", "--input_drive",
        type=directory,
        help="The directory that you want to be pulling csv files from to look for images",
    )
    source.add_argument(
        "-c", "--input_csv",
        type=csv_type,
        help="The csv file that you want to look for taxon images.",
    )

    parser.add_argument(
        "-o", "--output",
        required=True,
        type=directory,
        help="The directory that you want to put the taxon folders of images",
    )
    parser.add_argument(
        "-r", "--raw_tars",
        required=True,
        type=directory,
        help="The directory of all of the tar images that correspond to the csv data",
    )
    parser.add_argument(
        "-m", "--min_images",
        type=int,
        default=0,
        help="The minimum number of images you need to have in a tar.gz in order to unzip it, this defaults to 0",
    )
    parser.add_argument(
        "-dc", "--different_columns",
        action="store_true",
        default=False,
        help="Checks all of the csvs and tar.gz files. This should be used if the csv files do not follow the same pattern.",
    )
    parser.add_argument(
        "-s", "--strict_subclasses",
        action="store_true",
        default=False,
        help="Doesn't find any substrings, only uses the taxon names that you input exactly",
    )
    return parser
def validate_args(parser):
    """Parse the command line, cross-check the options and echo them.

    Beyond what argparse enforces, this pairs every csv file with a tar file
    from ``--raw_tars`` (a tar matches when its file name, with ".tar"
    removed, occurs in the csv file name), requires the output directory to
    be empty, and requires ``--min_images`` to be non-negative.

    Args:
        parser: The ``argparse.ArgumentParser`` to parse ``sys.argv`` with.

    Returns:
        A tuple ``(csvs, tars_dir, out_dir, file_dict, min_images, best_taxon,
        probability_taxon, different_columns, strict_subclasses)`` where
        ``csvs`` is the list of csv paths and ``file_dict`` maps each csv path
        to the path of its matched tar file.

    Raises:
        SystemExit: Through ``parser.error`` (exit status 2) when any check
            fails, including when one csv is matched by more than one tar.
    """
    args = parser.parse_args()
    tars_dir = args.raw_tars
    csvs = []
    file_dict = {}  # csv path -> path of the tar file matched to it

    if args.input_drive:  # a directory of csv files
        csvs = glob.glob(args.input_drive + "/*.csv")
        if not csvs:
            parser.error("-d, --input_drive Directory does not contain any csv files")
        print("Input Drive: %s" % (args.input_drive))
        print("Number of Input CSVs: %d" % (len(csvs)))

        # Link each tar to the first csv whose name contains the tar's name.
        for tar in os.listdir(tars_dir):
            unique_name = os.path.basename(tar).replace(".tar", "")
            matches = glob.glob(str(args.input_drive) + "/*" + str(unique_name) + "*.csv")
            if not matches:
                continue
            csv_filename = matches[0]
            if csv_filename in file_dict:
                # Report through the parser so the failure goes to stderr with
                # a non-zero exit status, like every other validation error
                # here (the bare exit() used before exited with status 0).
                parser.error("-r, --raw_tars More than one tar file have been matched with a csv file.")
            file_dict[csv_filename] = os.path.join(tars_dir, tar)
    else:  # a single csv file
        print("Input CSV:", args.input_csv)
        csv_basename = os.path.basename(args.input_csv)
        for tar in os.listdir(tars_dir):
            unique_name = os.path.basename(tar).replace(".tar", "")
            if unique_name not in csv_basename:
                continue
            if args.input_csv in file_dict:
                parser.error("-r, --raw_tars More than one tar file have been matched with the given csv file.")
            file_dict[args.input_csv] = os.path.join(tars_dir, tar)
        csvs.append(args.input_csv)

    # Every csv must have found a tar to pull its images from.
    for csv_file in csvs:
        if csv_file not in file_dict:
            print(file_dict)  # show which pairings were found, to help diagnose the miss
            parser.error("-r, --raw_tars does not have a corresponding tar for the csv file %s" % csv_file)
    print("Raw tar Directory:", tars_dir)

    # Bound unconditionally so the return below can never hit an unbound name.
    out_dir = args.output
    if out_dir:
        if glob.glob(out_dir + "/*"):
            parser.error("-o, --output Directory can not contain any any files or directories so that we can avoid overwriting existing files.")
        print("Output Directory:", out_dir)

    min_images = args.min_images
    if min_images < 0:
        parser.error("-m, --min_images The minimum numbers of images you extract needs to be a positive integer.")
    print("Minimum images needed to unzip:", min_images)

    if args.best_taxon:
        print("\n----- Finding chosen taxon where they are the top probability -----")
    else:
        print("\n----- Finding chosen taxon with probabilities above %s -----" % args.probability_taxon)

    return (csvs, tars_dir, out_dir, file_dict, min_images, args.best_taxon,
            args.probability_taxon, args.different_columns, args.strict_subclasses)
def find_all_taxon(csv):
    """Return the taxon column names from the next row of ``csv``.

    Consumes one row from the iterator (expected to be the header) and
    returns everything after its first field.

    NOTE: this assumes every csv file shares the same columns.
    """
    return next(csv)[1:]
def invalid_taxon(parser, taxon_exist, all_taxon):
    """Check the requested taxa against the taxa available in a csv file.

    A requested taxon is accepted when ``taxon in taxon_exist`` holds. For a
    rejected taxon the outcome depends on the ``different_columns`` option
    (re-read from the command line through ``parser``): when set, a notice is
    printed and the taxon is counted; when unset, ``parser.error`` is called.

    Returns:
        ``True`` when every entry of ``all_taxon`` was rejected (which
        includes an empty ``all_taxon``), ``False`` otherwise.
    """
    options = parser.parse_args()
    rejected = 0  # how many requested taxa were not found
    for name in all_taxon:
        if name in taxon_exist:
            continue
        if options.different_columns:
            rejected += 1
            print("\"%s\" was ignored since it isn't a valid substring of a taxon name for this csv file" % (name))
        else:
            parser.error("-t, --taxon Taxon \"%s\" is not a valid substring of a taxon name" % (name))
    # Nothing usable is left when all requested taxa were rejected.
    return rejected == len(all_taxon)
# print out all of the taxon this program is searching for
def print_taxon(taxon_locations):
    """Print a numbered list (starting at 0) of the sub-taxa being searched.

    Each value of ``taxon_locations`` is unpacked as a two-item pair; only
    the second item of the pair is printed.
    """
    print("Finding the sub_taxon:")
    for position, (_, sub_taxon) in enumerate(taxon_locations.values()):
        print("%d. %s" % (position, sub_taxon))
def find_top(csv, taxon_locations):
    """Collect the images whose highest-valued column is a wanted one.

    For every row, columns 1 and up are compared as floats (column 0 holds
    the image path) and the first column holding the largest value wins. If
    that column index is a key of ``taxon_locations``, the image's base name
    is mapped to a one-element list holding the corresponding value.

    Returns:
        dict mapping image base name -> ``[taxon_locations[top_column]]``.
    """
    matched_images = {}
    for row in csv:
        best = 1  # column 1 is the starting candidate
        for column in range(2, len(row)):
            # Strict comparison keeps the earliest column on ties.
            if float(row[column]) > float(row[best]):
                best = column
        if best not in taxon_locations:
            continue
        matched_images[os.path.basename(row[0])] = [taxon_locations[best]]
    return matched_images
def find_above_prob(csv, taxon_locations, probability):
    """Collect the images with at least one value above ``probability``.

    For each row, position ``i`` of ``taxon_locations`` is paired with column
    ``i + 1`` of the row (column 0 holds the image path). Every position
    whose column value, read as a float, is strictly greater than
    ``probability`` contributes ``taxon_locations[i]`` to that image's list.

    Returns:
        dict mapping image base name -> list of matching ``taxon_locations``
        entries, in column order. Images with no match are absent.
    """
    matched_images = {}
    wanted = len(taxon_locations)
    for row in csv:
        for offset in range(wanted):
            if not float(row[offset + 1]) > probability:
                continue
            image = os.path.basename(row[0])
            matched_images.setdefault(image, []).append(taxon_locations[offset])
    return matched_images
def untar(tar, out_dir):
    """Extract every member of the tar archive ``tar`` into ``out_dir``.

    Args:
        tar: Path of the archive; opened with mode ``'r'``, so compressed
            archives are detected transparently by ``tarfile``.
        out_dir: Directory that receives the extracted files.
    """
    print("Unzipping %s into %s" % (tar, out_dir))
    # The context manager closes the archive even when extraction fails;
    # previously the handle was never closed.
    with tarfile.open(tar, 'r') as archive:
        # NOTE(review): extractall() trusts the member paths inside the
        # archive. If archives can come from an untrusted source, pass an
        # extraction filter (see the tarfile documentation).
        archive.extractall(out_dir)
def build_structure(path_array, taxon_locations, img=None, out_dir=None):
    """Place one image file into the directory of a taxon.

    The destination directory is ``out_dir + "/" + taxon_locations + "/"``
    and is created on demand. One file is placed per entry of ``path_array``:
    every entry but the last copies the image, the last one moves it. Each
    placement goes through ``add_unique_postfix`` so an existing file of the
    same name is never overwritten.

    Args:
        path_array: The matches recorded for the image; only its length is
            used. An empty list leaves the image (and the disk) untouched.
        taxon_locations: Name of the taxon sub-directory (a string).
        img: Path of the image to place. Defaults to the module-level ``img``
            set by the script's main loop, which is where this function
            originally read it from.
        out_dir: Root output directory. Defaults to the module-level
            ``out_dir`` for the same reason.

    NOTE(review): the destination depends only on ``taxon_locations``, not on
    the individual ``path_array`` entries, so multiple entries produce
    numbered duplicates in the same directory — confirm whether each entry
    was meant to select its own directory.
    """
    # Keep the historical behaviour for two-argument callers.
    if img is None:
        img = globals()["img"]
    if out_dir is None:
        out_dir = globals()["out_dir"]

    if not path_array:
        return

    dest_dir = out_dir + "/" + taxon_locations + "/"
    os.makedirs(dest_dir, exist_ok=True)  # once per call instead of once per entry

    last = len(path_array) - 1
    # enumerate() yields (index, item) pairs; binding the whole pair to the
    # loop variable made the old `i < len(path_array) - 1` test raise
    # TypeError as soon as there was more than one entry.
    for i, _ in enumerate(path_array):
        target = add_unique_postfix(dest_dir + os.path.basename(img))
        if i < last:
            shutil.copy2(img, target)
        else:  # the final placement moves the source instead of copying it
            shutil.move(img, target)
if __name__ == "__main__":
    # Main entry to pull_all.py.
    #
    # NOTE: `out_dir` and the loop variable `img` must stay module-level
    # names: build_structure() reads both as globals.
    v_string = "V2023.09.05"
    parser = get_parser()  # get command line arguments
    csvs, tars_dir, out_dir, file_dict, min_images, best, probability, different, strict = validate_args(parser)  # error check arguments further
    print(v_string)
    print(file_dict)

    # Read the taxon names from the header of the first csv file.
    with open(csvs[0]) as f:
        e_taxon = find_all_taxon(csv.reader(f))

    # this is the main processing part of the program
    for csv_file in csvs:  # loop through all of the csv files
        print("\nFinding images from", csv_file)
        with open(csv_file) as f:
            _csv = csv.reader(f)
            name_line = next(_csv)  # remove nameline
            # A csv reader can only be walked once; keep the rows in a list so
            # every taxon below sees all of them, not just the first taxon.
            rows = list(_csv)

        tar_file = file_dict[csv_file]
        untar(tar_file, out_dir)

        for taxon_locations in e_taxon:
            # NOTE(review): `taxon_locations` is a single column name here,
            # while find_above_prob() indexes its second argument by position
            # and compares against `probability`, which is None when
            # --best_taxon is used — confirm the intended call.
            matched_images = find_above_prob(rows, taxon_locations, probability)
            print(f"Found {len(matched_images)} images in {csv_file} for {taxon_locations}")
            print("Building file structure")
            for img in glob.glob(out_dir + f'/{taxon_locations}*.jpg'):  # loop through all the images in the untared directory
                path_array = matched_images.get(os.path.basename(img))
                if path_array is None:
                    # The image did not match; leave it where it is instead
                    # of failing with a KeyError.
                    continue
                build_structure(path_array, taxon_locations)  # move all of the images to the correct sub-directory

    print("----- Done -----")