-
Notifications
You must be signed in to change notification settings - Fork 5
/
tinyclick_utils.py
60 lines (45 loc) · 1.45 KB
/
tinyclick_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from PIL import Image
import re
def prepare_inputs(image_path, text, processor):
    """Load an image and build the processor encoding for a text command.

    Args:
        image_path: Location of the image; passed straight to
            ``PIL.Image.open``.
        text: The command. It is stripped, prefixed with the fixed prompt
            ``"What to do to execute the command? "`` and lower-cased
            before being handed to the processor.
        processor: Callable invoked with ``images=``, ``text=``,
            ``return_tensors="pt"`` and ``do_resize=True``. Its return
            value must support item assignment.

    Returns:
        The processor's result with an extra ``"image_size"`` entry holding
        the ``size`` attribute of the loaded image (``(width, height)`` for
        PIL images), taken before any resizing done by the processor.
    """
    # Use the image as a context manager so the underlying file handle is
    # released deterministically; ``convert`` returns an independent copy
    # that stays valid after the source image is closed.
    with Image.open(image_path) as source:
        img = source.convert("RGB")
    size = img.size

    input_text = ("What to do to execute the command? " + text.strip()).lower()

    encoding = processor(
        images=img,
        text=input_text,
        return_tensors="pt",
        do_resize=True,
    )
    # Keep the original size alongside the tensors so that predicted
    # locations can later be rescaled to image coordinates.
    encoding["image_size"] = size

    return encoding
def postprocess(text: str, image_size: tuple[int, int]) -> dict:
    """Decode a model generation into an action dictionary.

    Only the ``click`` action is recognised. The generation must contain
    ``</s><s>`` followed by the action token and at least one ``<loc_N>``
    token; the click position is read from the first pair of adjacent
    ``<loc_X><loc_Y>`` tokens found anywhere in ``text``.

    Args:
        text: Single generated sample.
        image_size: Size of the corresponding image. ``image_size[0]``
            scales the first location value and ``image_size[1]`` the
            second (presumably ``(width, height)`` -- confirm with caller).

    Returns:
        ``{"action": "click", "click_point": (x, y)}`` on success, where
        each location value is treated as thousandths of the matching image
        dimension and truncated to an int.
        ``{"action": None, "click_point": (0, 0)}`` when the text does not
        match or the action is not ``click``.
        ``{"action": "click", "click_point": (0, 0)}`` when a click was
        recognised but no coordinate pair could be decoded or rescaled.
    """
    pattern = r"</s><s>(<[^>]+>|[^<\s]+)\s*([^<]*?)(<loc_\d+>.*)"
    point_pattern = r"<loc_(\d+)><loc_(\d+)>"
    fallback_point = (0, 0)

    match = re.search(pattern, text)
    if match is None or (action := match.group(1)) != "click":
        return {
            "action": None,
            "click_point": fallback_point,
        }

    result = {
        "action": action,
        "click_point": fallback_point,
    }

    # A lone <loc_N> satisfies ``pattern`` but not ``point_pattern``; check
    # for that explicitly instead of relying on an IndexError.
    point_match = re.search(point_pattern, text)
    if point_match is None:
        return result

    try:
        x_loc, y_loc = (int(value) for value in point_match.groups())
        result["click_point"] = (
            int((x_loc / 1000) * image_size[0]),
            int((y_loc / 1000) * image_size[1]),
        )
    except (LookupError, TypeError, ValueError, OverflowError):
        # Best effort: an unusable ``image_size`` (missing, too short,
        # non-numeric) or an absurdly large location value leaves the
        # fallback point in place rather than raising.
        pass

    return result