# demo_depth.py (forked from geopavlakos/hamer)
from pathlib import Path
import argparse
import json
import os
from typing import Dict, Optional

import cv2
import numpy as np
import open3d as o3d
import torch
import matplotlib.pyplot as plt
from scipy.optimize import minimize

from mesh_to_sdf.depth_point_cloud import DepthPointCloud

from hamer.configs import CACHE_DIR_HAMER
from hamer.models import HAMER, download_models, load_hamer, DEFAULT_CHECKPOINT
from hamer.utils import recursive_to
from hamer.datasets.vitdet_dataset import ViTDetDataset, DEFAULT_MEAN, DEFAULT_STD
from hamer.utils.renderer import Renderer, cam_crop_to_full
from vitpose_model import ViTPoseModel

LIGHT_BLUE = (0.65098039, 0.74117647, 0.85882353)

def save_point_cloud_as_ply(vertices, filename, colors=None):
    """
    Save a point cloud (with optional RGB colors) as a PLY file using Open3D.

    Args:
        vertices (numpy.ndarray): Nx3 array of 3D points.
        filename (str): Path of the output PLY file (including the .ply extension).
        colors (numpy.ndarray, optional): Nx3 array of RGB colors (values in range [0, 255]).
            If None, saves the point cloud without colors.
    """
    # Create an Open3D PointCloud object
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(vertices)

    # Add colors if provided
    if colors is not None:
        if colors.shape[0] != vertices.shape[0]:
            raise ValueError("The number of color entries must match the number of vertices.")
        pcd.colors = o3d.utility.Vector3dVector(colors / 255.0)  # Normalize RGB to [0, 1]

    # Save the point cloud to a PLY file
    o3d.io.write_point_cloud(filename, pcd)
    print(f"Point cloud saved to '{filename}' using Open3D.")

def obj_function(x, vertices, translation, K1, K2, kd_tree):
    """
    Objective for fitting a new hand-mesh translation x: combines the 2D reprojection
    error between the HaMeR camera (K1, translation) and the depth camera (K2, x)
    with the mean 3D distance from the translated vertices to the depth point cloud
    (queried through kd_tree).
    """
    # projection 1: vertices under the HaMeR translation, projected with K1
    V1 = vertices + translation
    x1 = K1 @ V1.T
    x1[0, :] /= x1[2, :]
    x1[1, :] /= x1[2, :]

    # projection 2: vertices under the candidate translation x, projected with K2
    V2 = vertices + x
    x2 = K2 @ V2.T
    x2[0, :] /= x2[2, :]
    x2[1, :] /= x2[2, :]

    # 3D distances to the nearest depth points
    distances, indices = kd_tree.query(V2)
    distances = distances.astype(np.float32).reshape(-1)
    error_3d = np.mean(distances)

    # total error: 2D reprojection error plus weighted 3D error
    error_2d = np.square(x1[:2] - x2[:2]).mean()
    print('error 2d', error_2d, 'error_3d', error_3d)
    return error_2d + 100 * error_3d
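
# Both projections above use the standard pinhole model: for a 3D point V and translation t,
# the homogeneous image point is x = K @ (V + t), with pixel coordinates (x[0]/x[2], x[1]/x[2]).
# The factor of 100 on error_3d is a fixed weight trading off image-space consistency against
# metric agreement with the depth point cloud.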

def main():
    parser = argparse.ArgumentParser(description='HaMeR demo code')
    parser.add_argument('--checkpoint', type=str, default=DEFAULT_CHECKPOINT, help='Path to pretrained model checkpoint')
    parser.add_argument('--img_folder', type=str, default='images', help='Folder with input images')
    parser.add_argument('--out_folder', type=str, default='out_demo', help='Output folder to save rendered results')
    parser.add_argument('--side_view', dest='side_view', action='store_true', default=False, help='If set, render side view also')
    parser.add_argument('--full_frame', dest='full_frame', action='store_true', default=True, help='If set, render all people together also')
    parser.add_argument('--save_mesh', dest='save_mesh', action='store_true', default=False, help='If set, save meshes to disk also')
    parser.add_argument('--batch_size', type=int, default=1, help='Batch size for inference/fitting')
    parser.add_argument('--rescale_factor', type=float, default=2.0, help='Factor for padding the bbox')
    parser.add_argument('--body_detector', type=str, default='vitdet', choices=['vitdet', 'regnety'], help='Using regnety improves runtime and reduces memory')
    parser.add_argument('--file_type', nargs='+', default=['*.jpg', '*.png'], help='List of file extensions to consider')
    args = parser.parse_args()

    intrinsic_matrix = np.array([[574.0527954101562, 0.0, 319.5],
                                 [0.0, 574.0527954101562, 239.5],
                                 [0.0, 0.0, 1.0]])
    print(intrinsic_matrix)
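    # intrinsic_matrix follows the pinhole layout [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]:
    # fx = fy ≈ 574.05 with principal point (319.5, 239.5), i.e. a 640x480 sensor. These are
    # the intrinsics of the depth camera ("fetch camera" in the plot titles further below).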

    # Download and load checkpoints
    download_models(CACHE_DIR_HAMER)
    model, model_cfg = load_hamer(args.checkpoint)

    # Setup HaMeR model
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    model.eval()

    # Load detector
    from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy
    if args.body_detector == 'vitdet':
        from detectron2.config import LazyConfig
        import hamer
        cfg_path = Path(hamer.__file__).parent/'configs'/'cascade_mask_rcnn_vitdet_h_75ep.py'
        detectron2_cfg = LazyConfig.load(str(cfg_path))
        detectron2_cfg.train.init_checkpoint = "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h/f328730692/model_final_f05665.pkl"
        for i in range(3):
            detectron2_cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25
        detector = DefaultPredictor_Lazy(detectron2_cfg)
    elif args.body_detector == 'regnety':
        from detectron2 import model_zoo
        from detectron2.config import get_cfg
        detectron2_cfg = model_zoo.get_config('new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ.py', trained=True)
        detectron2_cfg.model.roi_heads.box_predictor.test_score_thresh = 0.5
        detectron2_cfg.model.roi_heads.box_predictor.test_nms_thresh = 0.4
        detector = DefaultPredictor_Lazy(detectron2_cfg)

    # keypoint detector
    cpm = ViTPoseModel(device)

    # Setup the renderer
    renderer = Renderer(model_cfg, faces=model.mano.faces)

    # Make output directory if it does not exist
    os.makedirs(args.out_folder, exist_ok=True)

    # Gather all demo images matching the requested extensions (.jpg / .png by default)
    img_paths = [img for end in args.file_type for img in Path(args.img_folder).glob(end)]

    # Iterate over all images in folder
    for img_path in img_paths:
        print(img_path)
        img_cv2 = cv2.imread(str(img_path))

        # load depth
        depth_path = str(img_path).replace('jpg', 'png')
        depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH).astype(np.float32)
        depth /= 1000.0
        print(np.min(depth), np.max(depth))
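        # Note: the depth PNG loaded above is assumed to store 16-bit depth in millimeters,
        # saved next to the RGB frame with the same name but a .png extension (the path is
        # derived by a plain 'jpg' -> 'png' string replacement, so RGB inputs should be .jpg).
        # Dividing by 1000 converts it to meters; the min/max print is a quick sanity check.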

        # Detect humans in image
        det_out = detector(img_cv2)
        img = img_cv2.copy()[:, :, ::-1]

        det_instances = det_out['instances']
        valid_idx = (det_instances.pred_classes == 0) & (det_instances.scores > 0.5)
        pred_bboxes = det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
        pred_scores = det_instances.scores[valid_idx].cpu().numpy()

        # Detect human keypoints for each person
        vitposes_out = cpm.predict_pose(
            img,
            [np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
        )
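
        # Assuming the COCO-WholeBody keypoint layout used by the HaMeR demo, the last 42 of
        # the 133 keypoints returned per person are the hand joints (21 left-hand followed by
        # 21 right-hand), which is what the [-42:-21] and [-21:] slices below rely on.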

        bboxes = []
        is_right = []

        # Use hands based on hand keypoint detections
        for vitposes in vitposes_out:
            left_hand_keyp = vitposes['keypoints'][-42:-21]
            right_hand_keyp = vitposes['keypoints'][-21:]

            # Reject low-confidence detections
            keyp = left_hand_keyp
            valid = keyp[:, 2] > 0.5
            if sum(valid) > 3:
                bbox = [keyp[valid, 0].min(), keyp[valid, 1].min(), keyp[valid, 0].max(), keyp[valid, 1].max()]
                bboxes.append(bbox)
                is_right.append(0)
            keyp = right_hand_keyp
            valid = keyp[:, 2] > 0.5
            if sum(valid) > 3:
                bbox = [keyp[valid, 0].min(), keyp[valid, 1].min(), keyp[valid, 0].max(), keyp[valid, 1].max()]
                bboxes.append(bbox)
                is_right.append(1)

        if len(bboxes) == 0:
            continue

        boxes = np.stack(bboxes)
        right = np.stack(is_right)

        # Run reconstruction on all detected hands
        dataset = ViTDetDataset(model_cfg, img_cv2, boxes, right, rescale_factor=args.rescale_factor)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)

        all_verts = []
        all_cam_t = []
        all_right = []

        for batch in dataloader:
            batch = recursive_to(batch, device)
            with torch.no_grad():
                out = model(batch)

            multiplier = (2 * batch['right'] - 1)
            pred_cam = out['pred_cam']
            pred_cam[:, 1] = multiplier * pred_cam[:, 1]
            box_center = batch["box_center"].float()
            box_size = batch["box_size"].float()
            img_size = batch["img_size"].float()
            multiplier = (2 * batch['right'] - 1)
            scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
            pred_cam_t_full = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length).detach().cpu().numpy()
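
            # cam_crop_to_full converts the weak-perspective camera predicted for each crop
            # into a translation in a virtual full-image perspective camera whose focal length
            # is scaled_focal_length; this full-image translation is what gets stored in
            # all_cam_t and later passed to obj_function as the HaMeR-side translation.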

            # Render the result
            batch_size = batch['img'].shape[0]
            for n in range(batch_size):
                # Get filename from path img_path
                img_fn, _ = os.path.splitext(os.path.basename(img_path))
                person_id = int(batch['personid'][n])
                white_img = (torch.ones_like(batch['img'][n]).cpu() - DEFAULT_MEAN[:, None, None] / 255) / (DEFAULT_STD[:, None, None] / 255)
                input_patch = batch['img'][n].cpu() * (DEFAULT_STD[:, None, None] / 255) + (DEFAULT_MEAN[:, None, None] / 255)
                input_patch = input_patch.permute(1, 2, 0).numpy()

                regression_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
                                          out['pred_cam_t'][n].detach().cpu().numpy(),
                                          batch['img'][n],
                                          mesh_base_color=LIGHT_BLUE,
                                          scene_bg_color=(1, 1, 1),
                                          )

                if args.side_view:
                    side_img = renderer(out['pred_vertices'][n].detach().cpu().numpy(),
                                        out['pred_cam_t'][n].detach().cpu().numpy(),
                                        white_img,
                                        mesh_base_color=LIGHT_BLUE,
                                        scene_bg_color=(1, 1, 1),
                                        side_view=True)
                    final_img = np.concatenate([input_patch, regression_img, side_img], axis=1)
                else:
                    final_img = np.concatenate([input_patch, regression_img], axis=1)

                cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_{person_id}.png'), 255 * final_img[:, :, ::-1])

                # Add all verts and cams to list
                verts = out['pred_vertices'][n].detach().cpu().numpy()
                is_right = batch['right'][n].cpu().numpy()
                verts[:, 0] = (2 * is_right - 1) * verts[:, 0]
                cam_t = pred_cam_t_full[n]
                all_verts.append(verts)
                all_cam_t.append(cam_t)
                all_right.append(is_right)

                # Save all meshes to disk
                if args.save_mesh:
                    camera_translation = cam_t.copy()
                    tmesh = renderer.vertices_to_trimesh(verts, camera_translation, LIGHT_BLUE, is_right=is_right)
                    tmesh.export(os.path.join(args.out_folder, f'{img_fn}_{person_id}.obj'))

        # Render front view
        if args.full_frame and len(all_verts) > 0:
            misc_args = dict(
                mesh_base_color=LIGHT_BLUE,
                scene_bg_color=(1, 1, 1),
                focal_length=scaled_focal_length,
            )
            print('len all verts', len(all_verts))
            cam_view = renderer.render_rgba_multiple(all_verts, cam_t=all_cam_t, render_res=img_size[n], is_right=all_right, **misc_args)

            # Overlay image
            input_img = img_cv2.astype(np.float32)[:, :, ::-1] / 255.0
            input_img = np.concatenate([input_img, np.ones_like(input_img[:, :, :1])], axis=2)  # Add alpha channel
            input_img_overlay = input_img[:, :, :3] * (1 - cam_view[:, :, 3:]) + cam_view[:, :, :3] * cam_view[:, :, 3:]
            cv2.imwrite(os.path.join(args.out_folder, f'{img_fn}_all.jpg'), 255 * input_img_overlay[:, :, ::-1])

            # get the hand points
            mask = cam_view[:, :, 3] > 0

            # erode the mask
            kernel = np.ones((5, 5), np.uint8)
            mask = cv2.erode(mask.astype(np.uint8), kernel)
            mask = 1 - mask
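
            # The alpha channel of the multi-hand rendering marks hand pixels. Eroding it trims
            # unreliable boundary pixels, and the inversion before passing it as target_mask
            # matches the mask convention DepthPointCloud appears to expect here (an assumption
            # based on this usage; check mesh_to_sdf.depth_point_cloud if points look wrong).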

            # convert depth to point cloud
            depth_pc = DepthPointCloud(depth, intrinsic_matrix, camera_pose=np.eye(4), target_mask=mask, threshold=10.0)
            print(depth_pc.points.shape)

            # solve new translation
            K = np.array([[12500, 0, 320], [0, 12500, 240], [0, 0, 1]]).astype(np.float32)
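            # K is the virtual full-image HaMeR camera: a focal length of 12500 equals
            # scaled_focal_length for a 640-pixel-wide image under the default config
            # (5000 / 256 * 640), and (320, 240) is the center of a 640x480 frame, so this
            # hard-coded matrix assumes that resolution.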
            x0 = np.mean(depth_pc.points, axis=0)
            print(x0.shape)
            res = minimize(obj_function, x0, method='nelder-mead',
                           args=(all_verts[-1], all_cam_t[-1], K, intrinsic_matrix, depth_pc.kd_tree), options={'xatol': 1e-8, 'disp': True})
            translation_new = res.x
            print(res.x)
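
            # The figure below is a visual check of the fit: panel 2 overlays the mesh vertices
            # projected with the HaMeR camera (K, all_cam_t[-1]), panel 3 uses the depth-camera
            # intrinsics with the optimized translation, and panel 4 compares the translated
            # mesh against the depth point cloud in 3D.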

            fig = plt.figure()
            ax = fig.add_subplot(2, 2, 1)
            plt.imshow(input_img)

            ax = fig.add_subplot(2, 2, 2)
            plt.imshow(input_img)
            # verify projection 1
            vertices = all_verts[-1] + all_cam_t[-1]
            print(K, vertices)
            print(vertices.shape)
            x2d = K @ vertices.T
            x2d[0, :] /= x2d[2, :]
            x2d[1, :] /= x2d[2, :]
            plt.plot(x2d[0, :], x2d[1, :])
            plt.title('projection using hamer camera')

            ax = fig.add_subplot(2, 2, 3)
            plt.imshow(input_img)
            # verify projection 2
            vertices = all_verts[-1] + translation_new
            x2d = intrinsic_matrix @ vertices.T
            x2d[0, :] /= x2d[2, :]
            x2d[1, :] /= x2d[2, :]
            plt.plot(x2d[0, :], x2d[1, :])
            plt.title('projection using fetch camera')

            ax = fig.add_subplot(2, 2, 4, projection='3d')
            ax.scatter(depth_pc.points[:, 0], depth_pc.points[:, 1], depth_pc.points[:, 2], marker='o')
            ax.scatter(vertices[:, 0], vertices[:, 1], vertices[:, 2], marker='o', color='r')
            ax.set_xlabel('X')
            ax.set_ylabel('Y')
            ax.set_zlabel('Z')

            # save the full depth point cloud and the fitted hand vertices as PLY files
            depth_pc = DepthPointCloud(depth, intrinsic_matrix, camera_pose=np.eye(4), target_mask=None, threshold=10.0)
            colors = np.zeros(depth_pc.points.shape)
            colors[:, 1] = 255
            save_point_cloud_as_ply(depth_pc.points, filename='depth.ply', colors=colors)

            colors = np.zeros(vertices.shape)
            colors[:, 0] = 255
            save_point_cloud_as_ply(vertices, filename='hand.ply', colors=colors)
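
            # The two PLY files (depth cloud in green, fitted hand vertices in red) can be
            # inspected together in any point-cloud viewer, e.g. with Open3D:
            #   o3d.visualization.draw_geometries([o3d.io.read_point_cloud('depth.ply'),
            #                                      o3d.io.read_point_cloud('hand.ply')])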

            plt.show()


if __name__ == '__main__':
    main()