depth_estimation
# Take an input video stream and generate a depth estimation of it
# using Intel's DPT-Hybrid MiDaS model from Hugging Face
import os
import torch
from transformers import DPTImageProcessor, DPTForDepthEstimation
import cv2
import numpy as np
from PIL import Image
model_name = "Intel/dpt-hybrid-midas"
model_dir = "/models" # Change this to the directory where the volume is mounted
# Create the models directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)
# Check if the model exists in the model directory
if not os.path.exists(os.path.join(model_dir, model_name)):
    # If not, download the model
    model = DPTForDepthEstimation.from_pretrained(model_name)
    processor = DPTImageProcessor.from_pretrained(model_name)
    # Save the model and processor
    model.save_pretrained(os.path.join(model_dir, model_name))
    processor.save_pretrained(os.path.join(model_dir, model_name))
else:
    # If it does, load the model and processor from the directory
    model = DPTForDepthEstimation.from_pretrained(os.path.join(model_dir, model_name))
    processor = DPTImageProcessor.from_pretrained(os.path.join(model_dir, model_name))
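
# Note (added): transformers also caches downloads under HF_HOME by default;
# the explicit save/load above is mainly useful when model_dir is a persistent
# volume, e.g. one mounted into a container, as the comment above suggests.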
# Get the current working directory
current_directory = os.getcwd()
# Construct the path to the video file
video_path = os.path.join(current_directory, "c_b407_video.mp4")
# Now you can use video_path to access the video file
print("Video path:", video_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
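# Added as standard inference practice (not in the original script): switch
# the model to eval mode so layers such as dropout behave deterministically.
model.eval()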
print("CUDA available:", torch.cuda.is_available())
# Open the video file
cap = cv2.VideoCapture(video_path)
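# Defensive check (not in the original script): fail fast if the video could
# not be opened, e.g. because of a wrong path or missing codec support.
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")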
# Get video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter("output.mp4", fourcc, fps, (width, height))
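# Sanity check (added): VideoWriter can silently produce an empty file when
# the requested codec is unavailable, so verify that it actually opened.
if not out.isOpened():
    raise IOError("Could not open VideoWriter for output.mp4")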
current_frame = 0
while cap.isOpened():
    # Capture frame-by-frame
    ret, frame = cap.read()
    if not ret:
        break
    # Convert frame to PIL Image (OpenCV uses BGR, PIL expects RGB)
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # Prepare image for the model and move the tensors to the model's device
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        # Get depth prediction
        outputs = model(**inputs)
        predicted_depth = outputs.predicted_depth
    # Interpolate to original size
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=image.size[::-1],
        mode="bicubic",
        align_corners=False,
    )
    # Move prediction tensor to CPU for visualization
    prediction = prediction.cpu()
    # Visualize the prediction: normalize depth to 0-255 grayscale
    output = prediction.squeeze().numpy()
    formatted = (output * 255 / np.max(output)).astype("uint8")
    depth = cv2.cvtColor(formatted, cv2.COLOR_GRAY2BGR)
    # Write the depth frame to the output video
    out.write(depth)
    current_frame += 1
    if current_frame % 10 == 0:  # Print every 10 frames to reduce overhead
        print(f'Processed frame {current_frame} of {frame_count}')
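
# Optional variation (not in the original): a false-color depth map is often
# easier to read than grayscale; inside the loop, cv2.applyColorMap could
# replace the GRAY2BGR conversion, e.g.:
#   depth = cv2.applyColorMap(formatted, cv2.COLORMAP_INFERNO)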
# Release the video file and writer object, and close windows
cap.release()
out.release()
cv2.destroyAllWindows()
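
# Usage sketch (assuming the script is saved as depth_estimation.py and
# c_b407_video.mp4 sits in the working directory):
#   pip install torch transformers opencv-python numpy pillow
#   python depth_estimation.py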