# models.py

import os

import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import models, transforms

# Use the MPS backend when it is actually usable on this machine, otherwise
# fall back to the CPU. torch.backends.mps.is_available() is the supported
# query; the torch.has_mps attribute is deprecated and only reflects how the
# PyTorch binary was built.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Masking model: a DeepLabV3-ResNet50 built without any downloaded weights and
# then populated from a local checkpoint file.
deeplab_model = models.segmentation.deeplabv3_resnet50(weights=None, weights_backbone=None)
# map_location="cpu" deserialises every tensor on the CPU regardless of the
# device the checkpoint was saved from; the model is moved to `device` below.
state_dict = torch.load(
    '/dtu-compute/sciofsci/other/phd_temp/fashion/deeplabv3_resnet50_coco-cd0a2569.pth',
    map_location="cpu",
)
# Drop the auxiliary-classifier entries before loading. Presumably the model
# built above has no aux_classifier module, so a strict load_state_dict would
# reject those keys -- confirm against the torchvision version in use.
filtered_state_dict = {k: v for k, v in state_dict.items() if 'aux_classifier' not in k}
deeplab_model.load_state_dict(filtered_state_dict)
deeplab_model = deeplab_model.to(device)
# Inference only: put the module in evaluation mode.
deeplab_model.eval()

# Embedding model: a MobileNetV2 built without any downloaded weights and then
# populated from a local checkpoint file.
mobilenet_model = models.mobilenet_v2(weights=None)
# map_location="cpu" deserialises every tensor on the CPU regardless of the
# device the checkpoint was saved from; the model is moved to `device` below.
state_dict = torch.load(
    '/dtu-compute/sciofsci/other/phd_temp/fashion/mobilenet_v2-b0353104.pth',
    map_location="cpu",
)
mobilenet_model.load_state_dict(state_dict)
# Replace the classification head with a pass-through so the model returns the
# features that would otherwise be fed to the classifier.
mobilenet_model.classifier = torch.nn.Identity()
mobilenet_model = mobilenet_model.to(device)
# Inference only: put the module in evaluation mode.
mobilenet_model.eval()

# Input pipeline shared by both models: convert the image to a tensor, resize
# it to a fixed 224x224 grid, then normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet channel
# statistics -- confirm they match what the loaded checkpoints expect.
preprocess = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((224, 224)),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


# Refine the mask by applying morphological operations
def refine_mask(mask):
    """Clean up a mask and soften its edges.

    The mask is scaled by 255 and cast to uint8, passed through a
    morphological closing and then an opening (5x5 kernel, two iterations
    each), and finally blurred with a 7x7 Gaussian.

    Args:
        mask: NumPy array; assumed to hold 0/1 values so that the scaling
            yields 0/255 -- confirm for callers other than remove_background.

    Returns:
        The refined uint8 mask.
    """
    structuring_element = np.ones((5, 5), np.uint8)
    refined = (mask * 255).astype(np.uint8)

    # Closing first, then opening, each with the same kernel and iteration count.
    for morph_op in (cv2.MORPH_CLOSE, cv2.MORPH_OPEN):
        refined = cv2.morphologyEx(refined, morph_op, structuring_element, iterations=2)

    return cv2.GaussianBlur(refined, (7, 7), 0)

def keep_largest_connected_component(mask):
    """Keep only the largest connected foreground region of ``mask``.

    Args:
        mask: Single-channel uint8 mask accepted by ``cv2.connectedComponents``
            (non-zero pixels are foreground).

    Returns:
        ``mask`` unchanged when it holds at most one foreground component;
        otherwise a new uint8 array that is 1 inside the largest component and
        0 elsewhere. When several components share the largest size, the one
        with the lowest label wins.
    """
    num_labels, labels_im = cv2.connectedComponents(mask)

    if num_labels > 2:  # More than one person (background counts as one label)
        print(f"Found {num_labels - 1} people in the image. Keeping only the largest connected component.")
        # A single pass over the label image yields every component's pixel
        # count; entry 0 is the background label and is skipped.
        component_sizes = np.bincount(labels_im.ravel(), minlength=num_labels)[1:]
        # argmax returns the first maximum; +1 undoes the background offset.
        largest_component = int(np.argmax(component_sizes)) + 1

        # Create a mask with only the largest component
        mask = (labels_im == largest_component).astype(np.uint8)

    return mask

# Merge two bounding boxes into one
def merge_boxes(box1, box2):
    """Return the smallest ``(x, y, w, h)`` box that encloses both inputs.

    Args:
        box1: First box as ``(x, y, w, h)``.
        box2: Second box as ``(x, y, w, h)``.

    Returns:
        A tuple ``(x, y, w, h)`` covering both boxes.
    """
    left_a, top_a, width_a, height_a = box1
    left_b, top_b, width_b, height_b = box2

    left = min(left_a, left_b)
    top = min(top_a, top_b)
    right = max(left_a + width_a, left_b + width_b)
    bottom = max(top_a + height_a, top_b + height_b)

    return (left, top, right - left, bottom - top)


def check_for_multiple_people_using_contours(mask, save=False):
    """Count the external contours of ``mask`` and report whether there are several.

    Each external contour is treated as one person.

    Args:
        mask: Single-channel uint8 mask; assumed to hold 0/1 values, since it
            is multiplied by 255 for the debug image.
        save: When true, write the mask with its contours drawn in green to
            ``temp/contours.png``.

    Returns:
        A tuple ``(multiple, num_people)`` where ``multiple`` is True only when
        more than one contour was found.
    """
    # Find contours in the binary mask
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Count the number of contours (each contour should represent a separate person)
    num_people = len(contours)

    # Plot the contours
    if save:
        # Make sure the output directory exists before writing the debug image.
        os.makedirs('temp', exist_ok=True)
        image_contours = cv2.cvtColor(mask * 255, cv2.COLOR_GRAY2BGR)
        cv2.drawContours(image_contours, contours, -1, (0, 255, 0), 2)
        cv2.imwrite('temp/contours.png', image_contours)

    if num_people > 1:
        print(f"Found {num_people} people in the image.")
        return True, num_people

    if num_people == 1:
        print("Only one person found in the image.")
    else:
        # An empty mask has no contours; report that instead of "one person".
        print("No person found in the image.")
    return False, num_people


# Remove background and keep only the largest person or group
def remove_background(image, save=False, threshold=0.5):
    """Return an RGBA copy of ``image`` whose alpha channel is a person mask.

    The image is segmented with ``deeplab_model``; pixels predicted as the
    person class with a probability above ``threshold`` form the mask. Only
    the largest connected region is kept, the mask is refined, and the result
    is stored in the alpha channel. The colour channels are left untouched.

    Args:
        image: RGB image as a NumPy array (it is converted with
            ``cv2.COLOR_RGB2RGBA``); presumably uint8 of shape (H, W, 3).
        save: When true, write intermediate debug images below ``temp/``.
        threshold: Minimum person-class probability for a pixel to be kept.

    Returns:
        The RGBA image with the refined mask as its alpha channel.
    """
    person_class = 15  # channel of the segmentation output treated as "person"

    image_pil = Image.fromarray(image)
    input_tensor = preprocess(image_pil).unsqueeze(0).to(device)

    with torch.no_grad():
        output = deeplab_model(input_tensor)['out']
        scores = torch.nn.functional.softmax(output, dim=1)
        person_scores = scores[:, person_class, :, :]
        # A pixel is kept only if the person class wins the argmax AND its
        # probability clears the threshold.
        output_predictions = (scores.argmax(1) == person_class) & (person_scores > threshold)

    if save:
        # Make sure the output directory exists before writing debug images.
        os.makedirs('temp', exist_ok=True)
        # Save the person-class probability map, i.e. the scores the mask is
        # derived from.
        score_map = person_scores[0].cpu().numpy()
        score_map = cv2.resize(score_map, (image_pil.width, image_pil.height))
        cv2.imwrite('temp/scores.png', score_map * 255)

    # Bring the model-resolution mask back to the size of the input image.
    mask = output_predictions[0].cpu().numpy().astype(np.uint8)
    mask = cv2.resize(mask, (image_pil.width, image_pil.height))
    if save:
        cv2.imwrite('temp/mask.png', mask * 255)

    # Only used for its printed report (and contour image when saving).
    check_for_multiple_people_using_contours(mask, save)

    mask = keep_largest_connected_component(mask)
    if save:
        cv2.imwrite('temp/mask_largest.png', mask * 255)

    image_rgba = cv2.cvtColor(image, cv2.COLOR_RGB2RGBA)
    r_mask = refine_mask(mask)
    if save:
        cv2.imwrite('temp/mask_refined.png', r_mask)
    image_rgba[:, :, 3] = r_mask  # Apply the refined mask to the alpha channel
    if save:
        cv2.imwrite('temp/masked_image.png', cv2.cvtColor(image_rgba, cv2.COLOR_RGBA2BGRA))
    return image_rgba

def get_embedding(image):
    """Compute an embedding for ``image`` with ``mobilenet_model``.

    Args:
        image: Image as a NumPy array accepted by ``Image.fromarray``.

    Returns:
        The model output for the single-image batch, squeezed and converted to
        a NumPy array on the CPU.
    """
    pil_image = Image.fromarray(image)
    # Add a leading batch dimension and move the tensor to the model's device.
    batch = preprocess(pil_image).unsqueeze(0)
    batch = batch.to(device)

    with torch.no_grad():
        features = mobilenet_model(batch)

    features = features.squeeze()
    return features.cpu().numpy()


if __name__ == "__main__":
    # Debug images are written below temp/; make sure the directory exists,
    # otherwise the writes fail without raising.
    os.makedirs('temp', exist_ok=True)

    # Load an image
    #image_path = 'data/images_sample/145.jpg'
    #image_path = 'data/images_sample/124141.jpg'
    # Get a random image from the dataset
    image_path = f'data/images_sample/{np.random.choice(range(1, 5000))}.jpg'
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # `image` is already in the BGR order cv2.imwrite expects.
    cv2.imwrite('temp/before_mask.png', image)

    # Remove the background
    image_without_bg = remove_background(image_rgb, save=True, threshold=0.3)

    # Save the result to check the masking
    cv2.imwrite("temp/after_mask.png", cv2.cvtColor(image_without_bg, cv2.COLOR_RGBA2BGRA))

    # Drop the alpha channel to get an RGB image for embedding extraction.
    # NOTE(review): remove_background only writes the alpha channel, so the
    # embedding below is computed on the unmasked pixels -- confirm this is
    # intended.
    image_rgb_no_alpha = cv2.cvtColor(image_without_bg, cv2.COLOR_RGBA2RGB)

    # Generate embeddings
    embedding = get_embedding(image_rgb_no_alpha)

    print("Embedding shape:", embedding.shape)