Merge pull request #46 from MichiganCOG/dev
Dev
zeonzir committed Oct 25, 2019
2 parents 326c42c + a76fe1e commit 74776f2
Showing 34 changed files with 2,636 additions and 159 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -13,3 +13,4 @@ runs/*
models/HGC3D
*.json
pbs/*
*.pt
32 changes: 31 additions & 1 deletion README.md
@@ -1,4 +1,4 @@
# Video Platform for Recognition and Detection in Pytorch
# [Video Platform for Recognition and Detection in Pytorch](https://arxiv.org/abs/1910.02793)

A platform for quick and easy development of deep learning networks for recognition and detection in videos. Includes popular models like C3D and SSD.

@@ -9,13 +9,39 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki)
### Recognition
| Model Architecture | Dataset | ViP Accuracy (%) |
|:--------------------:|:------------------:|:---------------------:|
| I3D | HMDB51 (Split 1) | 72.75 |
| C3D | HMDB51 (Split 1) | 50.14 ± 0.777 |
| C3D | UCF101 (Split 1) | 80.40 ± 0.399 |

### Object Detection
| Model Architecture | Dataset | ViP Accuracy (%) |
|:--------------------:|:------------------:|:---------------------:|
| SSD300 | VOC2007 | 76.58 |

### Video Object Grounding
| Model Architecture | Dataset | ViP Accuracy (%) |
|:--------------------:|:------------------:|:---------------------:|
| DVSA (+fw, obj) | YC2-BB (Validation) | 30.09 |

**fw**: framewise weighting, **obj**: object interaction


## Citation

Please cite ViP when releasing any work that uses this platform: https://arxiv.org/abs/1910.02793

```
@article{ganesh2019vip,
title={ViP: Video Platform for PyTorch},
author={Ganesh, Madan Ravi and Hofesmann, Eric and Louis, Nathan and Corso, Jason},
journal={arXiv preprint arXiv:1910.02793},
year={2019}
}
```



## Table of Contents

* [Datasets](#configured-datasets)
@@ -38,12 +64,16 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki)
|[ImageNetVID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/download-videos-3j16.php) | Video Object Detection |
|[MSCOCO 2014](http://cocodataset.org/#download) | Object Detection, Keypoints|
|[VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/) | Object Detection, Classification|
|[YC2-BB](http://youcook2.eecs.umich.edu/download)| Video Object Grounding|
|[DHF1K](https://github.com/wenguanwang/DHF1K) | Video Saliency Prediction|

## Models
| Model | Task(s) |
|:------------------------------------------------:|:--------------------:|
|[C3D](https://github.com/jfzhang95/pytorch-video-recognition/blob/master/network/C3D_model.py) | Activity Recognition |
|[I3D](https://github.com/piergiaj/pytorch-i3d) | Activity Recognition |
|[SSD300](https://github.com/amdegroot/ssd.pytorch) | Object Detection |
|[DVSA (+fw, obj)](https://github.com/MichiganCOG/Video-Grounding-from-Text)| Video Object Grounding|

## Requirements

8 changes: 4 additions & 4 deletions config_default_example.yaml
@@ -1,25 +1,24 @@
# Preprocessing
clip_length: 16 # Number of frames within a clip
clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only)
clip_stride: 0 # Frame offset between successive frames
clip_stride: 1 # Frame offset between successive clips, must be >= 1
crop_shape: [112,112] # (Height, Width) of frame
crop_type: Random # Type of cropping operation (Random, Central and None)
final_shape: [112,112] # (Height, Width) of input to be given to CNN
num_clips: -1 # Number of clips to be generated from a video (<0: uniform sampling, 0: divide entire video into clips, >0: defines number of clips)
random_offset: 0 # Boolean switch to generate a clip length sized clip from a video
resize_shape: [128,171] # (Height, Width) to resize original data
sample_duration: 16 # Temporal size of video to be provided as input to the model
sample_size: 112 # Height of frame to be provided as input to the model
subtract_mean: '' # Subtract mean (R,G,B) from all frames during preprocessing

# Experiment Setup
acc_metric: Accuracy # Accuracy metric
batch_size: 3 # Number of videos in a mini-batch
batch_size: 15 # Number of videos in a mini-batch
dataset: HMDB51 # Name of dataset
debug: 0 # If True, do not plot, save, or create data files
epoch: 30 # Total number of epochs
exp: exp # Experiment name
gamma: 0.1 # Multiplier with which to change learning rate
grad_max_norm: 0 # Norm for gradient clipping
json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset
labels: 51 # Number of total classes in the dataset
load_type: train # Dataset split selection: training only, training and validation, or testing
@@ -37,3 +36,4 @@ rerun: 1 # Number of trials to repeat an experiment
save_dir: './results' # Path to results directory
seed: 999 # Seed for reproducibility
weight_decay: 0.0005 # Weight decay
resume: 0 # Flag to resume training or switch to alternate objective after loading
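
The comments above document each preprocessing and experiment key. As a quick illustration, here is a minimal sketch (not ViP's own loader) of reading this YAML with PyYAML and checking the clip-sampling keys described above; the file path is assumed to be the example config in the working directory:

```python
# Minimal sketch, not ViP code: load the example config and sanity-check
# the clip-sampling keys documented above. Assumes PyYAML is installed.
import yaml

with open('config_default_example.yaml', 'r') as f:
    cfg = yaml.safe_load(f)

# clip_stride is the frame offset between successive clips and must be >= 1
assert cfg['clip_stride'] >= 1

# num_clips: <0 uniform sampling, 0 divide the whole video into clips, >0 fixed count
print(cfg['clip_length'], cfg['num_clips'], cfg['batch_size'])
```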
113 changes: 113 additions & 0 deletions datasets/DHF1K.py
@@ -0,0 +1,113 @@
import torch
try:
    from .abstract_datasets import DetectionDataset
except ImportError:
    from abstract_datasets import DetectionDataset
import cv2
import os
import numpy as np
import json
try:
    import datasets.preprocessing_transforms as pt
except ImportError:
    import preprocessing_transforms as pt

class DHF1K(DetectionDataset):
    def __init__(self, *args, **kwargs):
        super(DHF1K, self).__init__(*args, **kwargs)

        # Get model object in case preprocessing other than default is used
        self.model_object = kwargs['model_obj']
        self.load_type = kwargs['load_type']

        print(self.load_type)
        if self.load_type == 'train':
            self.transforms = kwargs['model_obj'].train_transforms
        else:
            self.transforms = kwargs['model_obj'].test_transforms

    def __getitem__(self, idx):
        vid_info = self.samples[idx]

        base_path = vid_info['base_path']
        vid_size = vid_info['frame_size']

        input_data = []
        map_data = []
        bin_data = []

        for frame_ind in range(len(vid_info['frames'])):
            frame = vid_info['frames'][frame_ind]
            frame_path = frame['img_path']
            map_path = frame['map_path']
            bin_path = frame['bin_path']

            # Load frame, convert to RGB from BGR and normalize from 0 to 1
            input_data.append(cv2.imread(os.path.join(base_path, frame_path))[..., ::-1] / 255.)

            # Load annotation maps and normalize from 0 to 1
            # All frame channels have repeated values
            map_data.append(cv2.imread(map_path) / 255.)
            bin_data.append(cv2.imread(bin_path) / 255.)

        vid_data = self.transforms(input_data)

        # Annotations must be resized in the loss/metric
        map_data = torch.Tensor(map_data)
        bin_data = torch.Tensor(bin_data)

        # Permute from (Frame, Height, Width, Channel) to PyTorch's (Channel, Frame, Height, Width)
        vid_data = vid_data.permute(3, 0, 1, 2)
        map_data = map_data.permute(3, 0, 1, 2)
        bin_data = bin_data.permute(3, 0, 1, 2)
        # All channels are repeated so remove the unnecessary channels
        map_data = map_data[0].unsqueeze(0)
        bin_data = bin_data[0].unsqueeze(0)

        ret_dict = dict()
        ret_dict['data'] = vid_data

        annot_dict = dict()
        annot_dict['map'] = map_data
        annot_dict['bin'] = bin_data
        annot_dict['input_shape'] = vid_data.size()
        annot_dict['name'] = base_path
        ret_dict['annots'] = annot_dict

        return ret_dict


if __name__ == '__main__':

    class tts():
        def __call__(self, x):
            return pt.ToTensorClip()(x)

    class debug_model():
        def __init__(self):
            self.train_transforms = tts()

    json_path = '/path/to/DHF1K'  #### Change this when testing ####

    dataset = DHF1K(model_obj=debug_model(), json_path=json_path, load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1)
    train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    import matplotlib.pyplot as plt
    for x in enumerate(train_loader):
        dat = x[1]['data'][0, :, 0].permute(1, 2, 0).numpy()
        bin = x[1]['annots']['bin'][0, :, 0].permute(1, 2, 0).numpy().repeat(3, axis=2)
        map = x[1]['annots']['map'][0, :, 0].permute(1, 2, 0).numpy().repeat(3, axis=2)
        img = np.concatenate([dat, bin, map], axis=0)
        plt.imshow(img)
        plt.show()
        import pdb; pdb.set_trace()
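
For reference, a small standalone sketch (dummy tensors only, not ViP code) of the layout change performed in `__getitem__` above: the stacked frames are permuted from (Frame, Height, Width, Channel) to PyTorch's (Channel, Frame, Height, Width), and the saliency map and binary fixation annotations keep a single channel because all three loaded channels are identical:

```python
# Standalone illustration of the tensor layout change in DHF1K.__getitem__,
# using random data in place of real frames and annotations.
import torch

T, H, W = 16, 112, 112                   # example clip size only
vid_data = torch.rand(T, H, W, 3)        # stacked frames, (Frame, H, W, Channel)
map_data = torch.rand(T, H, W, 3)        # grayscale map repeated across 3 channels

vid_data = vid_data.permute(3, 0, 1, 2)  # -> (Channel, Frame, H, W)
map_data = map_data.permute(3, 0, 1, 2)
map_data = map_data[0].unsqueeze(0)      # keep one channel -> (1, Frame, H, W)

print(vid_data.shape)                    # torch.Size([3, 16, 112, 112])
print(map_data.shape)                    # torch.Size([1, 16, 112, 112])
```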
5 changes: 3 additions & 2 deletions datasets/HMDB51.py
@@ -40,8 +40,9 @@ def __getitem__(self, idx):
        base_path = vid_info['base_path']

        input_data = []
        vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1
        labels = np.zeros((self.clip_length))-1
        vid_length = len(vid_info['frames'])
        vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1
        labels = np.zeros((vid_length))-1
        input_data = []

        for frame_ind in range(len(vid_info['frames'])):
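
The change above sizes the preallocated video and label arrays by the actual number of annotated frames (`vid_length`) instead of the configured `clip_length`, padding with -1. A small sketch of that pattern on a dummy annotation dict:

```python
# Sketch of the new preallocation: label arrays follow the length of the video
# annotation rather than clip_length, with -1 marking unlabeled frames.
# The annotation dict below is a dummy stand-in for vid_info.
import numpy as np

vid_info = {'frames': [{'actions': [{'action_class': 7}]} for _ in range(40)]}

vid_length = len(vid_info['frames'])
labels = np.zeros((vid_length)) - 1

for frame_ind, frame in enumerate(vid_info['frames']):
    for frame_labels in frame['actions']:
        labels[frame_ind] = frame_labels['action_class']

print(labels.shape)  # (40,) -- sized by the video, not by clip_length
```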
10 changes: 6 additions & 4 deletions datasets/ImageNetVID.py
@@ -42,10 +42,12 @@ def __getitem__(self, idx):
        vid_size = vid_info['frame_size']

        input_data = []
        vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1
        bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1
        labels = np.zeros((self.clip_length, self.max_objects))-1
        occlusions = np.zeros((self.clip_length, self.max_objects))-1

        vid_length = len(vid_info['frames'])
        vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1
        bbox_data = np.zeros((vid_length, self.max_objects, 4))-1
        labels = np.zeros((vid_length, self.max_objects))-1
        occlusions = np.zeros((vid_length, self.max_objects))-1

79 changes: 79 additions & 0 deletions datasets/KTH.py
@@ -0,0 +1,79 @@
import torch
from .abstract_datasets import RecognitionDataset
from PIL import Image
import cv2
import os
import numpy as np
from torchvision import transforms

class KTH(RecognitionDataset):
    def __init__(self, *args, **kwargs):
        """
        Initialize KTH class
        Args:
            load_type    (String): Select training or testing set
            resize_shape (List):   [Int, Int] desired height and width to resize input
            crop_shape   (List):   [Int, Int] desired height and width to crop input
            final_shape  (List):   [Int, Int] desired height and width of input to deep network
            preprocess   (String): Keyword to select different preprocessing types
        Return:
            None
        """
        super(KTH, self).__init__(*args, **kwargs)

        self.load_type = kwargs['load_type']
        self.resize_shape = kwargs['resize_shape']
        self.crop_shape = kwargs['crop_shape']
        self.final_shape = kwargs['final_shape']
        self.preprocess = kwargs['preprocess']

        if self.load_type == 'train':
            self.transforms = kwargs['model_obj'].train_transforms
        else:
            self.transforms = kwargs['model_obj'].test_transforms

    def __getitem__(self, idx):
        vid_info = self.samples[idx]
        base_path = vid_info['base_path']

        input_data = []

        vid_length = len(vid_info['frames'])
        vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1
        labels = np.zeros((vid_length))-1

        for frame_ind in range(len(vid_info['frames'])):
            frame_path = os.path.join(base_path, vid_info['frames'][frame_ind]['img_path'])

            for frame_labels in vid_info['frames'][frame_ind]['actions']:
                labels[frame_ind] = frame_labels['action_class']

            # Load frame image data and preprocess image accordingly
            input_data.append(cv2.imread(frame_path)[..., ::-1] / 1.)

        # Preprocess data
        vid_data = self.transforms(input_data)
        labels = torch.from_numpy(labels).float()

        # Permute from (Frame, Height, Width, Channel) to PyTorch's (Channel, Frame, Height, Width)
        vid_data = vid_data.permute(3, 0, 1, 2)

        ret_dict = dict()
        ret_dict['data'] = vid_data

        annot_dict = dict()
        annot_dict['labels'] = labels

        ret_dict['annots'] = annot_dict

        return ret_dict


#dataset = HMDB51(json_path='/z/dat/HMDB51', dataset_type='train', clip_length=100, num_clips=0)
#dat = dataset.__getitem__(0)
#import pdb; pdb.set_trace()
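
Below is a hypothetical smoke test for the new KTH loader, modeled on the DHF1K `__main__` block earlier in this commit. The import path, `json_path`, and keyword arguments are illustrative assumptions rather than a verified ViP invocation:

```python
# Hypothetical smoke test for the KTH loader (assumptions, not ViP code):
# mirrors the DHF1K test block in this commit. Run from the repository root
# so the datasets package resolves; json_path is a placeholder.
import numpy as np
import torch
from datasets.KTH import KTH

class debug_model():
    def __init__(self):
        # Stand-in transform: stack the list of frames into a float tensor
        self.train_transforms = lambda frames: torch.from_numpy(np.stack(frames)).float()

json_path = '/path/to/KTH'  # placeholder, point at the generated KTH json annotations

dataset = KTH(model_obj=debug_model(), json_path=json_path, load_type='train',
              clip_length=16, clip_offset=0, clip_stride=1, num_clips=0,
              random_offset=0, resize_shape=[128, 171], crop_shape=[112, 112],
              crop_type='Center', final_shape=[112, 112], batch_size=1, preprocess='')
train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False)

for batch in train_loader:
    print(batch['data'].shape, batch['annots']['labels'].shape)
    break
```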
13 changes: 7 additions & 6 deletions datasets/MSCOCO.py
@@ -1,6 +1,6 @@
import torch
from .abstract_datasets import DetectionDataset
from PIL import Image
import cv2
import os
import numpy as np
import datasets.preprocessing_transforms as pt
@@ -34,10 +34,11 @@ def __getitem__(self, idx):
        vid_size = vid_info['frame_size']

        input_data = []
        vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1
        bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1
        labels = np.zeros((self.clip_length, self.max_objects))-1
        iscrowds = np.zeros((self.clip_length, self.max_objects))-1
        vid_length = len(vid_info['frames'])
        vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1
        bbox_data = np.zeros((vid_length, self.max_objects, 4))-1
        labels = np.zeros((vid_length, self.max_objects))-1
        iscrowds = np.zeros((vid_length, self.max_objects))-1

@@ -62,7 +63,7 @@ def __getitem__(self, idx):
                iscrowds[frame_ind, trackid] = iscrowd

            input_data.append(Image.open(os.path.join(base_path, frame_path)))
            input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1])

        vid_data, bbox_data = self.transforms(input_data, bbox_data)

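The MSCOCO change above replaces `Image.open` with `cv2.imread`; because OpenCV loads images in BGR channel order, the `[...,::-1]` slice reverses the last axis back to RGB. A small sketch of that equivalence (the image path is a placeholder):

```python
# Sketch of the BGR -> RGB reversal used above; 'example.jpg' is a placeholder path.
import cv2
import numpy as np

bgr = cv2.imread('example.jpg')  # OpenCV returns an (H, W, 3) array in BGR order
rgb = bgr[..., ::-1]             # reverse the channel axis -> RGB

# Same result as OpenCV's explicit colour conversion
assert np.array_equal(rgb, cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
```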