From 510833a221f891d638d21f4606fb88deae3a98dd Mon Sep 17 00:00:00 2001
From: Favyen Bastani
Date: Mon, 6 Dec 2021 22:24:25 -0500
Subject: [PATCH] Initial commit

---
 README.md                     | 112 ++++++++
 geom.py                       | 335 +++++++++++++++++++++++
 infer.py                      | 349 ++++++++++++++++++++++++
 model.py                      | 489 ++++++++++++++++++++++++++++++++++
 scripts/filter_short.py       |  29 ++
 scripts/filter_small.py       |  37 +++
 scripts/interpolate.py        |  46 ++++
 scripts/json2mot.py           |  45 ++++
 scripts/mot2json.py           |  49 ++++
 scripts/pathtrack.py          |  39 +++
 scripts/preprocess-info.py    | 158 +++++++++++
 scripts/preprocess-matches.go | 212 +++++++++++++++
 scripts/symlink.py            |  21 ++
 scripts/ytw-extract.py        |  16 ++
 scripts/ytw-maskrcnn.py       |  84 ++++++
 train.py                      | 307 +++++++++++++++
 16 files changed, 2328 insertions(+)
 create mode 100644 README.md
 create mode 100644 geom.py
 create mode 100644 infer.py
 create mode 100644 model.py
 create mode 100644 scripts/filter_short.py
 create mode 100644 scripts/filter_small.py
 create mode 100644 scripts/interpolate.py
 create mode 100644 scripts/json2mot.py
 create mode 100644 scripts/mot2json.py
 create mode 100644 scripts/pathtrack.py
 create mode 100644 scripts/preprocess-info.py
 create mode 100644 scripts/preprocess-matches.go
 create mode 100644 scripts/symlink.py
 create mode 100644 scripts/ytw-extract.py
 create mode 100644 scripts/ytw-maskrcnn.py
 create mode 100644 train.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3d3bbb5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,112 @@
+Self-Supervised Multi-Object Tracking with Cross-Input Consistency
+------------------------------------------------------------------
+
+UNS20 is the code for "Self-Supervised Multi-Object Tracking with Cross-Input
+Consistency" (NeurIPS 2021), an approach for training a robust multi-object
+tracking model using only an object detector and a large corpus of unlabeled
+video.
+
+
+Installation
+------------
+
+Requires TensorFlow 1.15:
+
+    pip install 'tensorflow<2.0' scikit-image
+
+Download the MOT17 dataset:
+
+    mkdir /home/ubuntu/data/
+    wget https://motchallenge.net/data/MOT17.zip
+    unzip MOT17.zip
+    mv MOT17 /home/ubuntu/data/mot17/
+
+Download the UNS20 model:
+
+    wget https://favyen.com/files/uns20-model.zip
+    unzip uns20-model.zip
+    mv model/ /home/ubuntu/model/
+
+
+Inference
+---------
+
+For SDP detections:
+
+    cd /path/to/uns20/
+    python scripts/mot2json.py /home/ubuntu/data/ test
+    python infer.py /home/ubuntu/model/model /home/ubuntu/data/
+
+DPM and FRCNN detections have lower accuracy than SDP detections. Recent
+methods universally apply regression and classification pre-processing steps:
+classification prunes incorrect input detections, while regression refines the
+bounding box coordinates. These steps are conceptually questionable, since they
+use a better detector to improve lower-quality detections; however, they are
+needed to achieve performance comparable with other methods, since all recent
+methods apply them.
+
+To apply UNS20 to DPM and FRCNN detections, run it after the regression and
+classification pre-processing steps from https://github.com/phil-bergmann/tracking_wo_bnw.
+
+For the most informative comparison, we highly recommend comparing performance
+only on the SDP detections, which have the highest accuracy. Evaluating on
+lower-quality detections may sound useful, but it mostly measures the
+pre-processing steps rather than the tracking method itself.
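+
+The tracker writes its output for each sequence to det/uns20.json (OUT_PATH in
+infer.py): a JSON array indexed by frame, where each entry is a (possibly null)
+list of detections annotated with a track_id. A rough sketch of reading one such
+file, using an example sequence path under the layout above:
+
+    import json
+    # example sequence; any MOT17 test sequence works
+    path = '/home/ubuntu/data/mot17/test/MOT17-01-SDP/det/uns20.json'
+    with open(path, 'r') as f:
+        detections = json.load(f)  # list indexed by frame; entries may be null
+    for frame_idx, dlist in enumerate(detections):
+        for d in (dlist or []):
+            print(frame_idx, d['track_id'], d['left'], d['top'], d['right'], d['bottom'])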
+ + +Evaluation +---------- + +Convert from JSON to the TXT format: + + mkdir /home/ubuntu/outputs/ + python scripts/json2mot.py /home/ubuntu/data/ train /home/ubuntu/outputs/ + +Compare: + + pip install motmetrics + python -m motmetrics.apps.eval_motchallenge /home/ubuntu/data/mot17/train/ /home/ubuntu/outputs/ + + +Training +-------- + +First, obtain PathTrack and YT-Walking datasets: + + wget https://data.vision.ee.ethz.ch/daid/MOT/pathtrack_release_v1.0.zip + wget https://favyen.com/files/yt-walking.zip + mkdir /home/ubuntu/data/yt-walking/ + unzip yt-walking.zip -d /home/ubuntu/data/yt-walking/ + mkdir /home/ubuntu/data/pathtrack/ + unzip pathtrack_release_v1.0.zip + mv pathtrack_release /home/ubuntu/data/pathtrack/ + +Extract video frames from YT-Walking mp4 files: + + python scripts/ytw-extract.py /home/ubuntu/data/ + +Convert MOT17 object detections to uniform JSON format: + + python scripts/mot2json.py /home/ubuntu/data/ train + python scripts/mot2json.py /home/ubuntu/data/ test + +Convert PathTrack object detections to uniform JSON format: + + python scripts/pathtrack.py /home/ubuntu/data/ + +Normalize MOT17 and PathTrack datasets: + + python scripts/symlink.py mot17 /home/ubuntu/data/ + python scripts/symlink.py pathtrack /home/ubuntu/data/ + +Pre-process each of the three datasets using `scripts/preprocess-info.py` and `scripts/preprocess-matches.go`. + + python scripts/preprocess-info.py mot17 /home/ubuntu/data/ 8 + python scripts/preprocess-info.py pathtrack /home/ubuntu/data/ 8 + python scripts/preprocess-info.py yt-walking /home/ubuntu/data/ 8 + go run scripts/preprocess-matches.go mot17 /home/ubuntu/data/ + go run scripts/preprocess-matches.go pathtrack /home/ubuntu/data/ + go run scripts/preprocess-matches.go yt-walking /home/ubuntu/data/ + +Train the model: + + mkdir /home/ubuntu/model/ + python train.py /home/ubuntu/data/ /home/ubuntu/model/model diff --git a/geom.py b/geom.py new file mode 100644 index 0000000..30e563c --- /dev/null +++ b/geom.py @@ -0,0 +1,335 @@ +import math +import numpy + +class Point(object): + def __init__(self, x, y): + self.x = int(x) + self.y = int(y) + + def distance(self, other): + dx = self.x - other.x + dy = self.y - other.y + return math.sqrt(dx * dx + dy * dy) + + def sub(self, other): + return Point(self.x - other.x, self.y - other.y) + + def add(self, other): + return Point(self.x + other.x, self.y + other.y) + + def scale(self, f): + return Point(self.x * f, self.y * f) + + def magnitude(self): + return math.sqrt(self.x * self.x + self.y * self.y) + + def angle_to(self, other): + if self.magnitude() == 0 or other.magnitude() == 0: + return 0 + s = (self.x * other.x + self.y * other.y) / self.magnitude() / other.magnitude() + if abs(s) > 1: s = s / abs(s) + angle = math.acos(s) + if angle > math.pi: + return 2 * math.pi - angle + else: + return angle + + def signed_angle(self, other): + return math.atan2(other.y, other.x) - math.atan2(self.y, self.x) + + def bounds(self): + return Rectangle(self, self) + + def dot(self, point): + return self.x * point.x + self.y * point.y + + def rotate(self, center, angle): + dx = self.x - center.x + dy = self.y - center.y + rx = math.cos(angle)*dx - math.sin(angle)*dy + ry = math.sin(angle)*dx + math.cos(angle)*dy + return Point(center.x + int(rx), center.y + int(ry)) + + def __repr__(self): + return 'Point({}, {})'.format(self.x, self.y) + + def __eq__(self, other): + return self.x == other.x and self.y == other.y + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + 
return hash((self.x, self.y)) + +class FPoint(object): + def __init__(self, x, y): + self.x = float(x) + self.y = float(y) + + def distance(self, other): + dx = self.x - other.x + dy = self.y - other.y + return math.sqrt(dx * dx + dy * dy) + + def sub(self, other): + return FPoint(self.x - other.x, self.y - other.y) + + def add(self, other): + return FPoint(self.x + other.x, self.y + other.y) + + def scale(self, f): + return FPoint(self.x * f, self.y * f) + + def scale_to_length(self, l): + return self.scale(l / self.magnitude()) + + def magnitude(self): + return math.sqrt(self.x * self.x + self.y * self.y) + + def angle_to(self, other): + if self.magnitude() == 0 or other.magnitude() == 0: + return 0 + s = (self.x * other.x + self.y * other.y) / self.magnitude() / other.magnitude() + if abs(s) > 1: s = s / abs(s) + angle = math.acos(s) + if angle > math.pi: + return 2 * math.pi - angle + else: + return angle + + def signed_angle(self, other): + return math.atan2(other.y, other.x) - math.atan2(self.y, self.x) + + def bounds(self): + return Rectangle(self, self) + + def dot(self, point): + return self.x * point.x + self.y * point.y + + def __repr__(self): + return 'FPoint({}, {})'.format(self.x, self.y) + + def to_point(self): + return Point(self.x, self.y) + + def __eq__(self, other): + return self.x == other.x and self.y == other.y + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash((self.x, self.y)) + +class Segment(object): + def __init__(self, start, end): + self.start = start + self.end = end + + def length(self): + return self.start.distance(self.end) + + def project_factor(self, point, line=False): + l = self.length() + if l == 0: + return 0 + t = point.sub(self.start).dot(self.end.sub(self.start)) / l + if not line: + t = max(0, min(l, t)) + return t + + def project(self, point, line=False): + t = self.project_factor(point, line=line) + return self.point_at_factor(t) + + def point_at_factor(self, t): + l = self.length() + if l == 0: + return self.start + return self.start.add(self.end.sub(self.start).scale(t / l)) + + def distance(self, point, line=False): + p = self.project(point, line=line) + return p.distance(point) + + def intersection(self, other): + d1 = self.vector() + d2 = other.vector() + d12 = other.start.sub(self.start) + + den = d1.y * d2.x - d1.x * d2.y + u1 = d1.x * d12.y - d1.y * d12.x + u2 = d2.x * d12.y - d2.y * d12.x + + if den == 0: + # collinear + if u1 == 0 and u2 == 0: + return self.start + else: + return None + + if float(u1) / den < 0 or float(u1) / den > 1 or float(u2) / den < 0 or float(u2) / den > 1: + return None + + return self.point_at_factor(float(u2) / den * self.length()) + + def vector(self): + return self.end.sub(self.start) + + def bounds(self): + return self.start.bounds().extend(self.end) + + def extend(self, amount): + v = self.vector() + v = v.scale(amount / v.magnitude()) + return Segment( + self.start.sub(v), + self.end.add(v) + ) + + def __repr__(self): + return 'Segment({}, {})'.format(self.start, self.end) + +class Rectangle(object): + def __init__(self, start, end): + self.start = start + self.end = end + + def lengths(self): + return Point(self.end.x - self.start.x, self.end.y - self.start.y) + + def clip(self, point): + npoint = Point(point.x, point.y) + if npoint.x < self.start.x: + npoint.x = self.start.x + elif npoint.x >= self.end.x: + npoint.x = self.end.x - 1 + if npoint.y < self.start.y: + npoint.y = self.start.y + elif npoint.y >= self.end.y: + npoint.y = self.end.y - 1 + return 
npoint + + def clip_rect(self, r): + return Rectangle(self.clip(r.start), self.clip(r.end)) + + def add_tol(self, tol): + return Rectangle( + self.start.sub(Point(tol, tol)), + self.end.add(Point(tol, tol)) + ) + + def contains(self, point): + return point.x >= self.start.x and point.x < self.end.x and point.y >= self.start.y and point.y < self.end.y + + def extend(self, point): + return Rectangle( + Point(min(self.start.x, point.x), min(self.start.y, point.y)), + Point(max(self.end.x, point.x), max(self.end.y, point.y)) + ) + + def extend_rect(self, rect): + return Rectangle( + Point(min(self.start.x, rect.start.x), min(self.start.y, rect.start.y)), + Point(max(self.end.x, rect.end.x), max(self.end.y, rect.end.y)) + ) + + def intersects(self, other): + return self.end.y >= other.start.y and other.end.y >= self.start.y and self.end.x >= other.start.x and other.end.x >= self.start.x + + def scale(self, f): + return Rectangle(self.start.scale(f), self.end.scale(f)) + + def intersection(self, other): + intersection = Rectangle( + Point(max(self.start.x, other.start.x), max(self.start.y, other.start.y)), + Point(min(self.end.x, other.end.x), min(self.end.y, other.end.y)) + ) + if intersection.end.x <= intersection.start.x: + intersection.end.x = intersection.start.x + if intersection.end.y <= intersection.start.y: + intersection.end.y = intersection.start.y + return intersection + + def area(self): + return (self.end.x - self.start.x) * (self.end.y - self.start.y) + + def iou(self, other): + intersect_area = self.intersection(other).area() + if intersect_area == 0: + return 0 + return float(intersect_area) / (self.area() + other.area() - intersect_area) + + def __repr__(self): + return 'Rectangle({}, {})'.format(self.start, self.end) + +def draw_line(start, end, lengths): + # followX indicates whether to move along x or y coordinates + followX = abs(end.y - start.y) <= abs(end.x - start.x) + if followX: + x0 = start.x + x1 = end.x + y0 = start.y + y1 = end.y + else: + x0 = start.y + x1 = end.y + y0 = start.x + y1 = end.x + + delta = Point(abs(x1 - x0), abs(y1 - y0)) + current_error = 0 + + if x0 < x1: + xstep = 1 + else: + xstep = -1 + + if y0 < y1: + ystep = 1 + else: + ystep = -1 + + points = [] + def add_point(p): + if p.x >= 0 and p.x < lengths.x and p.y >= 0 and p.y < lengths.y: + points.append(p) + + x = x0 + y = y0 + + while x != x1 + xstep: + if followX: + add_point(Point(x, y)) + else: + add_point(Point(y, x)) + + x += xstep + current_error += delta.y + if current_error >= delta.x: + y += ystep + current_error -= delta.x + + return points + +def draw_lines(segments, im=None, shape=None): + from eyediagram._brescount import bres_segments_count + if not shape: + if not im: + raise Exception('shape or im must be provided') + shape = im.shape + tmpim = numpy.zeros((shape[0], shape[1]), dtype='int32') + + sticks = numpy.zeros((len(segments), 4), dtype='int32') + for i, segment in enumerate(segments): + sticks[i] = [segment.start.x, segment.start.y, segment.end.x, segment.end.y] + bres_segments_count(sticks, tmpim) + tmpim = tmpim > 0 + if im: + return numpy.logical_or(im, tmpim) + else: + return tmpim + +def vector_from_angle(angle, length): + return Point(math.cos(angle) * length, math.sin(angle) * length) diff --git a/infer.py b/infer.py new file mode 100644 index 0000000..f68d097 --- /dev/null +++ b/infer.py @@ -0,0 +1,349 @@ +import geom +import model + +import json +import numpy +import math +import os +import skimage.io, skimage.transform +import sys +import tensorflow as tf +import 
time + +MODEL_PATH = sys.argv[1] +data_path = sys.argv[2] + +model.BATCH_SIZE = 1 +model.SEQ_LEN = 2 + +SKIP = 2 +MAX_AGE = 10 +MODE = 'imsp' + +LABELS = ['MOT17-{}-SDP'.format(x) for x in ['01', '03', '06', '07', '08', '12', '14']] +DETECTION_PATH = data_path + '/mot17/test/{}/det/det-filter60.json' +FRAME_PATH = data_path + '/mot17/test/{}/img1/' +OUT_PATH = data_path + '/mot17/test/{}/det/uns20.json' + +ORIG_WIDTH = 1920 +ORIG_HEIGHT = 1080 +DETECTION_SCALE = 1 +FRAME_SCALE = 1 +CROP_SIZE = 64 +HIDDEN_SIZE = 4*64 + +print('initializing model') +m = model.Model(options={'mode': MODE}) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +session = tf.Session(config=config) + +m.saver.restore(session, MODEL_PATH) + +def get_frame_fname(frame_idx): + s = str(frame_idx) + while len(s) < 6: + s = '0' + s + return s + '.jpg' + +for label in LABELS: + detection_path = DETECTION_PATH.format(label) + print('loading detections from {}'.format(detection_path)) + with open(detection_path, 'r') as f: + raw_detections = json.load(f) + + # auto-detect im width/height + for frame_idx, dlist in enumerate(raw_detections): + if not dlist or len(dlist) == 0: + continue + im = skimage.io.imread('{}/{}'.format(FRAME_PATH.format(label), get_frame_fname(frame_idx))) + im_bounds = geom.Rectangle(geom.Point(0, 0), geom.Point(im.shape[1]*FRAME_SCALE, im.shape[0]*FRAME_SCALE)) + break + + detections = [None for _ in range(len(raw_detections))] + for frame_idx, dlist in enumerate(raw_detections): + if not dlist or frame_idx % SKIP != 0: + continue + detections[frame_idx] = [] + for i, d in enumerate(dlist): + rect = geom.Rectangle( + geom.Point(d['left']//DETECTION_SCALE, d['top']//DETECTION_SCALE), + geom.Point(d['right']//DETECTION_SCALE, d['bottom']//DETECTION_SCALE) + ) + rect = im_bounds.clip_rect(rect) + if rect.lengths().x < 4 or rect.lengths().y < 4: + continue + nd = { + 'left': rect.start.x, + 'top': rect.start.y, + 'right': rect.end.x, + 'bottom': rect.end.y, + 'frame_idx': d['frame_idx'], + } + detections[frame_idx].append(nd) + + def zip_frame_info(detections, frame_idx): + im = skimage.io.imread('{}/{}'.format(FRAME_PATH.format(label), get_frame_fname(frame_idx))) + im_bounds = geom.Rectangle( + geom.Point(0, 0), + geom.Point(im.shape[0], im.shape[1]) + ) + info = [] + for detection in detections: + rect = geom.Rectangle( + geom.Point(detection['top']//FRAME_SCALE, detection['left']//FRAME_SCALE), + geom.Point(detection['bottom']//FRAME_SCALE, detection['right']//FRAME_SCALE) + ) + crop = im[rect.start.x:rect.end.x, rect.start.y:rect.end.y, :] + resize_factor = min([float(CROP_SIZE) / crop.shape[0], float(CROP_SIZE) / crop.shape[1]]) + crop = (skimage.transform.resize(crop, [int(crop.shape[0] * resize_factor), int(crop.shape[1] * resize_factor)])*255).astype('uint8') + fix_crop = numpy.zeros((CROP_SIZE, CROP_SIZE, 3), dtype='uint8') + fix_crop[0:crop.shape[0], 0:crop.shape[1], :] = crop + detection['width'] = float(detection['right']-detection['left'])/ORIG_WIDTH + detection['height'] = float(detection['bottom']-detection['top'])/ORIG_HEIGHT + info.append((detection, fix_crop)) + return info + + def get_loc(detection): + cx = (detection['left'] + detection['right']) / 2 + cy = (detection['top'] + detection['bottom']) / 2 + cx = float(cx) / ORIG_WIDTH + cy = float(cy) / ORIG_HEIGHT + return cx, cy + + def get_stuff(infos): + def per_info(info): + images = [] + boxes = [] + for i, (detection, crop) in enumerate(info): + images.append(crop) + cx, cy = get_loc(detection) + boxes.append([cx, 
cy, detection['width'], detection['height']]) + detections = [get_loc(detection) for detection, _ in info] + return images, boxes, detections, len(info) + + all_images = [] + all_boxes = [] + all_detections = [] + all_counts = [] + for info in infos: + images, boxes, detections, count = per_info(info) + all_images.extend(images) + all_boxes.extend(boxes) + all_detections.append(detections) + all_counts.append(count) + + return all_images, all_boxes, all_detections, all_counts + + def softmax(X, theta = 1.0, axis = None): + y = numpy.atleast_2d(X) + if axis is None: + axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) + y = y * float(theta) + y = y - numpy.expand_dims(numpy.max(y, axis = axis), axis) + y = numpy.exp(y) + ax_sum = numpy.expand_dims(numpy.sum(y, axis = axis), axis) + p = y / ax_sum + if len(X.shape) == 1: p = p.flatten() + return p + + # list of objects (id, detection_idx in latest frame, prev_hidden, time since last match) + # note: detection_idx should be len(info)+1 for the terminal vertex + active_objects = None + track_counter = 0 + for frame_idx in range(0, len(detections)-SKIP, SKIP): + if not detections[frame_idx] or not detections[frame_idx+SKIP]: + active_objects = None + continue + + print(frame_idx, len(detections)) + info1 = zip_frame_info(detections[frame_idx], frame_idx) + info2 = zip_frame_info(detections[frame_idx+SKIP], frame_idx+SKIP) + + if len(info1) == 0 or len(info2) == 0: + active_objects = None + continue + + images1, boxes1, _, counts1 = get_stuff([info1]) + images2, boxes2, _, counts2 = get_stuff([info2]) + + if active_objects is None: + active_objects = [] + for left_idx in range(len(info1)): + active_objects.append(( + track_counter, + left_idx, + numpy.zeros((HIDDEN_SIZE,), dtype='float32'), + 0, + [images1[left_idx]], + )) + detections[frame_idx][left_idx]['track_id'] = track_counter + track_counter += 1 + + ''' + outputs_raw, out_mat, cur_hidden, out_logits, mat_finesp, mat_longim = session.run([m.out_mat_reweight, m.out_mat, m.out_hidden, m.out_logits_finesp, m.out_mat_finesp, m.out_mat_longim], feed_dict=feed_dict) + + # take maximum in outputs_raw along the active indices + outputs = numpy.zeros((len(active_objects), len(info2)+1), dtype='float32') + for i, obj in enumerate(active_objects): + cur_finesp = mat_finesp[active_indices[i], :].max(axis=0) + cur_longim = mat_longim[active_indices[i], :].max(axis=0) + outputs[i, :] = cur_finesp + cur_longim + #outputs[i, 0:len(info2)] = outputs_raw[active_indices[i], 0:len(info2)].max(axis=0) + #outputs[i, len(info2)] = outputs_raw[active_indices[i], len(info2)].min() + ''' + + if MODE == 'imsp' or MODE == 'finesp' or MODE == 'longim': + # flatten the active objects since each object may have multiple images + flat_images = [] + flat_boxes = [] + flat_hidden = [] + active_indices = {} + for i, obj in enumerate(active_objects): + active_indices[i] = [] + for j in [1, 2, 4, 8, 16]: + #for j in range(1, len(obj[4])+1, len(obj[4])//5+1): + if len(obj[4]) < j: + continue + # use image from stored history, but use current box + active_indices[i].append(len(flat_images)) + flat_images.append(obj[4][-j]) + if obj[1] < len(info1): + flat_boxes.append(boxes1[obj[1]]) + else: + flat_boxes.append(numpy.zeros((4,), dtype='float32')) + flat_hidden.append(obj[2]) + + feed_dict = { + m.raw_images: flat_images + images2, + m.input_boxes: flat_boxes + boxes2, + m.n_image: [[len(flat_images), len(images2), 0]], + m.is_training: False, + m.infer_sel: range(len(flat_images)), + m.infer_hidden: flat_hidden, + } + 
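+            # Score every stored crop of each active object against the detections in
+            # the next frame. The long-term image head (longim) scores appearance
+            # similarity; the fine spatial head (finesp) scores spatial consistency and
+            # carries the recurrent hidden state. Below, per-object logits are averaged
+            # over the object's image history (min for the terminal column), converted
+            # to match matrices via the minimum of row-wise and column-wise softmax,
+            # and, for MODE 'imsp', the two heads are fused with an element-wise minimum.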
+ longim_logits, finesp_logits, pre_cur_hidden = session.run([m.out_logits_longim, m.out_logits_finesp, m.out_hidden], feed_dict=feed_dict) + longim_out_logits = numpy.zeros((len(active_objects), len(info2)+1), dtype='float32') + finesp_out_logits = numpy.zeros((len(active_objects), len(info2)+1), dtype='float32') + cur_hidden = numpy.zeros((len(active_objects), len(info2)+1, HIDDEN_SIZE), dtype='float32') + for i, obj in enumerate(active_objects): + longim_out_logits[i, 0:len(info2)] = longim_logits[active_indices[i], 0:len(info2)].mean(axis=0) + longim_out_logits[i, len(info2)] = longim_logits[active_indices[i], len(info2)].min() + finesp_out_logits[i, 0:len(info2)] = finesp_logits[active_indices[i], 0:len(info2)].mean(axis=0) + finesp_out_logits[i, len(info2)] = finesp_logits[active_indices[i], len(info2)].min() + cur_hidden[i, :, :] = pre_cur_hidden[active_indices[i][0], :, :] + #longim_mat = softmax(longim_out_logits, axis=1) + #finesp_mat = softmax(finesp_out_logits, axis=1) + longim_mat = numpy.minimum(softmax(longim_out_logits, axis=0), softmax(longim_out_logits, axis=1)) + finesp_mat = numpy.minimum(softmax(finesp_out_logits, axis=0), softmax(finesp_out_logits, axis=1)) + outputs = numpy.minimum(longim_mat, finesp_mat) + #outputs = numpy.minimum(longim_out_logits, finesp_out_logits) + #outputs = (longim_out_logits+finesp_out_logits)/2 + if MODE == 'finesp': + outputs = finesp_mat + elif MODE == 'longim': + outputs = longim_mat + else: + feed_dict = { + m.raw_images: images1 + images2, + m.input_boxes: boxes1 + boxes2, + m.is_training: False, + m.infer_sel: [obj[1] for obj in active_objects], + m.infer_hidden: [obj[2] for obj in active_objects], + } + if MODE == 'occl': + feed_dict[m.a_counts] = [len(images1), len(images2)] + else: + feed_dict[m.n_image] = [[len(images1), len(images2), 0]] + outputs, out_mat, out_logits, cur_hidden = session.run([m.out_mat_reweight, m.out_mat, m.out_logits, m.out_hidden], feed_dict=feed_dict) + outputs = out_mat + + # vote on best next frame: idx1->(output,idx2) + votes = {} + for i in range(len(active_objects)): + for j in range(len(info2)+1): + output = outputs[i, j] + #if j == len(info2) and out_logits[active_indices[i][0], :].argmax() == len(info2): + #if j == len(info2) and longim_out_logits[:, outputs[i, :].argmax()].argmax() != i: + #if j == len(info2) and longim_out_logits[i, :].max() < 1: + if MODE == 'imsp' and j != len(info2) and (longim_out_logits[i, j] < 0 or finesp_out_logits[i, j] < 0): + output = -100.0 + elif MODE == 'finesp' and j != len(info2) and finesp_out_logits[i, j] < 0: + output = -100.0 + #if j == len(info2): + # output = -2 + if i not in votes or output > votes[i][0]: + if j < len(info2): + votes[i] = (output, j) + else: + votes[i] = (output, None) + # group by receiver and vote on max idx2->idx1 to eliminate duplicates + votes2 = {} + for idx1, t in votes.items(): + output, idx2 = t + if idx2 is not None and (idx2 not in votes2 or output > votes2[idx2][0]): + votes2[idx2] = (output, idx1) + forward_matches = {idx1: idx2 for (idx2, (_, idx1)) in votes2.items()} + + def get_hidden(idx1, idx2): + if model.__name__ == 'occl3b_model': + return cur_hidden[idx1, :] + else: + return cur_hidden[idx1, idx2, :] + + new_objects = [] + used_idx2s = set() + for idx1, obj in enumerate(active_objects): + if idx1 in forward_matches: + idx2 = forward_matches[idx1] + new_objects.append(( + obj[0], + idx2, + get_hidden(idx1, idx2), + #numpy.zeros((64,), dtype='float32'), + 0, + obj[4] + [images2[idx2]], + )) + used_idx2s.add(idx2) + 
detections[frame_idx+SKIP][idx2]['track_id'] = obj[0] + elif obj[3] < MAX_AGE: + idx2 = votes[idx1][1] + if idx2 is None or True: + idx2 = len(info2) + new_objects.append(( + obj[0], + idx2, + get_hidden(idx1, idx2), + #numpy.zeros((64,), dtype='float32'), + obj[3]+1, + obj[4], + )) + + for idx2 in range(len(info2)): + if idx2 in used_idx2s: + continue + new_objects.append(( + track_counter, + idx2, + numpy.zeros((HIDDEN_SIZE,), dtype='float32'), + 0, + [images2[idx2]], + )) + detections[frame_idx+SKIP][idx2]['track_id'] = track_counter + track_counter += 1 + + active_objects = new_objects + + ndetections = [None for _ in detections] + for frame_idx, dlist in enumerate(detections): + if not dlist: + continue + dlist = [d for d in dlist if 'track_id' in d] + if not dlist: + continue + ndetections[frame_idx] = dlist + detections = ndetections + + with open(OUT_PATH.format(label), 'w') as f: + json.dump(detections, f) diff --git a/model.py b/model.py new file mode 100644 index 0000000..c6c480d --- /dev/null +++ b/model.py @@ -0,0 +1,489 @@ +import numpy +import tensorflow as tf +import os +import os.path +import random +import math +import time + +BATCH_SIZE = 1 +SEQ_LEN = 17 +KERNEL_SIZE = 3 + +class Model: + def _conv_layer(self, name, input_var, stride, in_channels, out_channels, options = {}): + activation = options.get('activation', 'relu') + dropout = options.get('dropout', None) + padding = options.get('padding', 'SAME') + batchnorm = options.get('batchnorm', False) + transpose = options.get('transpose', False) + + with tf.variable_scope(name) as scope: + if not transpose: + filter_shape = [KERNEL_SIZE, KERNEL_SIZE, in_channels, out_channels] + else: + filter_shape = [KERNEL_SIZE, KERNEL_SIZE, out_channels, in_channels] + kernel = tf.get_variable( + 'weights', + shape=filter_shape, + initializer=tf.truncated_normal_initializer(stddev=math.sqrt(2.0 / KERNEL_SIZE / KERNEL_SIZE / in_channels)), + dtype=tf.float32 + ) + biases = tf.get_variable( + 'biases', + shape=[out_channels], + initializer=tf.constant_initializer(0.0), + dtype=tf.float32 + ) + if not transpose: + output = tf.nn.bias_add( + tf.nn.conv2d( + input_var, + kernel, + [1, stride, stride, 1], + padding=padding + ), + biases + ) + else: + batch = tf.shape(input_var)[0] + side = tf.shape(input_var)[1] + output = tf.nn.bias_add( + tf.nn.conv2d_transpose( + input_var, + kernel, + [batch, side * stride, side * stride, out_channels], + [1, stride, stride, 1], + padding=padding + ), + biases + ) + if batchnorm: + output = tf.contrib.layers.batch_norm(output, center=True, scale=True, is_training=self.is_training, decay=0.99) + if dropout is not None: + output = tf.nn.dropout(output, keep_prob=1-dropout) + + if activation == 'relu': + return tf.nn.relu(output, name=scope.name) + elif activation == 'sigmoid': + return tf.nn.sigmoid(output, name=scope.name) + elif activation == 'none': + return output + else: + raise Exception('invalid activation {} specified'.format(activation)) + + def _fc_layer(self, name, input_var, input_size, output_size, options = {}): + activation = options.get('activation', 'relu') + dropout = options.get('dropout', None) + batchnorm = options.get('batchnorm', False) + + with tf.variable_scope(name) as scope: + weights = tf.get_variable( + 'weights', + shape=[input_size, output_size], + initializer=tf.truncated_normal_initializer(stddev=math.sqrt(2.0 / input_size)), + dtype=tf.float32 + ) + biases = tf.get_variable( + 'biases', + shape=[output_size], + initializer=tf.constant_initializer(0.0), + 
dtype=tf.float32 + ) + output = tf.matmul(input_var, weights) + biases + if batchnorm: + output = tf.contrib.layers.batch_norm(output, center=True, scale=True, is_training=self.is_training, decay=0.99) + if dropout is not None: + output = tf.nn.dropout(output, keep_prob=1-dropout) + + if activation == 'relu': + return tf.nn.relu(output, name=scope.name) + elif activation == 'sigmoid': + return tf.nn.sigmoid(output, name=scope.name) + elif activation == 'none': + return output + else: + raise Exception('invalid activation {} specified'.format(activation)) + + def __init__(self, options={}): + tf.reset_default_graph() + self.options = options + + self.is_training = tf.placeholder(tf.bool) + self.raw_images = tf.placeholder(tf.uint8, [None, 64, 64, 3]) + self.input_images = tf.cast(self.raw_images, tf.float32)/255.0 + self.input_boxes = tf.placeholder(tf.float32, [None, 4]) + self.n_image = tf.placeholder(tf.int32, [BATCH_SIZE, SEQ_LEN+1]) + self.input_masks = tf.placeholder(tf.float32, [None]) + self.match_length = tf.placeholder(tf.int32) + self.learning_rate = tf.placeholder(tf.float32) + + # for inference + self.infer_sel = tf.placeholder(tf.int32, [None]) + self.infer_hidden = tf.placeholder(tf.float32, [None, 256]) + + # extract masks + self.masks = [] + s = 0 + for batch in range(BATCH_SIZE): + n_first = self.n_image[batch, 0] + n_last = self.n_image[batch, self.match_length] + cur_count = n_first*(n_last+1) + cur_mask = tf.reshape(self.input_masks[s:s+cur_count], [n_first, n_last+1]) + self.masks.append(cur_mask) + s += cur_count + + if SEQ_LEN < 4: + stuffs = [] + for i in range(4): + with tf.variable_scope('ensemble' + str(i)): + stuff = self.make_part(options, infer_hidden=self.infer_hidden[:, 64*i:64*(i+1)]) + stuffs.append(stuff) + + if options.get('infer_op', 'mean') == 'min': + self.out_mat_finesp = tf.reduce_min([stuff[0] for stuff in stuffs], axis=0) + self.out_logits_finesp = tf.reduce_min([stuff[1] for stuff in stuffs], axis=0) + self.out_mat_longim = tf.reduce_min([stuff[2] for stuff in stuffs], axis=0) + self.out_logits_longim = tf.reduce_min([stuff[3] for stuff in stuffs], axis=0) + self.out_mat = tf.reduce_min([stuff[4] for stuff in stuffs], axis=0) + self.out_mat_reweight = tf.reduce_min([stuff[5] for stuff in stuffs], axis=0) + else: + self.out_mat_finesp = tf.reduce_mean([stuff[0] for stuff in stuffs], axis=0) + self.out_logits_finesp = tf.reduce_mean([stuff[1] for stuff in stuffs], axis=0) + self.out_mat_longim = tf.reduce_mean([stuff[2] for stuff in stuffs], axis=0) + self.out_logits_longim = tf.reduce_mean([stuff[3] for stuff in stuffs], axis=0) + self.out_mat = tf.reduce_mean([stuff[4] for stuff in stuffs], axis=0) + self.out_mat_reweight = tf.reduce_mean([stuff[5] for stuff in stuffs], axis=0) + + self.out_hidden = tf.concat([stuff[6] for stuff in stuffs], axis=2) + else: + longim_losses = [] + finesp_losses = [] + for i in range(4): + with tf.variable_scope('ensemble' + str(i)): + longim_loss, finesp_loss = self.make_part(options) + longim_losses.append(longim_loss) + finesp_losses.append(finesp_loss) + + self.longim_loss = tf.reduce_mean(longim_losses) + self.finesp_loss = tf.reduce_mean(finesp_losses) + + with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): + self.longim_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.longim_loss) + self.finesp_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.finesp_loss) + + self.init_op = tf.initialize_all_variables() + self.saver = 
tf.train.Saver(max_to_keep=None) + + def make_part(self, options, infer_hidden=None): + c_image = options.get('c_image', 64) + c_spatial = 4 + c_features = c_image+c_spatial + c_rnn = 64 + + # CNN for long-term image + layer1 = self._conv_layer('layer1', self.input_images, 2, 3, 64) # -> 32x32x64 + layer2 = self._conv_layer('layer2', layer1, 2, 64, c_image) # -> 16x16x64 + layer3 = self._conv_layer('layer3', layer2, 2, c_image, c_image) # -> 8x8x64 + layer4 = self._conv_layer('layer4', layer3, 2, c_image, c_image) # -> 4x4x64 + layer5 = self._conv_layer('layer5', layer4, 2, c_image, c_image) # -> 2x2x64 + layer6 = self._conv_layer('layer6', layer5, 2, c_image, c_image, {'activation': 'none'})[:, 0, 0, :] + + features = [[] for _ in range(BATCH_SIZE)] + s = 0 + for batch in range(BATCH_SIZE): + for i in range(SEQ_LEN+1): + cur_count = self.n_image[batch, i] + cur_features = tf.concat([ + self.input_boxes[s:s+cur_count, :], + layer6[s:s+cur_count, :], + ], axis=1) + cur_features = tf.concat([ + cur_features, + tf.zeros([1, c_features], dtype=tf.float32), + ], axis=0) + features[batch].append(cur_features) + s += cur_count + + # MATCHER + # context is longim or finesp + def matcher(pairs, context): + with tf.variable_scope('matcher' + context, reuse=tf.AUTO_REUSE): + im_pairs = tf.concat([pairs[:, 0:c_rnn], pairs[:, c_rnn+4:c_rnn+c_features], pairs[:, c_rnn+c_features+4:]], axis=1) + if options.get('spatial_rel', False): + sp1 = pairs[:, c_rnn:c_rnn+4] + sp2 = pairs[:, c_rnn+c_features:c_rnn+c_features+4] + spatial_pairs = tf.concat([ + pairs[:, 0:c_rnn], + sp1[:, 0:2] - sp2[:, 0:2], + sp1[:, 2:4], + sp2[:, 0:2] - sp1[:, 0:2], + sp2[:, 2:4], + ], axis=1) + else: + spatial_pairs = tf.concat([pairs[:, 0:c_rnn+4], pairs[:, c_rnn+c_features:c_rnn+c_features+4]], axis=1) + + if context == 'longim': + matcher1 = self._fc_layer('matcher1', im_pairs, c_rnn+2*c_image, 256) + matcher2 = self._fc_layer('matcher2', matcher1, 256, 65, {'activation': 'none'}) + return matcher2 + elif context == 'finesp': + matcher1 = self._fc_layer('matcher1', spatial_pairs, c_rnn+2*c_spatial, 256) + matcher2 = self._fc_layer('matcher2', matcher1, 256, 128) + matcher3 = self._fc_layer('matcher3', matcher2, 128, 128) + matcher4 = self._fc_layer('matcher4', matcher3, 128, 1, {'activation': 'none'}) + + matcher5 = self._fc_layer('matcher5', spatial_pairs, c_rnn+2*c_spatial, 256) + matcher6 = self._fc_layer('matcher6', matcher1, 256, 128) + matcher7 = self._fc_layer('matcher7', matcher2, 128, 128) + matcher8 = self._fc_layer('matcher8', matcher3, 128, c_rnn, {'activation': 'none'}) + + return tf.concat([matcher4, matcher8], axis=1) + elif context == 'combined': + matcher1 = self._fc_layer('matcher1', pairs, c_rnn+2*c_features, 256) + matcher2 = self._fc_layer('matcher2', matcher1, 256, 128) + matcher3 = self._fc_layer('matcher3', matcher2, 128, 128) + matcher4 = self._fc_layer('matcher4', matcher3, 128, 1, {'activation': 'none'}) + + matcher5 = self._fc_layer('matcher5', pairs, c_rnn+2*c_features, 256) + matcher6 = self._fc_layer('matcher6', matcher1, 256, 128) + matcher7 = self._fc_layer('matcher7', matcher2, 128, 128) + matcher8 = self._fc_layer('matcher8', matcher3, 128, c_rnn, {'activation': 'none'}) + + return tf.concat([matcher4, matcher8], axis=1) + + # logit replacing matching some detection with the zero (null/fake) detection + no_match_logit = tf.get_variable('no_match_logit', shape=[1], initializer=tf.constant_initializer(0.0), dtype=tf.float32) + + def get_mat_hidden(n_prev, n_next, rnn_features, prev_features, 
next_features, context, incl_logits=False, do_neg=True): + do_neg = do_neg and options.get('do_neg', True) + if do_neg: + # include min(n_next, n_neg) negative examples where we borrow the next spatial features + # now we also include n_prev previous images as negatives + #fake_next1 = tf.minimum(n_prev, n_next) + #fake_next2 = tf.maximum(0, n_prev - n_next) + fake_next1 = n_prev + fake_next2 = tf.minimum(n_next, self.n_image[0, SEQ_LEN]) + fake_next = fake_next1 + fake_next2 + n_next += fake_next + #neg_features1 = tf.concat([ + # next_features[0:fake_next1, 0:c_spatial], + # prev_features[0:fake_next1, c_spatial:], + #], axis=1) + #neg_features2 = tf.concat([ + # prev_features[0:fake_next2, 0:c_spatial], + # prev_features[fake_next1:n_prev, c_spatial:], + #], axis=1) + neg_features1 = prev_features + neg_features2 = tf.concat([ + next_features[0:fake_next2, 0:c_spatial], + features[0][SEQ_LEN][0:fake_next2, c_spatial:], + ], axis=1) + next_features = tf.concat([neg_features1, neg_features2, next_features], axis=0) + + cur_pairs = tf.concat([ + tf.tile( + tf.reshape(rnn_features, [n_prev, 1, c_rnn]), + [1, n_next+1, 1] + ), + tf.tile( + tf.reshape(prev_features, [n_prev, 1, c_features]), + [1, n_next+1, 1] + ), + tf.tile( + tf.reshape(next_features, [1, n_next+1, c_features]), + [n_prev, 1, 1] + ), + ], axis=2) + + cur_pairs = tf.reshape(cur_pairs, [n_prev*(n_next+1), c_rnn+2*c_features]) + cur_outputs = matcher(cur_pairs, context=context) + cur_outputs = tf.reshape(cur_outputs, [n_prev, n_next+1, 1+c_rnn]) + + cur_logits = cur_outputs[:, :, 0] + if options.get('no_match_logit', True): + cur_logits = tf.concat([ + cur_logits[:, :-1], + tf.tile(tf.reshape(no_match_logit, [1, 1]), [n_prev, 1]), + ], axis=1) + + if do_neg: + # need to eliminate logits that are connecting the same features + # these are in the first n_prev x n_prev of the matrix + elim_mat = tf.eye(num_rows=n_prev, num_columns=n_next+1) + cur_logits = (cur_logits*(1-elim_mat)) - 50*elim_mat + + if options.get('linearnorm', False): + # multiply rows and columns by a factor so that they add up to at most 1 + # we do rows first, then columns + cur_mat = tf.nn.sigmoid(cur_logits) + cur_mat = tf.concat([ + cur_mat[:, :-1], + tf.maximum(1-tf.reduce_sum(cur_mat[:, :-1], axis=1, keepdims=True), tf.maximum(0.01, no_match_logit)), + ], axis=1) + row_factors = 1.0/tf.maximum(1.0, tf.reduce_sum(cur_mat, axis=1, keepdims=True)) + cur_mat *= tf.tile(row_factors, [1, n_next+1]) + col_factors = 1.0/tf.maximum(1.0, tf.reduce_sum(cur_mat, axis=0, keepdims=True)) + cur_mat *= tf.tile(col_factors, [n_prev, 1]) + else: + cur_mat = tf.math.minimum( + tf.nn.softmax(cur_logits, axis=0), + tf.nn.softmax(cur_logits, axis=1) + ) + cur_hidden = cur_outputs[:, :, 1:] + + if do_neg: + cur_logits = cur_logits[:, fake_next:] + cur_mat = cur_mat[:, fake_next:] + cur_hidden = cur_hidden[:, fake_next:, :] + + if incl_logits: + return cur_mat, cur_hidden, cur_logits + else: + return cur_mat, cur_hidden + + def index_list(l, idx, out_shape): + flatlist = [] + sums = [0] + for t in l: + flat = tf.reshape(t, [-1]) + flatlist.append(flat) + sums.append(sums[-1] + tf.shape(flat)[0]) + flatlist = tf.concat(flatlist, axis=0) + sums = tf.stack(sums, axis=0) + output = flatlist[sums[idx]:sums[idx+1]] + return tf.reshape(output, out_shape) + + def terminal_reweight(mat): + mat_term = mat[:, -1] + factor = tf.minimum(1.0/(tf.reduce_sum(mat_term)+1e-2), tf.cast(tf.shape(mat)[0], tf.float32)) + mat_term = mat_term * factor + row_maxes = 1 - tf.reduce_sum(mat[:, :-1], 
axis=1) + row_maxes = tf.maximum(row_maxes, 0) + mat_term = tf.minimum(mat_term, row_maxes) + return tf.concat([mat[:, :-1], tf.reshape(mat_term, [-1, 1])], axis=1) + + def get_recur_sel(mat): + if options.get('simple_sel', False): + return tf.argmax(mat, axis=1, output_type=tf.int32) + def f(mat): + # take argmax along rows (over columns) + # but only use it if it is higher value than other rows in same column + row_argmax = numpy.argmax(mat, axis=1) + col_argmax = numpy.argmax(mat, axis=0) + out = row_argmax + for i in range(out.shape[0]): + if col_argmax[out[i]] != i: + out[i] = mat.shape[1]-1 + return out.astype('int32') + + sel = tf.py_func(f, [mat], tf.int32, stateful=False) + return sel + + def compute_loss(mat1, mat2, batch, apply_mask=True): + if apply_mask: + mask = self.masks[batch] + else: + mask = tf.ones(tf.shape(mat1), dtype=tf.float32) + + epsilon = options.get('epsilon', 1e-8) + if options.get('no_terminal', False): + loss = -tf.reduce_mean(tf.log(tf.reduce_sum(mat1[:, :-1] * mat2[:, :-1] * mask[:, :-1], axis=1) + epsilon)) + elif options.get('terminal_reweight', True): + loss = -tf.reduce_mean(tf.log(tf.reduce_sum(terminal_reweight(mat1) * terminal_reweight(mat2) * mask, axis=1) + epsilon)) + else: + loss = -tf.reduce_mean(tf.log(tf.reduce_sum(mat1 * mat2 * mask, axis=1) + epsilon)) + + return loss + + + if SEQ_LEN < 4: + # inference + n_prev = tf.shape(self.infer_sel)[0] + n_next = self.n_image[0, 1] + rnn_features = infer_hidden + prev_features = tf.gather(features[0][0], self.infer_sel, axis=0) + next_features = features[0][1] + + out_mat_finesp, out_hidden, out_logits_finesp = get_mat_hidden(n_prev, n_next, rnn_features, prev_features, next_features, 'finesp', incl_logits=True, do_neg=False) + out_mat_longim, _, out_logits_longim = get_mat_hidden(n_prev, n_next, tf.zeros(tf.shape(rnn_features), dtype=tf.float32), prev_features, next_features, 'longim', incl_logits=True, do_neg=False) + out_mat = tf.minimum(out_mat_finesp, out_mat_longim) + + if options.get('terminal_reweight', True): + out_mat_reweight = terminal_reweight(out_mat) + + return out_mat_finesp, out_logits_finesp, out_mat_longim, out_logits_longim, out_mat, out_mat_reweight, out_hidden + + + finesp_indices = [] + for i in range(SEQ_LEN-1): + finesp_indices.append((i, i+1)) + + # LONGIM + extra_mats = [[] for _ in range(BATCH_SIZE)] + extra_mats_finesp = [[] for _ in range(BATCH_SIZE)] + for batch in range(BATCH_SIZE): + n_prev = self.n_image[batch, 0] + n_next = self.n_image[batch, self.match_length] + rnn_features = tf.zeros((n_prev, 1, c_rnn), dtype=tf.float32) + prev_features = features[batch][0][:-1, :] + # next_features = features[batch][match_length] + next_features = index_list(features[batch], self.match_length, [n_next+1, c_features]) + cur_mat, _ = get_mat_hidden(n_prev, n_next, rnn_features, prev_features, next_features, 'longim') + extra_mats[batch].append(cur_mat) + + for i in range(SEQ_LEN-1): + # for extra_mats_finesp we always have SEQ_LEN inputs + n_prev = self.n_image[batch, 0] + n_next = self.n_image[batch, i+1] + rnn_features = tf.zeros((n_prev, 1, c_rnn), dtype=tf.float32) + prev_features = features[batch][0][:-1, :] + next_features = features[batch][i+1] + cur_mat, _ = get_mat_hidden(n_prev, n_next, rnn_features, prev_features, next_features, 'longim', do_neg=False) + extra_mats_finesp[batch].append(cur_mat) + + # FINESP (note: this can't be executed with variable matchlen, at least for now) + finesp_mats = [[] for _ in range(BATCH_SIZE)] + finesp_hiddens = [[] for _ in 
range(BATCH_SIZE)] + for batch in range(BATCH_SIZE): + for prev_idx, next_idx in finesp_indices: + n_next = self.n_image[batch, next_idx] + if prev_idx == 0: + n_prev = self.n_image[batch, prev_idx] + prev_features = features[batch][prev_idx][:-1, :] + rnn_features = tf.zeros((n_prev, 1, c_rnn), dtype=tf.float32) + else: + n_prev = self.n_image[batch, 0] + if options.get('follow_longim', False): + sel = get_recur_sel(extra_mats_finesp[batch][prev_idx-1]) + else: + sel = get_recur_sel(finesp_mats[batch][-1]) + rnn_sel = tf.stack([ + tf.range(n_prev, dtype=tf.int32), + sel, + ], axis=1) + prev_features = tf.gather(features[batch][prev_idx], sel, axis=0) + rnn_features = tf.gather_nd(finesp_hiddens[batch][-1], rnn_sel) + + cur_mat, cur_hidden = get_mat_hidden(n_prev, n_next, rnn_features, prev_features, features[batch][next_idx], 'finesp', do_neg=False) + finesp_mats[batch].append(cur_mat) + finesp_hiddens[batch].append(cur_hidden) + + # longim loss + longim_losses = [] + for batch in range(BATCH_SIZE): + mat = extra_mats[batch][0] + loss = compute_loss(mat, mat, batch) + longim_losses.append(loss) + longim_loss = tf.reduce_mean(longim_losses) + + # finespatial loss + finesp_losses = [] + for batch in range(BATCH_SIZE): + for i, finesp_mat in enumerate(finesp_mats[batch]): + extra_mat = tf.stop_gradient(extra_mats_finesp[batch][i]) + loss = compute_loss(finesp_mat, extra_mat, batch, apply_mask=(i==SEQ_LEN-2)) + finesp_losses.append(loss) + finesp_loss = tf.reduce_mean(finesp_losses) + + return longim_loss, finesp_loss diff --git a/scripts/filter_short.py b/scripts/filter_short.py new file mode 100644 index 0000000..8b04281 --- /dev/null +++ b/scripts/filter_short.py @@ -0,0 +1,29 @@ +import json +import sys + +in_fname = sys.argv[1] +out_fname = sys.argv[2] + +with open(in_fname, 'r') as f: + detections = json.load(f) + +# get tracks +track_map = {} +for dlist in detections: + if dlist is None: + continue + for detection in dlist: + track_id = detection['track_id'] + if track_id not in track_map: + track_map[track_id] = [] + track_map[track_id].append(detection) + +ndetections = [[] for _ in detections] +for track in track_map.values(): + if len(track) <= 3: + continue + for detection in track: + ndetections[detection['frame_idx']].append(detection) + +with open(out_fname, 'w') as f: + json.dump(ndetections, f) diff --git a/scripts/filter_small.py b/scripts/filter_small.py new file mode 100644 index 0000000..f491427 --- /dev/null +++ b/scripts/filter_small.py @@ -0,0 +1,37 @@ +import json +import os +import skimage.io +import sys + +in_fname = sys.argv[1] +frame_path = sys.argv[2] +out_fname = sys.argv[3] + +with open(in_fname, 'r') as f: + detections = json.load(f) + +frame_fname = [fname for fname in os.listdir(frame_path) if fname.endswith('.jpg')][0] +im = skimage.io.imread(frame_path + frame_fname) + +ndetections = [[] for _ in detections] +for frame_idx, dlist in enumerate(detections): + if dlist is None: + continue + for detection in dlist: + if detection['left'] < 0: + detection['left'] = 0 + if detection['right'] >= im.shape[1]: + detection['right'] = im.shape[1]-1 + if detection['top'] < 0: + detection['top'] = 0 + if detection['bottom'] >= im.shape[0]: + detection['bottom'] = im.shape[0]-1 + + if detection['right'] - detection['left'] <= 4: + continue + elif detection['bottom'] - detection['top'] <= 4: + continue + ndetections[frame_idx].append(detection) + +with open(out_fname, 'w') as f: + json.dump(ndetections, f) diff --git a/scripts/interpolate.py b/scripts/interpolate.py new 
file mode 100644 index 0000000..9e3769c --- /dev/null +++ b/scripts/interpolate.py @@ -0,0 +1,46 @@ +import json +import sys + +in_fname = sys.argv[1] +out_fname = sys.argv[2] + +with open(in_fname, 'r') as f: + detections = json.load(f) + +# get tracks +track_map = {} +for dlist in detections: + if dlist is None: + continue + for detection in dlist: + track_id = detection['track_id'] + if track_id not in track_map: + track_map[track_id] = [] + track_map[track_id].append(detection) + +# interpolate tracks +ndetections = [[] for _ in detections] +for track in track_map.values(): + ntrack = [] + for detection in track: + if len(ntrack) > 0: + prev = ntrack[-1] + next = detection + jump = next['frame_idx'] - prev['frame_idx'] + for i in range(1, jump): + prev_weight = float(jump-i) / float(jump) + next_weight = float(i) / float(jump) + interp = { + 'track_id': prev['track_id'], + 'frame_idx': prev['frame_idx']+i, + } + for k in ['left', 'top', 'right', 'bottom']: + interp[k] = int(prev[k]*prev_weight + next[k]*next_weight) + ntrack.append(interp) + ntrack.append(detection) + + for detection in ntrack: + ndetections[detection['frame_idx']].append(detection) + +with open(out_fname, 'w') as f: + json.dump(ndetections, f) diff --git a/scripts/json2mot.py b/scripts/json2mot.py new file mode 100644 index 0000000..2230a76 --- /dev/null +++ b/scripts/json2mot.py @@ -0,0 +1,45 @@ +import json +import os, os.path +import sys +import subprocess + +data_path = sys.argv[1] +split = sys.argv[2] +out_path = sys.argv[3] + +MODE = 'uns20' + +labels = [fname for fname in os.listdir(data_path + '/mot17/{}/'.format(split))] +labels = [label for label in labels if 'SDP' in label] +labels.sort() + +for label in labels: + subprocess.call([ + 'python', 'scripts/filter_short.py', + data_path + '/mot17/{}/{}/det/{}.json'.format(split, label, MODE), + data_path + '/mot17/{}/{}/det/{}-noshort.json'.format(split, label, MODE), + ]) + +for label in labels: + subprocess.call([ + 'python', 'scripts/interpolate.py', + data_path + '/mot17/{}/{}/det/{}-noshort.json'.format(split, label, MODE), + data_path + '/mot17/{}/{}/det/{}-interp.json'.format(split, label, MODE), + ]) + +for label in labels: + with open(data_path + '/mot17/{}/{}/det/{}-interp.json'.format(split, label, MODE), 'r') as f: + detections = json.load(f) + + lines = [] + for frame_idx, dlist in enumerate(detections): + if dlist is None: + continue + for d in dlist: + w = d['right'] - d['left'] + h = d['bottom'] - d['top'] + line = "{},{},{},{},{},{},-1,-1,-1,-1".format(d['frame_idx'], d['track_id']+1, d['left'], d['top'], w, h) + lines.append(line) + lines.append("") + with open(os.path.join(out_path, '{}.txt'.format(label)), 'w') as f: + f.write("\n".join(lines)) diff --git a/scripts/mot2json.py b/scripts/mot2json.py new file mode 100644 index 0000000..2903eff --- /dev/null +++ b/scripts/mot2json.py @@ -0,0 +1,49 @@ +import json +import subprocess +import sys + +data_path = sys.argv[1] +split = sys.argv[2] + +if split == 'train': + LABELS = ['02', '04', '05', '09', '10', '11', '13'] +elif split == 'test': + LABELS = ['01', '03', '06', '07', '08', '12', '14'] + +for label in LABELS: + detections = [] + with open(data_path + '/mot17/{}/MOT17-{}-SDP/det/det.txt'.format(split, label), 'r') as f: + lines = f.readlines() + for line in lines: + parts = line.strip().split(',') + if len(parts) < 7: + continue + frame_idx = int(parts[0]) + track_id = int(parts[1]) + left = int(float(parts[2])) + top = int(float(parts[3])) + right = left + int(float(parts[4])) + bottom = 
top + int(float(parts[5])) + score = float(parts[6]) + if score < 0.6: + continue + while frame_idx >= len(detections): + detections.append([]) + detections[frame_idx].append({ + 'frame_idx': frame_idx, + 'track_id': track_id, + 'left': left, + 'top': top, + 'right': right, + 'bottom': bottom, + }) + + fname = data_path + '/mot17/{}/MOT17-{}-SDP/det/det-filter60.json'.format(split, label) + with open(fname, 'w') as f: + json.dump(detections, f) + subprocess.call([ + 'python', 'scripts/filter_small.py', + fname, + data_path + '/mot17/{}/MOT17-{}-SDP/img1/'.format(split, label), + fname, + ]) diff --git a/scripts/pathtrack.py b/scripts/pathtrack.py new file mode 100644 index 0000000..07d091a --- /dev/null +++ b/scripts/pathtrack.py @@ -0,0 +1,39 @@ +import json +import os +import subprocess +import sys + +data_path = sys.argv[1] + +os.makedirs(data_path + 'pathtrack/json/', exist_ok=True) + +labels = os.listdir(data_path + '/pathtrack/train/') +for i, label in enumerate(labels): + print(label, i, len(labels)) + + detections = [] + with open(data_path + '/pathtrack/train/{}/det/det_rcnn.txt'.format(label), 'r') as f: + lines = [line.strip() for line in f.readlines() if line.strip()] + for line in lines: + parts = line.split(',') + frame_idx = int(float(parts[0])) + while len(detections) <= frame_idx: + detections.append([]) + left = int(float(parts[2])) + top = int(float(parts[3])) + right = left+int(float(parts[4])) + bottom = top+int(float(parts[5])) + score = float(parts[6]) + if score < 0.5: + continue + detections[frame_idx].append({ + 'left': left, + 'top': top, + 'right': right, + 'bottom': bottom, + 'frame_idx': frame_idx, + 'track_id': -1, + }) + + with open(data_path + '/pathtrack/json/{}.json'.format(label), 'w') as f: + json.dump(detections, f) diff --git a/scripts/preprocess-info.py b/scripts/preprocess-info.py new file mode 100644 index 0000000..78bcb71 --- /dev/null +++ b/scripts/preprocess-info.py @@ -0,0 +1,158 @@ +import json +import math +import multiprocessing +import numpy +import os, os.path +import pickle +import skimage.io +import skimage.transform +import sys + +sys.path.append('.') +import geom + +dataset = sys.argv[1] +data_path = sys.argv[2] +nthreads = int(sys.argv[3]) + +ORIG_WIDTH = 1920 +ORIG_HEIGHT = 1080 +SKIP = 1 +FRAME_SCALE = 1 +CROP_SIZE = 64 + +if dataset == 'pathtrack': + LABELS = [label for label in os.listdir(data_path + '/pathtrack/frames/')] + FRAME_PATH = data_path + '/pathtrack/frames/{}/' + DETECTION_PATH = data_path + '/pathtrack/json/{}.json' + PICKLE_PATH = data_path + '/pathtrack/pickle-info/{}.pkl' +elif dataset == 'yt-walking': + LABELS = [label for label in os.listdir(data_path + '/yt-walking/frames/')] + FRAME_PATH = data_path + '/yt-walking/frames/{}/' + DETECTION_PATH = data_path + '/yt-walking/json/{}.json' + PICKLE_PATH = data_path + '/yt-walking/pickle-info/{}.pkl' +elif dataset == 'mot17': + LABELS = [label for label in os.listdir(data_path + '/mot17/frames/')] + FRAME_PATH = data_path + '/mot17/frames/{}/' + DETECTION_PATH = data_path + '/mot17/json/{}.json' + PICKLE_PATH = data_path + '/mot17/pickle-info/{}.pkl' + +os.makedirs(os.path.dirname(PICKLE_PATH), exist_ok=True) + +def get_frame_fname(frame_idx): + s = str(frame_idx) + while len(s) < 6: + s = '0' + s + return s + '.jpg' + +def to_rect(detection): + return geom.Rectangle( + geom.Point(detection['left'], detection['top']), + geom.Point(detection['right'], detection['bottom']), + ) + +MAX_MATCH_AGE = 5 +def get_potential_matches(detections, first_frame, last_frame): + # from 
detection_idx in frame #first_idx to iterable of matching tuples (frame, det_idx) + cur_matches = {} + for idx in range(len(detections[first_frame])): + cur_matches[idx] = [(first_frame, idx)] + + for right_frame in range(first_frame+1, last_frame+1): + # list the detections we need to match + check_set = set() + for l in cur_matches.values(): + check_set.update(l) + + connections = {} + for left_frame, left_idx in check_set: + connections[(left_frame, left_idx)] = [] + + for right_idx in range(len(detections[right_frame])): + rect1 = to_rect(detections[left_frame][left_idx]) + rect2 = to_rect(detections[right_frame][right_idx]) + intersect_area = rect1.intersection(rect2).area() + if intersect_area < 0: + intersect_area = 0 + union_area = rect1.area() + rect2.area() - intersect_area + iou_score = float(intersect_area) / float(union_area) + if iou_score > 0.1: + connections[(left_frame, left_idx)].append((right_frame, right_idx)) + + for idx in cur_matches: + new_matches = set() + for left_frame, left_idx in cur_matches[idx]: + new_matches.update(connections[(left_frame, left_idx)]) + if right_frame - left_frame < MAX_MATCH_AGE: + new_matches.add((left_frame, left_idx)) + cur_matches[idx] = new_matches + + final_matches = {} + for idx, matches in cur_matches.items(): + final_matches[idx] = [right_idx for right_frame, right_idx in matches if right_frame == last_frame] + return final_matches + +def zip_frame_info(detections, label, frame_idx): + if not detections: + return [] + frame_path = FRAME_PATH.format(label) + im = skimage.io.imread('{}/{}'.format(frame_path, get_frame_fname(frame_idx))) + im_bounds = geom.Rectangle( + geom.Point(0, 0), + geom.Point(im.shape[0], im.shape[1]) + ) + info = [] + for idx, detection in enumerate(detections): + rect = geom.Rectangle( + geom.Point(detection['top']/FRAME_SCALE, detection['left']/FRAME_SCALE), + geom.Point(detection['bottom']/FRAME_SCALE, detection['right']/FRAME_SCALE) + ) + if rect.lengths().x < 4 or rect.lengths().y < 4: + continue + crop = im[rect.start.x:rect.end.x, rect.start.y:rect.end.y, :] + resize_factor = min([float(CROP_SIZE) / crop.shape[0], float(CROP_SIZE) / crop.shape[1]]) + resize_shape = [int(crop.shape[0] * resize_factor), int(crop.shape[1] * resize_factor)] + if resize_shape[0] == 0 or resize_shape[1] == 0: + continue + crop = (skimage.transform.resize(crop, resize_shape)*255).astype('uint8') + fix_crop = numpy.zeros((CROP_SIZE, CROP_SIZE, 3), dtype='uint8') + fix_crop[0:crop.shape[0], 0:crop.shape[1], :] = crop + detection['width'] = float(detection['right']-detection['left'])/ORIG_WIDTH + detection['height'] = float(detection['bottom']-detection['top'])/ORIG_HEIGHT + info.append((detection, fix_crop, idx)) + return info + +def process(label): + pickle_path = PICKLE_PATH.format(label) + if os.path.exists(pickle_path): + return + print('reading from {}'.format(label)) + with open(DETECTION_PATH.format(label), 'r') as f: + detections = json.load(f) + + if not detections: + return + + frame_infos = {} + #matches = {} + for frame_idx in range(0, len(detections), SKIP): + if frame_idx % 30000 > 10000: + continue + print(label, frame_idx) + if not detections[frame_idx]: + continue + frame_infos[frame_idx] = zip_frame_info(detections[frame_idx], label, frame_idx) + #for match_len in [10, 15, 25, 35, 45, 55, 65]: + # frame_range = range(frame_idx, frame_idx+match_len+1) + # if not all([detections[i] is not None and len(detections[i]) > 0 for i in frame_range]): + # continue + # for i in frame_range: + # frame_infos[i] = 
zip_frame_info(detections[i], label, i) + # matches[(frame_idx, frame_idx+match_len)] = get_potential_matches(detections, frame_idx, frame_idx+match_len) + + with open(pickle_path, 'wb') as f: + pickle.dump(frame_infos, f) + +p = multiprocessing.Pool(nthreads) +p.map(process, LABELS) +p.close() diff --git a/scripts/preprocess-matches.go b/scripts/preprocess-matches.go new file mode 100644 index 0000000..556af67 --- /dev/null +++ b/scripts/preprocess-matches.go @@ -0,0 +1,212 @@ +package main + +import ( + "github.com/mitroadmaps/gomapinfer/common" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "strings" +) + +type Detection struct { + Left int `json:"left"` + Top int `json:"top"` + Right int `json:"right"` + Bottom int `json:"bottom"` +} + +func (d Detection) Rect() common.Rectangle { + return common.Rectangle{ + common.Point{float64(d.Left), float64(d.Top)}, + common.Point{float64(d.Right), float64(d.Bottom)}, + } +} + +const Skip int = 1 +const MaxMatchAge int = 10 +const Padding float64 = 10 +var MatchLengths = []int{2, 4, 8, 16, 32, 64} +var MaxMatchLength = MatchLengths[len(MatchLengths) - 1] + +// Returns map idx (in frameIdx) -> (frame, det_idx) +func matchFrom(detections [][]Detection, frameIdx int) map[int]map[[2]int]bool { + // from detection_idx in frameIdx to list of matching tuples (frame, det_idx) + curMatches := make(map[int]map[[2]int]bool) + finalMatches := make(map[int]map[[2]int]bool) + for idx := range detections[frameIdx] { + curMatches[idx] = make(map[[2]int]bool) + finalMatches[idx] = make(map[[2]int]bool) + curMatches[idx][[2]int{frameIdx, idx}] = true + finalMatches[idx][[2]int{frameIdx, idx}] = true + } + + lastFrame := frameIdx + MaxMatchLength + for rightFrame := frameIdx + 1; rightFrame <= lastFrame; rightFrame++ { + // find the detections we need to match + checkSet := make(map[[2]int]bool) + for _, matches := range curMatches { + for t := range matches { + if rightFrame - t[0] > MaxMatchAge { + continue + } + checkSet[t] = true + } + } + + connections := make(map[[2]int][][2]int) + for left := range checkSet { + leftFrame, leftIdx := left[0], left[1] + for rightIdx := 0; rightIdx < len(detections[rightFrame]); rightIdx++ { + leftRect := detections[leftFrame][leftIdx].Rect().AddTol(Padding) + rightRect := detections[rightFrame][rightIdx].Rect().AddTol(Padding) + intersectArea := leftRect.Intersection(rightRect).Area() + if intersectArea < 0 { + intersectArea = 0 + } + unionArea := leftRect.Area() + rightRect.Area() - intersectArea + iouScore := intersectArea / unionArea + if iouScore < 0.1 { + continue + } + connections[left] = append(connections[left], [2]int{rightFrame, rightIdx}) + } + } + + for idx, matches := range curMatches { + for t := range matches { + if rightFrame - t[0] >= MaxMatchAge { + delete(matches, t) + } + } + for left := range matches { + for _, right := range connections[left] { + matches[right] = true + finalMatches[idx][right] = true + } + } + } + } + + return finalMatches +} + +func process(jsonPath string, matchPath string, label string) { + bytes, err := ioutil.ReadFile(fmt.Sprintf(jsonPath, label)) + if err != nil { + panic(err) + } + var detections [][]Detection + if err := json.Unmarshal(bytes, &detections); err != nil { + panic(err) + } + + // match length -> frameIdx -> det_idx in frameIdx -> list of det_idx in (frameIdx+match length) + matches := make(map[int]map[int]map[int][]int) + mlSet := make(map[int]bool) + for _, matchLength := range MatchLengths { + mlSet[matchLength] = true + matches[matchLength] = 
make(map[int]map[int][]int) + } + + n := 18 + ch := make(chan int) + donech := make(chan map[int]map[int]map[int][]int) + for i := 0; i < n; i++ { + go func() { + threadMatches := make(map[int]map[int]map[int][]int) + for _, matchLength := range MatchLengths { + threadMatches[matchLength] = make(map[int]map[int][]int) + } + for baseFrame := range ch { + ok := true + for frameIdx := baseFrame; frameIdx <= baseFrame + MaxMatchLength; frameIdx++ { + if len(detections[frameIdx]) == 0 { + ok = false + } + } + if !ok { + continue + } + frameMatches := matchFrom(detections, baseFrame) + for curIdx := range frameMatches { + for right := range frameMatches[curIdx] { + matchLength := right[0] - baseFrame + rightIdx := right[1] + if !mlSet[matchLength] { + continue + } + if threadMatches[matchLength][baseFrame] == nil { + threadMatches[matchLength][baseFrame] = make(map[int][]int) + } + threadMatches[matchLength][baseFrame][curIdx] = append(threadMatches[matchLength][baseFrame][curIdx], rightIdx) + } + } + } + donech <- threadMatches + }() + } + for baseFrame := 0; baseFrame < len(detections) - MaxMatchLength; baseFrame += Skip { + fmt.Printf("%d/%d\n", baseFrame, len(detections)) + ch <- baseFrame + } + close(ch) + for i := 0; i < n; i++ { + threadMatches := <- donech + for matchLength := range threadMatches { + for baseFrame := range threadMatches[matchLength] { + for curIdx := range threadMatches[matchLength][baseFrame] { + if matches[matchLength][baseFrame] == nil { + matches[matchLength][baseFrame] = make(map[int][]int) + } + matches[matchLength][baseFrame][curIdx] = threadMatches[matchLength][baseFrame][curIdx] + } + } + } + } + + bytes, err = json.Marshal(matches) + if err != nil { + panic(err) + } + if err := ioutil.WriteFile(fmt.Sprintf(matchPath, label), bytes, 0644); err != nil { + panic(err) + } +} + +func main() { + dataset := os.Args[1] + dataPath := os.Args[2] + + var framePath, jsonPath, matchPath string + if dataset == "pathtrack" { + framePath = dataPath + "/pathtrack/frames/" + jsonPath = dataPath + "/pathtrack/json/%s.json" + matchPath = dataPath + "/pathtrack/pickle-info/%s.matches.json" + } else if dataset == "yt-walking" { + framePath = dataPath + "/yt-walking/frames/" + jsonPath = dataPath + "/yt-walking/json/%s.json" + matchPath = dataPath + "/yt-walking/pickle-info/%s.matches.json" + } else if dataset == "mot17" { + framePath = dataPath + "/mot17/frames/" + jsonPath = dataPath + "/mot17/json/%s.json" + matchPath = dataPath + "/mot17/pickle-info/%s.matches.json" + } + + var labels []string + files, err := ioutil.ReadDir(framePath) + if err != nil { + panic(err) + } + for _, fi := range files { + if strings.Contains(framePath, "beach") && !strings.HasPrefix(fi.Name(), "2019-") { + continue + } else if fi.Name() == "json" || fi.Name() == "pickle-info" { + continue + } + labels = append(labels, fi.Name()) + } + for _, label := range labels { + process(jsonPath, matchPath, label) + } +} diff --git a/scripts/symlink.py b/scripts/symlink.py new file mode 100644 index 0000000..4d74eda --- /dev/null +++ b/scripts/symlink.py @@ -0,0 +1,21 @@ +import os +import subprocess +import sys + +dataset = sys.argv[1] +data_path = sys.argv[2] + +if dataset == 'mot17': + os.makedirs(data_path + '/mot17/json', exist_ok=True) + os.makedirs(data_path + '/mot17/frames', exist_ok=True) + for split in ['train', 'test']: + labels = os.listdir(data_path + 'mot17/{}'.format(split)) + labels = [label for label in labels if 'SDP' in label] + for label in labels: + subprocess.call(['ln', '-s', data_path + 
'/mot17/{}/{}/det/det-filter60.json'.format(split, label), data_path + '/mot17/json/{}.json'.format(label)]) + subprocess.call(['ln', '-s', data_path + '/mot17/{}/{}/img1'.format(split, label), data_path + '/mot17/frames/{}'.format(label)]) +elif dataset == 'pathtrack': + os.makedirs(data_path + '/pathtrack/frames', exist_ok=True) + labels = os.listdir(data_path + 'pathtrack/train') + for label in labels: + subprocess.call(['ln', '-s', data_path + '/pathtrack/train/{}/img1'.format(label), data_path + '/pathtrack/frames/{}'.format(label)]) diff --git a/scripts/ytw-extract.py b/scripts/ytw-extract.py new file mode 100644 index 0000000..45bafe0 --- /dev/null +++ b/scripts/ytw-extract.py @@ -0,0 +1,16 @@ +import os +import subprocess +import sys + +data_path = sys.argv[1] + +processes = [] +for fname in os.listdir(data_path+'/yt-walking/'): + if not fname.endswith('.mp4'): + continue + label = fname.split('.')[0] + os.makedirs(data_path+'/yt-walking/frames/' + label, exist_ok=True) + p = subprocess.Popen(['ffmpeg', '-threads', '3', '-i', data_path+'/yt-walking/'+fname, '-vf', 'fps=10,scale=960:540', '-q:v', '1', data_path+'/yt-walking/frames/' + label + '/%06d.jpg']) + processes.append(p) +for p in processes: + p.wait() diff --git a/scripts/ytw-maskrcnn.py b/scripts/ytw-maskrcnn.py new file mode 100644 index 0000000..be77f56 --- /dev/null +++ b/scripts/ytw-maskrcnn.py @@ -0,0 +1,84 @@ +import os +import sys +import random +import math +import numpy as np +import skimage.io +import matplotlib +import matplotlib.pyplot as plt + +ROOT_DIR = os.path.abspath("../") +sys.path.append(ROOT_DIR) # To find local version of the library +from mrcnn import utils +import mrcnn.model as modellib +from mrcnn import visualize +sys.path.append(os.path.join(ROOT_DIR, "samples/coco/")) # To find local version +import coco + +MODEL_DIR = os.path.join(ROOT_DIR, "logs") +COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5") + +class InferenceConfig(coco.CocoConfig): + GPU_COUNT = 1 + IMAGES_PER_GPU = 1 + +config = InferenceConfig() +model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config) +model.load_weights(COCO_MODEL_PATH, by_name=True) +class_names = [ + 'BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', + 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', + 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', + 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', + 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', + 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', + 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', + 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', + 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', + 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', + 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', + 'teddy bear', 'hair drier', 'toothbrush', +] + +data_path = sys.argv[1] + +import json +import os +import subprocess +FRAME_PATH = data_path + '/yt-walking/frames/' +JSON_PATH = data_path + '/yt-walking/json/' +BATCH_SIZE = 1 +labels = os.listdir(FRAME_PATH) +labels.sort() +for label in labels[0:1]: + print('processing', label) + im_path = FRAME_PATH + label + '/' + fnames = os.listdir(im_path) + detections = [] + for i in range(0, len(fnames), BATCH_SIZE): + print(label, i, len(fnames)) + batch 
= fnames[i:i+BATCH_SIZE] + ims = [skimage.io.imread(im_path + fname) for fname in batch] + results = model.detect(ims) + for j in range(len(batch)): + frame_idx = int(batch[j].split('.')[0]) + while len(detections) <= frame_idx: + detections.append([]) + for roi, class_id in zip(results[j]['rois'], results[j]['class_ids']): + if int(class_id) != 1: + continue + detections[frame_idx].append({ + 'frame_idx': frame_idx, + 'left': int(roi[1]), + 'top': int(roi[0]), + 'right': int(roi[3]), + 'bottom': int(roi[2]), + 'track_id': -1, + }) + + json_fname = JSON_PATH + label + '.json' + with open(json_fname, 'w') as f: + json.dump(detections, f) diff --git a/train.py b/train.py new file mode 100644 index 0000000..109fe03 --- /dev/null +++ b/train.py @@ -0,0 +1,307 @@ +import geom +import model + +import json +import math +import numpy +import os +import pickle +import random +import skimage.io, skimage.transform +import sys +import tensorflow as tf +import time + +data_path = sys.argv[1] +model_path = sys.argv[2] + +ORIG_WIDTH = 1920 +ORIG_HEIGHT = 1080 +FRAME_SCALE = 1 +CROP_SIZE = 64 +MATCH_LENGTHS = [4, 16] +ADD_NEGATIVES = True +MODE = 'imsp-longim' + +SKIPS = [1] +DATASETS = [ + ( + os.listdir(data_path + '/yt-walking/frames/'), + data_path + '/yt-walking/frames/{}/', + data_path + '/yt-walking/pickle-info/{}.pkl', + data_path + '/yt-walking/pickle-info/{}.matches.json', + 2.0, + [1, 2], + ), + ( + os.listdir(data_path + '/pathtrack/frames/'), + data_path + '/pathtrack/frames/{}/', + data_path + '/pathtrack/pickle-info/{}.pkl', + data_path + '/pathtrack/pickle-info/{}.matches.json', + 1.5, + [2, 4], + ), + ( + os.listdir(data_path + '/mot17/frames/'), + data_path + '/mot17/frames/{}/', + data_path + '/mot17/pickle-info/{}.pkl', + data_path + '/mot17/pickle-info/{}.matches.json', + 1.0, + [2, 4], + ), +] +val_fn = lambda example: hash(example[4]) % 20 == 0 and 'MOT' not in example[4] + +def get_frame_fname(frame_idx): + s = str(frame_idx) + while len(s) < 6: + s = '0' + s + return s + '.jpg' + +def get_loc(detection): + cx = (detection['left'] + detection['right']) / 2 + cy = (detection['top'] + detection['bottom']) / 2 + cx = float(cx) / ORIG_WIDTH + cy = float(cy) / ORIG_HEIGHT + return cx, cy + +def to_rect(detection): + return geom.Rectangle( + geom.Point(detection['left'], detection['top']), + geom.Point(detection['right'], detection['bottom']), + ) + +def get_stuff(infos, matches): + def per_info(info): + images = [] + boxes = numpy.zeros((len(info), 4), dtype='float32') + for i, (detection, crop, _) in enumerate(info): + images.append(crop) + cx, cy = get_loc(detection) + boxes[i, :] = [cx, cy, detection['width'], detection['height']] + detections = [get_loc(detection) for detection, _, _ in info] + return images, boxes, detections, len(info) + + all_images = [] + all_boxes = [] + all_detections = [] + all_counts = [] + for i, info in enumerate(infos): + images, boxes, detections, count = per_info(info) + all_images.append(images) + all_boxes.append(boxes) + all_detections.append(detections) + all_counts.append(count) + + all_masks = [] + for i, match_len in enumerate(MATCH_LENGTHS): + last_idx = match_len + mask = numpy.zeros((len(infos[0]), len(infos[last_idx])+1), dtype='float32') + mask[:, len(infos[last_idx])] = 1 + first_map = {} + for j, (_, _, orig_idx) in enumerate(infos[0]): + first_map[orig_idx] = j + last_map = {} + for j, (_, _, orig_idx) in enumerate(infos[last_idx]): + last_map[orig_idx] = j + for left_idx in matches[i]: + if left_idx not in first_map: + continue + for 
right_idx in matches[i][left_idx]: + if right_idx not in last_map: + continue + mask[first_map[left_idx], last_map[right_idx]] = 1 + all_masks.append(mask.flatten()) + + return all_images, all_boxes, all_detections, all_counts, all_masks + +print('loading infos and matches') +all_frame_data = {} +for labels, frame_tmpl, pickle_tmpl, match_tmpl, detection_scale, _ in DATASETS: + for label in labels: + pickle_path = pickle_tmpl.format(label) + match_path = match_tmpl.format(label) + print('... {} (pickle)'.format(label)) + with open(pickle_path, 'rb') as f: + frame_infos = pickle.load(f, encoding='latin1') + for info_list in frame_infos.values(): + for info in info_list: + info[0]['left'] *= detection_scale + info[0]['top'] *= detection_scale + info[0]['right'] *= detection_scale + info[0]['bottom'] *= detection_scale + info[0]['width'] *= detection_scale + info[0]['height'] *= detection_scale + print('... {} (matches)'.format(label)) + with open(match_path, 'r') as f: + raw_matches = json.load(f, encoding='latin1') + frame_matches = {} + for match_len in raw_matches: + frame_matches[int(match_len)] = {} + for frame_idx in raw_matches[match_len]: + frame_matches[int(match_len)][int(frame_idx)] = {} + for left_idx in raw_matches[match_len][frame_idx]: + frame_matches[int(match_len)][int(frame_idx)][int(left_idx)] = raw_matches[match_len][frame_idx][left_idx] + all_frame_data[label] = (frame_infos, frame_matches) + +print('preparing random info generator') +labels_and_weights = [(label, len(all_frame_data[label][0])) for label in all_frame_data.keys()] +def get_random_info(exclude_label): + labels = [label for label in all_frame_data.keys() if label != exclude_label] + weights = [len(all_frame_data[label][0]) for label in labels] + weight_sum = sum(weights) + weights = [float(x)/float(weight_sum) for x in weights] + while True: + label = numpy.random.choice(labels, p=weights) + frame_infos = all_frame_data[label][0] + frame_idx = random.choice(list(frame_infos.keys())) + if len(frame_infos[frame_idx]) > 4: + return frame_infos[frame_idx] + +# each example is tuple (images, boxes, n_image, label, frame_idx, skip) +print('extracting examples') +all_examples = [] +for labels, frame_tmpl, _, _, _, skips in DATASETS: + for label in labels: + frame_path = frame_tmpl.format(label) + frame_infos, frame_matches = all_frame_data[label] + + for i, frame_idx in enumerate(frame_infos.keys()): + print('...', label, i, len(frame_infos)) + + skip = random.choice(skips) + match_lengths = [skip*match_len for match_len in MATCH_LENGTHS] + + infos = [frame_infos.get(frame_idx+l*skip, None) for l in range(model.SEQ_LEN)] + if any([(info is None or len(info) == 0) for info in infos]): + continue + elif any([frame_idx not in frame_matches[match_len] for match_len in match_lengths]): + continue + + if ADD_NEGATIVES: + neg_info = get_random_info(label) + else: + neg_info = [] + + matches = [frame_matches[match_len][frame_idx] for match_len in match_lengths] + images, boxes, detections, counts, mask = get_stuff(infos + [neg_info], matches) + all_examples.append(( + images, boxes, counts, mask, + label, frame_idx, detections, frame_path, skip, + )) + +random.shuffle(all_examples) +val_examples = [example for example in all_examples if val_fn(example)] +if len(val_examples) > 1024: + val_examples = random.sample(val_examples, 1024) +train_examples = [example for example in all_examples if not val_fn(example) and min(example[2][:-1]) >= 6] + +best_loss = None + +def train(learning_rate, num_epochs): + global best_loss + + 
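+    # One call to train() runs num_epochs epochs at a fixed learning rate. Each epoch
+    # samples 2048 training examples in batches of model.BATCH_SIZE, drawing a match
+    # length per batch from MATCH_LENGTHS (always the longest one in 'imsp-finesp'
+    # mode), then measures the loss on val_examples and saves the checkpoint to
+    # model_path whenever the validation loss improves on best_loss.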
print('training mode={} at lr={} for {} epochs'.format(MODE, learning_rate, num_epochs)) + for epoch in range(num_epochs): + start_time = time.time() + train_losses = [] + for _ in range(2048//model.BATCH_SIZE): + if MODE == 'imsp-finesp': + match_len = max(MATCH_LENGTHS) + else: + match_len = random.choice(MATCH_LENGTHS) + + batch = [] + for example in random.sample(train_examples, model.BATCH_SIZE): + imlists = example[0][0:match_len+1] + [example[0][model.SEQ_LEN]] + boxlists = example[1][0:match_len+1] + [example[1][model.SEQ_LEN]] + counts = example[2][0:match_len+1] + [example[2][model.SEQ_LEN]] + mask = example[3][MATCH_LENGTHS.index(match_len)] + batch.append((imlists, boxlists, counts, mask)) + + imlists = [imlist for example in batch for imlist in example[0]] + boxlists = [boxlist for example in batch for boxlist in example[1]] + counts = [[] for _ in range(len(batch))] + for i, example in enumerate(batch): + counts[i] = example[2][0:match_len+1] + while len(counts[i]) < model.SEQ_LEN: + counts[i].append(0) + counts[i].append(example[2][-1]) + + images = [im for imlist in imlists for im in imlist] + boxes = [box for boxlist in boxlists for box in boxlist] + + masks = numpy.concatenate([example[3] for example in batch], axis=0) + feed_dict = { + m.raw_images: images, + m.input_boxes: boxes, + m.n_image: counts, + m.input_masks: masks, + m.match_length: match_len, + m.is_training: True, + m.learning_rate: learning_rate, + } + if MODE == 'imsp-longim': + _, loss = session.run([m.longim_optimizer, m.longim_loss], feed_dict=feed_dict) + elif MODE == 'imsp-finesp': + _, loss = session.run([m.finesp_optimizer, m.finesp_loss], feed_dict=feed_dict) + train_losses.append(loss) + train_loss = numpy.mean(train_losses) + train_time = time.time() + + val_losses = [] + for i in range(0, len(val_examples), model.BATCH_SIZE): + batch = val_examples[i:i+model.BATCH_SIZE] + images = [im for example in batch for imlist in example[0] for im in imlist] + boxes = [box for example in batch for boxlist in example[1] for box in boxlist] + counts = [example[2] for example in batch] + masks = numpy.concatenate([example[3][-1] for example in batch], axis=0) + feed_dict = { + m.raw_images: images, + m.input_boxes: boxes, + m.n_image: counts, + m.input_masks: masks, + m.match_length: model.SEQ_LEN-1, + m.is_training: False, + } + if MODE == 'imsp-longim': + loss = session.run(m.longim_loss, feed_dict=feed_dict) + elif MODE == 'imsp-finesp': + loss = session.run(m.finesp_loss, feed_dict=feed_dict) + val_losses.append(loss) + + val_loss = numpy.mean(val_losses) + val_time = time.time() + + print('iteration {}: train_time={}, val_time={}, train_loss={}, val_loss={}/{}'.format(epoch, int(train_time - start_time), int(val_time - train_time), train_loss, val_loss, best_loss)) + + if best_loss is None or val_loss < best_loss: + best_loss = val_loss + m.saver.save(session, model_path) + + +print('initializing model: longim') +m = model.Model(options={'mode': MODE}) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +session = tf.Session(config=config) +session.run(m.init_op) + +train(1e-3, 200) +train(1e-4, 200) +train(1e-5, 200) + +print('initializing model: finesp') +MODE = 'imsp-finesp' +session.close() +m = model.Model(options={'mode': MODE}) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +session = tf.Session(config=config) +m.saver.restore(session, model_path) + +train(1e-3, 200) +train(1e-4, 200) +train(1e-5, 200) + +print('done')
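+# Note on the schedule: the 'imsp-longim' stage trains from scratch with the learning
+# rate annealed 1e-3 -> 1e-4 -> 1e-5 (200 epochs each); the 'imsp-finesp' stage then
+# builds a fresh model, restores the checkpoint at model_path, and repeats the same
+# annealing. best_loss is not reset between stages, so the finesp stage only overwrites
+# the checkpoint when its validation loss improves on the longim best.
+# A minimal sketch of reloading the saved checkpoint later (same pattern as the finesp
+# initialization above; picking 'imsp-finesp' as the mode to load is an assumption):
+#
+#   m = model.Model(options={'mode': 'imsp-finesp'})
+#   session = tf.Session(config=tf.ConfigProto())
+#   m.saver.restore(session, model_path)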