From 510833a221f891d638d21f4606fb88deae3a98dd Mon Sep 17 00:00:00 2001
From: Favyen Bastani
Date: Mon, 6 Dec 2021 22:24:25 -0500
Subject: [PATCH] Initial commit

---
 README.md                     | 112 ++++++++
 geom.py                       | 335 +++++++++++++++++++++++
 infer.py                      | 349 ++++++++++++++++++++++++
 model.py                      | 489 ++++++++++++++++++++++++++++++++++
 scripts/filter_short.py       |  29 ++
 scripts/filter_small.py       |  37 +++
 scripts/interpolate.py        |  46 ++++
 scripts/json2mot.py           |  45 ++++
 scripts/mot2json.py           |  49 ++++
 scripts/pathtrack.py          |  39 +++
 scripts/preprocess-info.py    | 158 +++++++++++
 scripts/preprocess-matches.go | 212 +++++++++++++++
 scripts/symlink.py            |  21 ++
 scripts/ytw-extract.py        |  16 ++
 scripts/ytw-maskrcnn.py       |  84 ++++++
 train.py                      | 307 +++++++++++++++
 16 files changed, 2328 insertions(+)
 create mode 100644 README.md
 create mode 100644 geom.py
 create mode 100644 infer.py
 create mode 100644 model.py
 create mode 100644 scripts/filter_short.py
 create mode 100644 scripts/filter_small.py
 create mode 100644 scripts/interpolate.py
 create mode 100644 scripts/json2mot.py
 create mode 100644 scripts/mot2json.py
 create mode 100644 scripts/pathtrack.py
 create mode 100644 scripts/preprocess-info.py
 create mode 100644 scripts/preprocess-matches.go
 create mode 100644 scripts/symlink.py
 create mode 100644 scripts/ytw-extract.py
 create mode 100644 scripts/ytw-maskrcnn.py
 create mode 100644 train.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3d3bbb5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,112 @@
+Self-Supervised Multi-Object Tracking with Cross-Input Consistency
+------------------------------------------------------------------
+
+UNS20 is the code for "Self-Supervised Multi-Object Tracking with Cross-Input
+Consistency" (NeurIPS 2021), an approach for training a robust multi-object
+tracking model using only an object detector and a large corpus of unlabeled
+video.
+
+
+Installation
+------------
+
+Requires TensorFlow 1.15:
+
+    pip install 'tensorflow<2.0' scikit-image
+
+Download the MOT17 dataset:
+
+    mkdir /home/ubuntu/data/
+    wget https://motchallenge.net/data/MOT17.zip
+    unzip MOT17.zip
+    mv MOT17 /home/ubuntu/data/mot17/
+
+Download the UNS20 model:
+
+    wget https://favyen.com/files/uns20-model.zip
+    unzip uns20-model.zip
+    mv model/ /home/ubuntu/model/
+
+
+Inference
+---------
+
+For SDP detections:
+
+    cd /path/to/uns20/
+    python scripts/mot2json.py /home/ubuntu/data/ test
+    python infer.py /home/ubuntu/model/model /home/ubuntu/data/
+
+DPM and FRCNN detections have lower accuracy than SDP detections. Recent
+methods universally apply regression and classification pre-processing steps:
+classification prunes incorrect input detections, while regression refines the
+bounding box coordinates. These steps are conceptually questionable, since they
+use a better detector to improve lower-quality detections; however, they are
+needed to achieve performance comparable with other methods, since all recent
+methods apply them.
+
+To apply UNS20 to DPM and FRCNN detections, run it after the regression and
+classification pre-processing steps from https://github.com/phil-bergmann/tracking_wo_bnw.
+
+For the most informative comparison, we highly recommend comparing performance
+only on the SDP detections, which have the highest accuracy. Evaluating on
+lower-quality detections may sound useful, but it mostly measures the
+pre-processing steps rather than the tracking method itself.
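+
+The tracker writes its output for each sequence to det/uns20.json (OUT_PATH in
+infer.py): a JSON array indexed by frame, where each entry is a (possibly null)
+list of detections annotated with a track_id. A rough sketch of reading one such
+file, using an example sequence path under the layout above:
+
+    import json
+    # example sequence; any MOT17 test sequence works
+    path = '/home/ubuntu/data/mot17/test/MOT17-01-SDP/det/uns20.json'
+    with open(path, 'r') as f:
+        detections = json.load(f)  # list indexed by frame; entries may be null
+    for frame_idx, dlist in enumerate(detections):
+        for d in (dlist or []):
+            print(frame_idx, d['track_id'], d['left'], d['top'], d['right'], d['bottom'])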
+ + +Evaluation +---------- + +Convert from JSON to the TXT format: + + mkdir /home/ubuntu/outputs/ + python scripts/json2mot.py /home/ubuntu/data/ train /home/ubuntu/outputs/ + +Compare: + + pip install motmetrics + python -m motmetrics.apps.eval_motchallenge /home/ubuntu/data/mot17/train/ /home/ubuntu/outputs/ + + +Training +-------- + +First, obtain PathTrack and YT-Walking datasets: + + wget https://data.vision.ee.ethz.ch/daid/MOT/pathtrack_release_v1.0.zip + wget https://favyen.com/files/yt-walking.zip + mkdir /home/ubuntu/data/yt-walking/ + unzip yt-walking.zip -d /home/ubuntu/data/yt-walking/ + mkdir /home/ubuntu/data/pathtrack/ + unzip pathtrack_release_v1.0.zip + mv pathtrack_release /home/ubuntu/data/pathtrack/ + +Extract video frames from YT-Walking mp4 files: + + python scripts/ytw-extract.py /home/ubuntu/data/ + +Convert MOT17 object detections to uniform JSON format: + + python scripts/mot2json.py /home/ubuntu/data/ train + python scripts/mot2json.py /home/ubuntu/data/ test + +Convert PathTrack object detections to uniform JSON format: + + python scripts/pathtrack.py /home/ubuntu/data/ + +Normalize MOT17 and PathTrack datasets: + + python scripts/symlink.py mot17 /home/ubuntu/data/ + python scripts/symlink.py pathtrack /home/ubuntu/data/ + +Pre-process each of the three datasets using `scripts/preprocess-info.py` and `scripts/preprocess-matches.go`. + + python scripts/preprocess-info.py mot17 /home/ubuntu/data/ 8 + python scripts/preprocess-info.py pathtrack /home/ubuntu/data/ 8 + python scripts/preprocess-info.py yt-walking /home/ubuntu/data/ 8 + go run scripts/preprocess-matches.go mot17 /home/ubuntu/data/ + go run scripts/preprocess-matches.go pathtrack /home/ubuntu/data/ + go run scripts/preprocess-matches.go yt-walking /home/ubuntu/data/ + +Train the model: + + mkdir /home/ubuntu/model/ + python train.py /home/ubuntu/data/ /home/ubuntu/model/model diff --git a/geom.py b/geom.py new file mode 100644 index 0000000..30e563c --- /dev/null +++ b/geom.py @@ -0,0 +1,335 @@ +import math +import numpy + +class Point(object): + def __init__(self, x, y): + self.x = int(x) + self.y = int(y) + + def distance(self, other): + dx = self.x - other.x + dy = self.y - other.y + return math.sqrt(dx * dx + dy * dy) + + def sub(self, other): + return Point(self.x - other.x, self.y - other.y) + + def add(self, other): + return Point(self.x + other.x, self.y + other.y) + + def scale(self, f): + return Point(self.x * f, self.y * f) + + def magnitude(self): + return math.sqrt(self.x * self.x + self.y * self.y) + + def angle_to(self, other): + if self.magnitude() == 0 or other.magnitude() == 0: + return 0 + s = (self.x * other.x + self.y * other.y) / self.magnitude() / other.magnitude() + if abs(s) > 1: s = s / abs(s) + angle = math.acos(s) + if angle > math.pi: + return 2 * math.pi - angle + else: + return angle + + def signed_angle(self, other): + return math.atan2(other.y, other.x) - math.atan2(self.y, self.x) + + def bounds(self): + return Rectangle(self, self) + + def dot(self, point): + return self.x * point.x + self.y * point.y + + def rotate(self, center, angle): + dx = self.x - center.x + dy = self.y - center.y + rx = math.cos(angle)*dx - math.sin(angle)*dy + ry = math.sin(angle)*dx + math.cos(angle)*dy + return Point(center.x + int(rx), center.y + int(ry)) + + def __repr__(self): + return 'Point({}, {})'.format(self.x, self.y) + + def __eq__(self, other): + return self.x == other.x and self.y == other.y + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + 
return hash((self.x, self.y)) + +class FPoint(object): + def __init__(self, x, y): + self.x = float(x) + self.y = float(y) + + def distance(self, other): + dx = self.x - other.x + dy = self.y - other.y + return math.sqrt(dx * dx + dy * dy) + + def sub(self, other): + return FPoint(self.x - other.x, self.y - other.y) + + def add(self, other): + return FPoint(self.x + other.x, self.y + other.y) + + def scale(self, f): + return FPoint(self.x * f, self.y * f) + + def scale_to_length(self, l): + return self.scale(l / self.magnitude()) + + def magnitude(self): + return math.sqrt(self.x * self.x + self.y * self.y) + + def angle_to(self, other): + if self.magnitude() == 0 or other.magnitude() == 0: + return 0 + s = (self.x * other.x + self.y * other.y) / self.magnitude() / other.magnitude() + if abs(s) > 1: s = s / abs(s) + angle = math.acos(s) + if angle > math.pi: + return 2 * math.pi - angle + else: + return angle + + def signed_angle(self, other): + return math.atan2(other.y, other.x) - math.atan2(self.y, self.x) + + def bounds(self): + return Rectangle(self, self) + + def dot(self, point): + return self.x * point.x + self.y * point.y + + def __repr__(self): + return 'FPoint({}, {})'.format(self.x, self.y) + + def to_point(self): + return Point(self.x, self.y) + + def __eq__(self, other): + return self.x == other.x and self.y == other.y + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash((self.x, self.y)) + +class Segment(object): + def __init__(self, start, end): + self.start = start + self.end = end + + def length(self): + return self.start.distance(self.end) + + def project_factor(self, point, line=False): + l = self.length() + if l == 0: + return 0 + t = point.sub(self.start).dot(self.end.sub(self.start)) / l + if not line: + t = max(0, min(l, t)) + return t + + def project(self, point, line=False): + t = self.project_factor(point, line=line) + return self.point_at_factor(t) + + def point_at_factor(self, t): + l = self.length() + if l == 0: + return self.start + return self.start.add(self.end.sub(self.start).scale(t / l)) + + def distance(self, point, line=False): + p = self.project(point, line=line) + return p.distance(point) + + def intersection(self, other): + d1 = self.vector() + d2 = other.vector() + d12 = other.start.sub(self.start) + + den = d1.y * d2.x - d1.x * d2.y + u1 = d1.x * d12.y - d1.y * d12.x + u2 = d2.x * d12.y - d2.y * d12.x + + if den == 0: + # collinear + if u1 == 0 and u2 == 0: + return self.start + else: + return None + + if float(u1) / den < 0 or float(u1) / den > 1 or float(u2) / den < 0 or float(u2) / den > 1: + return None + + return self.point_at_factor(float(u2) / den * self.length()) + + def vector(self): + return self.end.sub(self.start) + + def bounds(self): + return self.start.bounds().extend(self.end) + + def extend(self, amount): + v = self.vector() + v = v.scale(amount / v.magnitude()) + return Segment( + self.start.sub(v), + self.end.add(v) + ) + + def __repr__(self): + return 'Segment({}, {})'.format(self.start, self.end) + +class Rectangle(object): + def __init__(self, start, end): + self.start = start + self.end = end + + def lengths(self): + return Point(self.end.x - self.start.x, self.end.y - self.start.y) + + def clip(self, point): + npoint = Point(point.x, point.y) + if npoint.x < self.start.x: + npoint.x = self.start.x + elif npoint.x >= self.end.x: + npoint.x = self.end.x - 1 + if npoint.y < self.start.y: + npoint.y = self.start.y + elif npoint.y >= self.end.y: + npoint.y = self.end.y - 1 + return 
npoint + + def clip_rect(self, r): + return Rectangle(self.clip(r.start), self.clip(r.end)) + + def add_tol(self, tol): + return Rectangle( + self.start.sub(Point(tol, tol)), + self.end.add(Point(tol, tol)) + ) + + def contains(self, point): + return point.x >= self.start.x and point.x < self.end.x and point.y >= self.start.y and point.y < self.end.y + + def extend(self, point): + return Rectangle( + Point(min(self.start.x, point.x), min(self.start.y, point.y)), + Point(max(self.end.x, point.x), max(self.end.y, point.y)) + ) + + def extend_rect(self, rect): + return Rectangle( + Point(min(self.start.x, rect.start.x), min(self.start.y, rect.start.y)), + Point(max(self.end.x, rect.end.x), max(self.end.y, rect.end.y)) + ) + + def intersects(self, other): + return self.end.y >= other.start.y and other.end.y >= self.start.y and self.end.x >= other.start.x and other.end.x >= self.start.x + + def scale(self, f): + return Rectangle(self.start.scale(f), self.end.scale(f)) + + def intersection(self, other): + intersection = Rectangle( + Point(max(self.start.x, other.start.x), max(self.start.y, other.start.y)), + Point(min(self.end.x, other.end.x), min(self.end.y, other.end.y)) + ) + if intersection.end.x <= intersection.start.x: + intersection.end.x = intersection.start.x + if intersection.end.y <= intersection.start.y: + intersection.end.y = intersection.start.y + return intersection + + def area(self): + return (self.end.x - self.start.x) * (self.end.y - self.start.y) + + def iou(self, other): + intersect_area = self.intersection(other).area() + if intersect_area == 0: + return 0 + return float(intersect_area) / (self.area() + other.area() - intersect_area) + + def __repr__(self): + return 'Rectangle({}, {})'.format(self.start, self.end) + +def draw_line(start, end, lengths): + # followX indicates whether to move along x or y coordinates + followX = abs(end.y - start.y) <= abs(end.x - start.x) + if followX: + x0 = start.x + x1 = end.x + y0 = start.y + y1 = end.y + else: + x0 = start.y + x1 = end.y + y0 = start.x + y1 = end.x + + delta = Point(abs(x1 - x0), abs(y1 - y0)) + current_error = 0 + + if x0 < x1: + xstep = 1 + else: + xstep = -1 + + if y0 < y1: + ystep = 1 + else: + ystep = -1 + + points = [] + def add_point(p): + if p.x >= 0 and p.x < lengths.x and p.y >= 0 and p.y < lengths.y: + points.append(p) + + x = x0 + y = y0 + + while x != x1 + xstep: + if followX: + add_point(Point(x, y)) + else: + add_point(Point(y, x)) + + x += xstep + current_error += delta.y + if current_error >= delta.x: + y += ystep + current_error -= delta.x + + return points + +def draw_lines(segments, im=None, shape=None): + from eyediagram._brescount import bres_segments_count + if not shape: + if not im: + raise Exception('shape or im must be provided') + shape = im.shape + tmpim = numpy.zeros((shape[0], shape[1]), dtype='int32') + + sticks = numpy.zeros((len(segments), 4), dtype='int32') + for i, segment in enumerate(segments): + sticks[i] = [segment.start.x, segment.start.y, segment.end.x, segment.end.y] + bres_segments_count(sticks, tmpim) + tmpim = tmpim > 0 + if im: + return numpy.logical_or(im, tmpim) + else: + return tmpim + +def vector_from_angle(angle, length): + return Point(math.cos(angle) * length, math.sin(angle) * length) diff --git a/infer.py b/infer.py new file mode 100644 index 0000000..f68d097 --- /dev/null +++ b/infer.py @@ -0,0 +1,349 @@ +import geom +import model + +import json +import numpy +import math +import os +import skimage.io, skimage.transform +import sys +import tensorflow as tf +import 
time + +MODEL_PATH = sys.argv[1] +data_path = sys.argv[2] + +model.BATCH_SIZE = 1 +model.SEQ_LEN = 2 + +SKIP = 2 +MAX_AGE = 10 +MODE = 'imsp' + +LABELS = ['MOT17-{}-SDP'.format(x) for x in ['01', '03', '06', '07', '08', '12', '14']] +DETECTION_PATH = data_path + '/mot17/test/{}/det/det-filter60.json' +FRAME_PATH = data_path + '/mot17/test/{}/img1/' +OUT_PATH = data_path + '/mot17/test/{}/det/uns20.json' + +ORIG_WIDTH = 1920 +ORIG_HEIGHT = 1080 +DETECTION_SCALE = 1 +FRAME_SCALE = 1 +CROP_SIZE = 64 +HIDDEN_SIZE = 4*64 + +print('initializing model') +m = model.Model(options={'mode': MODE}) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +session = tf.Session(config=config) + +m.saver.restore(session, MODEL_PATH) + +def get_frame_fname(frame_idx): + s = str(frame_idx) + while len(s) < 6: + s = '0' + s + return s + '.jpg' + +for label in LABELS: + detection_path = DETECTION_PATH.format(label) + print('loading detections from {}'.format(detection_path)) + with open(detection_path, 'r') as f: + raw_detections = json.load(f) + + # auto-detect im width/height + for frame_idx, dlist in enumerate(raw_detections): + if not dlist or len(dlist) == 0: + continue + im = skimage.io.imread('{}/{}'.format(FRAME_PATH.format(label), get_frame_fname(frame_idx))) + im_bounds = geom.Rectangle(geom.Point(0, 0), geom.Point(im.shape[1]*FRAME_SCALE, im.shape[0]*FRAME_SCALE)) + break + + detections = [None for _ in range(len(raw_detections))] + for frame_idx, dlist in enumerate(raw_detections): + if not dlist or frame_idx % SKIP != 0: + continue + detections[frame_idx] = [] + for i, d in enumerate(dlist): + rect = geom.Rectangle( + geom.Point(d['left']//DETECTION_SCALE, d['top']//DETECTION_SCALE), + geom.Point(d['right']//DETECTION_SCALE, d['bottom']//DETECTION_SCALE) + ) + rect = im_bounds.clip_rect(rect) + if rect.lengths().x < 4 or rect.lengths().y < 4: + continue + nd = { + 'left': rect.start.x, + 'top': rect.start.y, + 'right': rect.end.x, + 'bottom': rect.end.y, + 'frame_idx': d['frame_idx'], + } + detections[frame_idx].append(nd) + + def zip_frame_info(detections, frame_idx): + im = skimage.io.imread('{}/{}'.format(FRAME_PATH.format(label), get_frame_fname(frame_idx))) + im_bounds = geom.Rectangle( + geom.Point(0, 0), + geom.Point(im.shape[0], im.shape[1]) + ) + info = [] + for detection in detections: + rect = geom.Rectangle( + geom.Point(detection['top']//FRAME_SCALE, detection['left']//FRAME_SCALE), + geom.Point(detection['bottom']//FRAME_SCALE, detection['right']//FRAME_SCALE) + ) + crop = im[rect.start.x:rect.end.x, rect.start.y:rect.end.y, :] + resize_factor = min([float(CROP_SIZE) / crop.shape[0], float(CROP_SIZE) / crop.shape[1]]) + crop = (skimage.transform.resize(crop, [int(crop.shape[0] * resize_factor), int(crop.shape[1] * resize_factor)])*255).astype('uint8') + fix_crop = numpy.zeros((CROP_SIZE, CROP_SIZE, 3), dtype='uint8') + fix_crop[0:crop.shape[0], 0:crop.shape[1], :] = crop + detection['width'] = float(detection['right']-detection['left'])/ORIG_WIDTH + detection['height'] = float(detection['bottom']-detection['top'])/ORIG_HEIGHT + info.append((detection, fix_crop)) + return info + + def get_loc(detection): + cx = (detection['left'] + detection['right']) / 2 + cy = (detection['top'] + detection['bottom']) / 2 + cx = float(cx) / ORIG_WIDTH + cy = float(cy) / ORIG_HEIGHT + return cx, cy + + def get_stuff(infos): + def per_info(info): + images = [] + boxes = [] + for i, (detection, crop) in enumerate(info): + images.append(crop) + cx, cy = get_loc(detection) + boxes.append([cx, 
cy, detection['width'], detection['height']]) + detections = [get_loc(detection) for detection, _ in info] + return images, boxes, detections, len(info) + + all_images = [] + all_boxes = [] + all_detections = [] + all_counts = [] + for info in infos: + images, boxes, detections, count = per_info(info) + all_images.extend(images) + all_boxes.extend(boxes) + all_detections.append(detections) + all_counts.append(count) + + return all_images, all_boxes, all_detections, all_counts + + def softmax(X, theta = 1.0, axis = None): + y = numpy.atleast_2d(X) + if axis is None: + axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) + y = y * float(theta) + y = y - numpy.expand_dims(numpy.max(y, axis = axis), axis) + y = numpy.exp(y) + ax_sum = numpy.expand_dims(numpy.sum(y, axis = axis), axis) + p = y / ax_sum + if len(X.shape) == 1: p = p.flatten() + return p + + # list of objects (id, detection_idx in latest frame, prev_hidden, time since last match) + # note: detection_idx should be len(info)+1 for the terminal vertex + active_objects = None + track_counter = 0 + for frame_idx in range(0, len(detections)-SKIP, SKIP): + if not detections[frame_idx] or not detections[frame_idx+SKIP]: + active_objects = None + continue + + print(frame_idx, len(detections)) + info1 = zip_frame_info(detections[frame_idx], frame_idx) + info2 = zip_frame_info(detections[frame_idx+SKIP], frame_idx+SKIP) + + if len(info1) == 0 or len(info2) == 0: + active_objects = None + continue + + images1, boxes1, _, counts1 = get_stuff([info1]) + images2, boxes2, _, counts2 = get_stuff([info2]) + + if active_objects is None: + active_objects = [] + for left_idx in range(len(info1)): + active_objects.append(( + track_counter, + left_idx, + numpy.zeros((HIDDEN_SIZE,), dtype='float32'), + 0, + [images1[left_idx]], + )) + detections[frame_idx][left_idx]['track_id'] = track_counter + track_counter += 1 + + ''' + outputs_raw, out_mat, cur_hidden, out_logits, mat_finesp, mat_longim = session.run([m.out_mat_reweight, m.out_mat, m.out_hidden, m.out_logits_finesp, m.out_mat_finesp, m.out_mat_longim], feed_dict=feed_dict) + + # take maximum in outputs_raw along the active indices + outputs = numpy.zeros((len(active_objects), len(info2)+1), dtype='float32') + for i, obj in enumerate(active_objects): + cur_finesp = mat_finesp[active_indices[i], :].max(axis=0) + cur_longim = mat_longim[active_indices[i], :].max(axis=0) + outputs[i, :] = cur_finesp + cur_longim + #outputs[i, 0:len(info2)] = outputs_raw[active_indices[i], 0:len(info2)].max(axis=0) + #outputs[i, len(info2)] = outputs_raw[active_indices[i], len(info2)].min() + ''' + + if MODE == 'imsp' or MODE == 'finesp' or MODE == 'longim': + # flatten the active objects since each object may have multiple images + flat_images = [] + flat_boxes = [] + flat_hidden = [] + active_indices = {} + for i, obj in enumerate(active_objects): + active_indices[i] = [] + for j in [1, 2, 4, 8, 16]: + #for j in range(1, len(obj[4])+1, len(obj[4])//5+1): + if len(obj[4]) < j: + continue + # use image from stored history, but use current box + active_indices[i].append(len(flat_images)) + flat_images.append(obj[4][-j]) + if obj[1] < len(info1): + flat_boxes.append(boxes1[obj[1]]) + else: + flat_boxes.append(numpy.zeros((4,), dtype='float32')) + flat_hidden.append(obj[2]) + + feed_dict = { + m.raw_images: flat_images + images2, + m.input_boxes: flat_boxes + boxes2, + m.n_image: [[len(flat_images), len(images2), 0]], + m.is_training: False, + m.infer_sel: range(len(flat_images)), + m.infer_hidden: flat_hidden, + } + 
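+            # Score every stored crop of each active object against the detections in
+            # the next frame. The long-term image head (longim) scores appearance
+            # similarity; the fine spatial head (finesp) scores spatial consistency and
+            # carries the recurrent hidden state. Below, per-object logits are averaged
+            # over the object's image history (min for the terminal column), converted
+            # to match matrices via the minimum of row-wise and column-wise softmax,
+            # and, for MODE 'imsp', the two heads are fused with an element-wise minimum.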
+ longim_logits, finesp_logits, pre_cur_hidden = session.run([m.out_logits_longim, m.out_logits_finesp, m.out_hidden], feed_dict=feed_dict) + longim_out_logits = numpy.zeros((len(active_objects), len(info2)+1), dtype='float32') + finesp_out_logits = numpy.zeros((len(active_objects), len(info2)+1), dtype='float32') + cur_hidden = numpy.zeros((len(active_objects), len(info2)+1, HIDDEN_SIZE), dtype='float32') + for i, obj in enumerate(active_objects): + longim_out_logits[i, 0:len(info2)] = longim_logits[active_indices[i], 0:len(info2)].mean(axis=0) + longim_out_logits[i, len(info2)] = longim_logits[active_indices[i], len(info2)].min() + finesp_out_logits[i, 0:len(info2)] = finesp_logits[active_indices[i], 0:len(info2)].mean(axis=0) + finesp_out_logits[i, len(info2)] = finesp_logits[active_indices[i], len(info2)].min() + cur_hidden[i, :, :] = pre_cur_hidden[active_indices[i][0], :, :] + #longim_mat = softmax(longim_out_logits, axis=1) + #finesp_mat = softmax(finesp_out_logits, axis=1) + longim_mat = numpy.minimum(softmax(longim_out_logits, axis=0), softmax(longim_out_logits, axis=1)) + finesp_mat = numpy.minimum(softmax(finesp_out_logits, axis=0), softmax(finesp_out_logits, axis=1)) + outputs = numpy.minimum(longim_mat, finesp_mat) + #outputs = numpy.minimum(longim_out_logits, finesp_out_logits) + #outputs = (longim_out_logits+finesp_out_logits)/2 + if MODE == 'finesp': + outputs = finesp_mat + elif MODE == 'longim': + outputs = longim_mat + else: + feed_dict = { + m.raw_images: images1 + images2, + m.input_boxes: boxes1 + boxes2, + m.is_training: False, + m.infer_sel: [obj[1] for obj in active_objects], + m.infer_hidden: [obj[2] for obj in active_objects], + } + if MODE == 'occl': + feed_dict[m.a_counts] = [len(images1), len(images2)] + else: + feed_dict[m.n_image] = [[len(images1), len(images2), 0]] + outputs, out_mat, out_logits, cur_hidden = session.run([m.out_mat_reweight, m.out_mat, m.out_logits, m.out_hidden], feed_dict=feed_dict) + outputs = out_mat + + # vote on best next frame: idx1->(output,idx2) + votes = {} + for i in range(len(active_objects)): + for j in range(len(info2)+1): + output = outputs[i, j] + #if j == len(info2) and out_logits[active_indices[i][0], :].argmax() == len(info2): + #if j == len(info2) and longim_out_logits[:, outputs[i, :].argmax()].argmax() != i: + #if j == len(info2) and longim_out_logits[i, :].max() < 1: + if MODE == 'imsp' and j != len(info2) and (longim_out_logits[i, j] < 0 or finesp_out_logits[i, j] < 0): + output = -100.0 + elif MODE == 'finesp' and j != len(info2) and finesp_out_logits[i, j] < 0: + output = -100.0 + #if j == len(info2): + # output = -2 + if i not in votes or output > votes[i][0]: + if j < len(info2): + votes[i] = (output, j) + else: + votes[i] = (output, None) + # group by receiver and vote on max idx2->idx1 to eliminate duplicates + votes2 = {} + for idx1, t in votes.items(): + output, idx2 = t + if idx2 is not None and (idx2 not in votes2 or output > votes2[idx2][0]): + votes2[idx2] = (output, idx1) + forward_matches = {idx1: idx2 for (idx2, (_, idx1)) in votes2.items()} + + def get_hidden(idx1, idx2): + if model.__name__ == 'occl3b_model': + return cur_hidden[idx1, :] + else: + return cur_hidden[idx1, idx2, :] + + new_objects = [] + used_idx2s = set() + for idx1, obj in enumerate(active_objects): + if idx1 in forward_matches: + idx2 = forward_matches[idx1] + new_objects.append(( + obj[0], + idx2, + get_hidden(idx1, idx2), + #numpy.zeros((64,), dtype='float32'), + 0, + obj[4] + [images2[idx2]], + )) + used_idx2s.add(idx2) + 
detections[frame_idx+SKIP][idx2]['track_id'] = obj[0] + elif obj[3] < MAX_AGE: + idx2 = votes[idx1][1] + if idx2 is None or True: + idx2 = len(info2) + new_objects.append(( + obj[0], + idx2, + get_hidden(idx1, idx2), + #numpy.zeros((64,), dtype='float32'), + obj[3]+1, + obj[4], + )) + + for idx2 in range(len(info2)): + if idx2 in used_idx2s: + continue + new_objects.append(( + track_counter, + idx2, + numpy.zeros((HIDDEN_SIZE,), dtype='float32'), + 0, + [images2[idx2]], + )) + detections[frame_idx+SKIP][idx2]['track_id'] = track_counter + track_counter += 1 + + active_objects = new_objects + + ndetections = [None for _ in detections] + for frame_idx, dlist in enumerate(detections): + if not dlist: + continue + dlist = [d for d in dlist if 'track_id' in d] + if not dlist: + continue + ndetections[frame_idx] = dlist + detections = ndetections + + with open(OUT_PATH.format(label), 'w') as f: + json.dump(detections, f) diff --git a/model.py b/model.py new file mode 100644 index 0000000..c6c480d --- /dev/null +++ b/model.py @@ -0,0 +1,489 @@ +import numpy +import tensorflow as tf +import os +import os.path +import random +import math +import time + +BATCH_SIZE = 1 +SEQ_LEN = 17 +KERNEL_SIZE = 3 + +class Model: + def _conv_layer(self, name, input_var, stride, in_channels, out_channels, options = {}): + activation = options.get('activation', 'relu') + dropout = options.get('dropout', None) + padding = options.get('padding', 'SAME') + batchnorm = options.get('batchnorm', False) + transpose = options.get('transpose', False) + + with tf.variable_scope(name) as scope: + if not transpose: + filter_shape = [KERNEL_SIZE, KERNEL_SIZE, in_channels, out_channels] + else: + filter_shape = [KERNEL_SIZE, KERNEL_SIZE, out_channels, in_channels] + kernel = tf.get_variable( + 'weights', + shape=filter_shape, + initializer=tf.truncated_normal_initializer(stddev=math.sqrt(2.0 / KERNEL_SIZE / KERNEL_SIZE / in_channels)), + dtype=tf.float32 + ) + biases = tf.get_variable( + 'biases', + shape=[out_channels], + initializer=tf.constant_initializer(0.0), + dtype=tf.float32 + ) + if not transpose: + output = tf.nn.bias_add( + tf.nn.conv2d( + input_var, + kernel, + [1, stride, stride, 1], + padding=padding + ), + biases + ) + else: + batch = tf.shape(input_var)[0] + side = tf.shape(input_var)[1] + output = tf.nn.bias_add( + tf.nn.conv2d_transpose( + input_var, + kernel, + [batch, side * stride, side * stride, out_channels], + [1, stride, stride, 1], + padding=padding + ), + biases + ) + if batchnorm: + output = tf.contrib.layers.batch_norm(output, center=True, scale=True, is_training=self.is_training, decay=0.99) + if dropout is not None: + output = tf.nn.dropout(output, keep_prob=1-dropout) + + if activation == 'relu': + return tf.nn.relu(output, name=scope.name) + elif activation == 'sigmoid': + return tf.nn.sigmoid(output, name=scope.name) + elif activation == 'none': + return output + else: + raise Exception('invalid activation {} specified'.format(activation)) + + def _fc_layer(self, name, input_var, input_size, output_size, options = {}): + activation = options.get('activation', 'relu') + dropout = options.get('dropout', None) + batchnorm = options.get('batchnorm', False) + + with tf.variable_scope(name) as scope: + weights = tf.get_variable( + 'weights', + shape=[input_size, output_size], + initializer=tf.truncated_normal_initializer(stddev=math.sqrt(2.0 / input_size)), + dtype=tf.float32 + ) + biases = tf.get_variable( + 'biases', + shape=[output_size], + initializer=tf.constant_initializer(0.0), + 
dtype=tf.float32 + ) + output = tf.matmul(input_var, weights) + biases + if batchnorm: + output = tf.contrib.layers.batch_norm(output, center=True, scale=True, is_training=self.is_training, decay=0.99) + if dropout is not None: + output = tf.nn.dropout(output, keep_prob=1-dropout) + + if activation == 'relu': + return tf.nn.relu(output, name=scope.name) + elif activation == 'sigmoid': + return tf.nn.sigmoid(output, name=scope.name) + elif activation == 'none': + return output + else: + raise Exception('invalid activation {} specified'.format(activation)) + + def __init__(self, options={}): + tf.reset_default_graph() + self.options = options + + self.is_training = tf.placeholder(tf.bool) + self.raw_images = tf.placeholder(tf.uint8, [None, 64, 64, 3]) + self.input_images = tf.cast(self.raw_images, tf.float32)/255.0 + self.input_boxes = tf.placeholder(tf.float32, [None, 4]) + self.n_image = tf.placeholder(tf.int32, [BATCH_SIZE, SEQ_LEN+1]) + self.input_masks = tf.placeholder(tf.float32, [None]) + self.match_length = tf.placeholder(tf.int32) + self.learning_rate = tf.placeholder(tf.float32) + + # for inference + self.infer_sel = tf.placeholder(tf.int32, [None]) + self.infer_hidden = tf.placeholder(tf.float32, [None, 256]) + + # extract masks + self.masks = [] + s = 0 + for batch in range(BATCH_SIZE): + n_first = self.n_image[batch, 0] + n_last = self.n_image[batch, self.match_length] + cur_count = n_first*(n_last+1) + cur_mask = tf.reshape(self.input_masks[s:s+cur_count], [n_first, n_last+1]) + self.masks.append(cur_mask) + s += cur_count + + if SEQ_LEN < 4: + stuffs = [] + for i in range(4): + with tf.variable_scope('ensemble' + str(i)): + stuff = self.make_part(options, infer_hidden=self.infer_hidden[:, 64*i:64*(i+1)]) + stuffs.append(stuff) + + if options.get('infer_op', 'mean') == 'min': + self.out_mat_finesp = tf.reduce_min([stuff[0] for stuff in stuffs], axis=0) + self.out_logits_finesp = tf.reduce_min([stuff[1] for stuff in stuffs], axis=0) + self.out_mat_longim = tf.reduce_min([stuff[2] for stuff in stuffs], axis=0) + self.out_logits_longim = tf.reduce_min([stuff[3] for stuff in stuffs], axis=0) + self.out_mat = tf.reduce_min([stuff[4] for stuff in stuffs], axis=0) + self.out_mat_reweight = tf.reduce_min([stuff[5] for stuff in stuffs], axis=0) + else: + self.out_mat_finesp = tf.reduce_mean([stuff[0] for stuff in stuffs], axis=0) + self.out_logits_finesp = tf.reduce_mean([stuff[1] for stuff in stuffs], axis=0) + self.out_mat_longim = tf.reduce_mean([stuff[2] for stuff in stuffs], axis=0) + self.out_logits_longim = tf.reduce_mean([stuff[3] for stuff in stuffs], axis=0) + self.out_mat = tf.reduce_mean([stuff[4] for stuff in stuffs], axis=0) + self.out_mat_reweight = tf.reduce_mean([stuff[5] for stuff in stuffs], axis=0) + + self.out_hidden = tf.concat([stuff[6] for stuff in stuffs], axis=2) + else: + longim_losses = [] + finesp_losses = [] + for i in range(4): + with tf.variable_scope('ensemble' + str(i)): + longim_loss, finesp_loss = self.make_part(options) + longim_losses.append(longim_loss) + finesp_losses.append(finesp_loss) + + self.longim_loss = tf.reduce_mean(longim_losses) + self.finesp_loss = tf.reduce_mean(finesp_losses) + + with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): + self.longim_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.longim_loss) + self.finesp_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.finesp_loss) + + self.init_op = tf.initialize_all_variables() + self.saver = 
tf.train.Saver(max_to_keep=None) + + def make_part(self, options, infer_hidden=None): + c_image = options.get('c_image', 64) + c_spatial = 4 + c_features = c_image+c_spatial + c_rnn = 64 + + # CNN for long-term image + layer1 = self._conv_layer('layer1', self.input_images, 2, 3, 64) # -> 32x32x64 + layer2 = self._conv_layer('layer2', layer1, 2, 64, c_image) # -> 16x16x64 + layer3 = self._conv_layer('layer3', layer2, 2, c_image, c_image) # -> 8x8x64 + layer4 = self._conv_layer('layer4', layer3, 2, c_image, c_image) # -> 4x4x64 + layer5 = self._conv_layer('layer5', layer4, 2, c_image, c_image) # -> 2x2x64 + layer6 = self._conv_layer('layer6', layer5, 2, c_image, c_image, {'activation': 'none'})[:, 0, 0, :] + + features = [[] for _ in range(BATCH_SIZE)] + s = 0 + for batch in range(BATCH_SIZE): + for i in range(SEQ_LEN+1): + cur_count = self.n_image[batch, i] + cur_features = tf.concat([ + self.input_boxes[s:s+cur_count, :], + layer6[s:s+cur_count, :], + ], axis=1) + cur_features = tf.concat([ + cur_features, + tf.zeros([1, c_features], dtype=tf.float32), + ], axis=0) + features[batch].append(cur_features) + s += cur_count + + # MATCHER + # context is longim or finesp + def matcher(pairs, context): + with tf.variable_scope('matcher' + context, reuse=tf.AUTO_REUSE): + im_pairs = tf.concat([pairs[:, 0:c_rnn], pairs[:, c_rnn+4:c_rnn+c_features], pairs[:, c_rnn+c_features+4:]], axis=1) + if options.get('spatial_rel', False): + sp1 = pairs[:, c_rnn:c_rnn+4] + sp2 = pairs[:, c_rnn+c_features:c_rnn+c_features+4] + spatial_pairs = tf.concat([ + pairs[:, 0:c_rnn], + sp1[:, 0:2] - sp2[:, 0:2], + sp1[:, 2:4], + sp2[:, 0:2] - sp1[:, 0:2], + sp2[:, 2:4], + ], axis=1) + else: + spatial_pairs = tf.concat([pairs[:, 0:c_rnn+4], pairs[:, c_rnn+c_features:c_rnn+c_features+4]], axis=1) + + if context == 'longim': + matcher1 = self._fc_layer('matcher1', im_pairs, c_rnn+2*c_image, 256) + matcher2 = self._fc_layer('matcher2', matcher1, 256, 65, {'activation': 'none'}) + return matcher2 + elif context == 'finesp': + matcher1 = self._fc_layer('matcher1', spatial_pairs, c_rnn+2*c_spatial, 256) + matcher2 = self._fc_layer('matcher2', matcher1, 256, 128) + matcher3 = self._fc_layer('matcher3', matcher2, 128, 128) + matcher4 = self._fc_layer('matcher4', matcher3, 128, 1, {'activation': 'none'}) + + matcher5 = self._fc_layer('matcher5', spatial_pairs, c_rnn+2*c_spatial, 256) + matcher6 = self._fc_layer('matcher6', matcher1, 256, 128) + matcher7 = self._fc_layer('matcher7', matcher2, 128, 128) + matcher8 = self._fc_layer('matcher8', matcher3, 128, c_rnn, {'activation': 'none'}) + + return tf.concat([matcher4, matcher8], axis=1) + elif context == 'combined': + matcher1 = self._fc_layer('matcher1', pairs, c_rnn+2*c_features, 256) + matcher2 = self._fc_layer('matcher2', matcher1, 256, 128) + matcher3 = self._fc_layer('matcher3', matcher2, 128, 128) + matcher4 = self._fc_layer('matcher4', matcher3, 128, 1, {'activation': 'none'}) + + matcher5 = self._fc_layer('matcher5', pairs, c_rnn+2*c_features, 256) + matcher6 = self._fc_layer('matcher6', matcher1, 256, 128) + matcher7 = self._fc_layer('matcher7', matcher2, 128, 128) + matcher8 = self._fc_layer('matcher8', matcher3, 128, c_rnn, {'activation': 'none'}) + + return tf.concat([matcher4, matcher8], axis=1) + + # logit replacing matching some detection with the zero (null/fake) detection + no_match_logit = tf.get_variable('no_match_logit', shape=[1], initializer=tf.constant_initializer(0.0), dtype=tf.float32) + + def get_mat_hidden(n_prev, n_next, rnn_features, prev_features, 
next_features, context, incl_logits=False, do_neg=True): + do_neg = do_neg and options.get('do_neg', True) + if do_neg: + # include min(n_next, n_neg) negative examples where we borrow the next spatial features + # now we also include n_prev previous images as negatives + #fake_next1 = tf.minimum(n_prev, n_next) + #fake_next2 = tf.maximum(0, n_prev - n_next) + fake_next1 = n_prev + fake_next2 = tf.minimum(n_next, self.n_image[0, SEQ_LEN]) + fake_next = fake_next1 + fake_next2 + n_next += fake_next + #neg_features1 = tf.concat([ + # next_features[0:fake_next1, 0:c_spatial], + # prev_features[0:fake_next1, c_spatial:], + #], axis=1) + #neg_features2 = tf.concat([ + # prev_features[0:fake_next2, 0:c_spatial], + # prev_features[fake_next1:n_prev, c_spatial:], + #], axis=1) + neg_features1 = prev_features + neg_features2 = tf.concat([ + next_features[0:fake_next2, 0:c_spatial], + features[0][SEQ_LEN][0:fake_next2, c_spatial:], + ], axis=1) + next_features = tf.concat([neg_features1, neg_features2, next_features], axis=0) + + cur_pairs = tf.concat([ + tf.tile( + tf.reshape(rnn_features, [n_prev, 1, c_rnn]), + [1, n_next+1, 1] + ), + tf.tile( + tf.reshape(prev_features, [n_prev, 1, c_features]), + [1, n_next+1, 1] + ), + tf.tile( + tf.reshape(next_features, [1, n_next+1, c_features]), + [n_prev, 1, 1] + ), + ], axis=2) + + cur_pairs = tf.reshape(cur_pairs, [n_prev*(n_next+1), c_rnn+2*c_features]) + cur_outputs = matcher(cur_pairs, context=context) + cur_outputs = tf.reshape(cur_outputs, [n_prev, n_next+1, 1+c_rnn]) + + cur_logits = cur_outputs[:, :, 0] + if options.get('no_match_logit', True): + cur_logits = tf.concat([ + cur_logits[:, :-1], + tf.tile(tf.reshape(no_match_logit, [1, 1]), [n_prev, 1]), + ], axis=1) + + if do_neg: + # need to eliminate logits that are connecting the same features + # these are in the first n_prev x n_prev of the matrix + elim_mat = tf.eye(num_rows=n_prev, num_columns=n_next+1) + cur_logits = (cur_logits*(1-elim_mat)) - 50*elim_mat + + if options.get('linearnorm', False): + # multiply rows and columns by a factor so that they add up to at most 1 + # we do rows first, then columns + cur_mat = tf.nn.sigmoid(cur_logits) + cur_mat = tf.concat([ + cur_mat[:, :-1], + tf.maximum(1-tf.reduce_sum(cur_mat[:, :-1], axis=1, keepdims=True), tf.maximum(0.01, no_match_logit)), + ], axis=1) + row_factors = 1.0/tf.maximum(1.0, tf.reduce_sum(cur_mat, axis=1, keepdims=True)) + cur_mat *= tf.tile(row_factors, [1, n_next+1]) + col_factors = 1.0/tf.maximum(1.0, tf.reduce_sum(cur_mat, axis=0, keepdims=True)) + cur_mat *= tf.tile(col_factors, [n_prev, 1]) + else: + cur_mat = tf.math.minimum( + tf.nn.softmax(cur_logits, axis=0), + tf.nn.softmax(cur_logits, axis=1) + ) + cur_hidden = cur_outputs[:, :, 1:] + + if do_neg: + cur_logits = cur_logits[:, fake_next:] + cur_mat = cur_mat[:, fake_next:] + cur_hidden = cur_hidden[:, fake_next:, :] + + if incl_logits: + return cur_mat, cur_hidden, cur_logits + else: + return cur_mat, cur_hidden + + def index_list(l, idx, out_shape): + flatlist = [] + sums = [0] + for t in l: + flat = tf.reshape(t, [-1]) + flatlist.append(flat) + sums.append(sums[-1] + tf.shape(flat)[0]) + flatlist = tf.concat(flatlist, axis=0) + sums = tf.stack(sums, axis=0) + output = flatlist[sums[idx]:sums[idx+1]] + return tf.reshape(output, out_shape) + + def terminal_reweight(mat): + mat_term = mat[:, -1] + factor = tf.minimum(1.0/(tf.reduce_sum(mat_term)+1e-2), tf.cast(tf.shape(mat)[0], tf.float32)) + mat_term = mat_term * factor + row_maxes = 1 - tf.reduce_sum(mat[:, :-1], 
axis=1) + row_maxes = tf.maximum(row_maxes, 0) + mat_term = tf.minimum(mat_term, row_maxes) + return tf.concat([mat[:, :-1], tf.reshape(mat_term, [-1, 1])], axis=1) + + def get_recur_sel(mat): + if options.get('simple_sel', False): + return tf.argmax(mat, axis=1, output_type=tf.int32) + def f(mat): + # take argmax along rows (over columns) + # but only use it if it is higher value than other rows in same column + row_argmax = numpy.argmax(mat, axis=1) + col_argmax = numpy.argmax(mat, axis=0) + out = row_argmax + for i in range(out.shape[0]): + if col_argmax[out[i]] != i: + out[i] = mat.shape[1]-1 + return out.astype('int32') + + sel = tf.py_func(f, [mat], tf.int32, stateful=False) + return sel + + def compute_loss(mat1, mat2, batch, apply_mask=True): + if apply_mask: + mask = self.masks[batch] + else: + mask = tf.ones(tf.shape(mat1), dtype=tf.float32) + + epsilon = options.get('epsilon', 1e-8) + if options.get('no_terminal', False): + loss = -tf.reduce_mean(tf.log(tf.reduce_sum(mat1[:, :-1] * mat2[:, :-1] * mask[:, :-1], axis=1) + epsilon)) + elif options.get('terminal_reweight', True): + loss = -tf.reduce_mean(tf.log(tf.reduce_sum(terminal_reweight(mat1) * terminal_reweight(mat2) * mask, axis=1) + epsilon)) + else: + loss = -tf.reduce_mean(tf.log(tf.reduce_sum(mat1 * mat2 * mask, axis=1) + epsilon)) + + return loss + + + if SEQ_LEN < 4: + # inference + n_prev = tf.shape(self.infer_sel)[0] + n_next = self.n_image[0, 1] + rnn_features = infer_hidden + prev_features = tf.gather(features[0][0], self.infer_sel, axis=0) + next_features = features[0][1] + + out_mat_finesp, out_hidden, out_logits_finesp = get_mat_hidden(n_prev, n_next, rnn_features, prev_features, next_features, 'finesp', incl_logits=True, do_neg=False) + out_mat_longim, _, out_logits_longim = get_mat_hidden(n_prev, n_next, tf.zeros(tf.shape(rnn_features), dtype=tf.float32), prev_features, next_features, 'longim', incl_logits=True, do_neg=False) + out_mat = tf.minimum(out_mat_finesp, out_mat_longim) + + if options.get('terminal_reweight', True): + out_mat_reweight = terminal_reweight(out_mat) + + return out_mat_finesp, out_logits_finesp, out_mat_longim, out_logits_longim, out_mat, out_mat_reweight, out_hidden + + + finesp_indices = [] + for i in range(SEQ_LEN-1): + finesp_indices.append((i, i+1)) + + # LONGIM + extra_mats = [[] for _ in range(BATCH_SIZE)] + extra_mats_finesp = [[] for _ in range(BATCH_SIZE)] + for batch in range(BATCH_SIZE): + n_prev = self.n_image[batch, 0] + n_next = self.n_image[batch, self.match_length] + rnn_features = tf.zeros((n_prev, 1, c_rnn), dtype=tf.float32) + prev_features = features[batch][0][:-1, :] + # next_features = features[batch][match_length] + next_features = index_list(features[batch], self.match_length, [n_next+1, c_features]) + cur_mat, _ = get_mat_hidden(n_prev, n_next, rnn_features, prev_features, next_features, 'longim') + extra_mats[batch].append(cur_mat) + + for i in range(SEQ_LEN-1): + # for extra_mats_finesp we always have SEQ_LEN inputs + n_prev = self.n_image[batch, 0] + n_next = self.n_image[batch, i+1] + rnn_features = tf.zeros((n_prev, 1, c_rnn), dtype=tf.float32) + prev_features = features[batch][0][:-1, :] + next_features = features[batch][i+1] + cur_mat, _ = get_mat_hidden(n_prev, n_next, rnn_features, prev_features, next_features, 'longim', do_neg=False) + extra_mats_finesp[batch].append(cur_mat) + + # FINESP (note: this can't be executed with variable matchlen, at least for now) + finesp_mats = [[] for _ in range(BATCH_SIZE)] + finesp_hiddens = [[] for _ in 
range(BATCH_SIZE)] + for batch in range(BATCH_SIZE): + for prev_idx, next_idx in finesp_indices: + n_next = self.n_image[batch, next_idx] + if prev_idx == 0: + n_prev = self.n_image[batch, prev_idx] + prev_features = features[batch][prev_idx][:-1, :] + rnn_features = tf.zeros((n_prev, 1, c_rnn), dtype=tf.float32) + else: + n_prev = self.n_image[batch, 0] + if options.get('follow_longim', False): + sel = get_recur_sel(extra_mats_finesp[batch][prev_idx-1]) + else: + sel = get_recur_sel(finesp_mats[batch][-1]) + rnn_sel = tf.stack([ + tf.range(n_prev, dtype=tf.int32), + sel, + ], axis=1) + prev_features = tf.gather(features[batch][prev_idx], sel, axis=0) + rnn_features = tf.gather_nd(finesp_hiddens[batch][-1], rnn_sel) + + cur_mat, cur_hidden = get_mat_hidden(n_prev, n_next, rnn_features, prev_features, features[batch][next_idx], 'finesp', do_neg=False) + finesp_mats[batch].append(cur_mat) + finesp_hiddens[batch].append(cur_hidden) + + # longim loss + longim_losses = [] + for batch in range(BATCH_SIZE): + mat = extra_mats[batch][0] + loss = compute_loss(mat, mat, batch) + longim_losses.append(loss) + longim_loss = tf.reduce_mean(longim_losses) + + # finespatial loss + finesp_losses = [] + for batch in range(BATCH_SIZE): + for i, finesp_mat in enumerate(finesp_mats[batch]): + extra_mat = tf.stop_gradient(extra_mats_finesp[batch][i]) + loss = compute_loss(finesp_mat, extra_mat, batch, apply_mask=(i==SEQ_LEN-2)) + finesp_losses.append(loss) + finesp_loss = tf.reduce_mean(finesp_losses) + + return longim_loss, finesp_loss diff --git a/scripts/filter_short.py b/scripts/filter_short.py new file mode 100644 index 0000000..8b04281 --- /dev/null +++ b/scripts/filter_short.py @@ -0,0 +1,29 @@ +import json +import sys + +in_fname = sys.argv[1] +out_fname = sys.argv[2] + +with open(in_fname, 'r') as f: + detections = json.load(f) + +# get tracks +track_map = {} +for dlist in detections: + if dlist is None: + continue + for detection in dlist: + track_id = detection['track_id'] + if track_id not in track_map: + track_map[track_id] = [] + track_map[track_id].append(detection) + +ndetections = [[] for _ in detections] +for track in track_map.values(): + if len(track) <= 3: + continue + for detection in track: + ndetections[detection['frame_idx']].append(detection) + +with open(out_fname, 'w') as f: + json.dump(ndetections, f) diff --git a/scripts/filter_small.py b/scripts/filter_small.py new file mode 100644 index 0000000..f491427 --- /dev/null +++ b/scripts/filter_small.py @@ -0,0 +1,37 @@ +import json +import os +import skimage.io +import sys + +in_fname = sys.argv[1] +frame_path = sys.argv[2] +out_fname = sys.argv[3] + +with open(in_fname, 'r') as f: + detections = json.load(f) + +frame_fname = [fname for fname in os.listdir(frame_path) if fname.endswith('.jpg')][0] +im = skimage.io.imread(frame_path + frame_fname) + +ndetections = [[] for _ in detections] +for frame_idx, dlist in enumerate(detections): + if dlist is None: + continue + for detection in dlist: + if detection['left'] < 0: + detection['left'] = 0 + if detection['right'] >= im.shape[1]: + detection['right'] = im.shape[1]-1 + if detection['top'] < 0: + detection['top'] = 0 + if detection['bottom'] >= im.shape[0]: + detection['bottom'] = im.shape[0]-1 + + if detection['right'] - detection['left'] <= 4: + continue + elif detection['bottom'] - detection['top'] <= 4: + continue + ndetections[frame_idx].append(detection) + +with open(out_fname, 'w') as f: + json.dump(ndetections, f) diff --git a/scripts/interpolate.py b/scripts/interpolate.py new 
file mode 100644 index 0000000..9e3769c --- /dev/null +++ b/scripts/interpolate.py @@ -0,0 +1,46 @@ +import json +import sys + +in_fname = sys.argv[1] +out_fname = sys.argv[2] + +with open(in_fname, 'r') as f: + detections = json.load(f) + +# get tracks +track_map = {} +for dlist in detections: + if dlist is None: + continue + for detection in dlist: + track_id = detection['track_id'] + if track_id not in track_map: + track_map[track_id] = [] + track_map[track_id].append(detection) + +# interpolate tracks +ndetections = [[] for _ in detections] +for track in track_map.values(): + ntrack = [] + for detection in track: + if len(ntrack) > 0: + prev = ntrack[-1] + next = detection + jump = next['frame_idx'] - prev['frame_idx'] + for i in range(1, jump): + prev_weight = float(jump-i) / float(jump) + next_weight = float(i) / float(jump) + interp = { + 'track_id': prev['track_id'], + 'frame_idx': prev['frame_idx']+i, + } + for k in ['left', 'top', 'right', 'bottom']: + interp[k] = int(prev[k]*prev_weight + next[k]*next_weight) + ntrack.append(interp) + ntrack.append(detection) + + for detection in ntrack: + ndetections[detection['frame_idx']].append(detection) + +with open(out_fname, 'w') as f: + json.dump(ndetections, f) diff --git a/scripts/json2mot.py b/scripts/json2mot.py new file mode 100644 index 0000000..2230a76 --- /dev/null +++ b/scripts/json2mot.py @@ -0,0 +1,45 @@ +import json +import os, os.path +import sys +import subprocess + +data_path = sys.argv[1] +split = sys.argv[2] +out_path = sys.argv[3] + +MODE = 'uns20' + +labels = [fname for fname in os.listdir(data_path + '/mot17/{}/'.format(split))] +labels = [label for label in labels if 'SDP' in label] +labels.sort() + +for label in labels: + subprocess.call([ + 'python', 'scripts/filter_short.py', + data_path + '/mot17/{}/{}/det/{}.json'.format(split, label, MODE), + data_path + '/mot17/{}/{}/det/{}-noshort.json'.format(split, label, MODE), + ]) + +for label in labels: + subprocess.call([ + 'python', 'scripts/interpolate.py', + data_path + '/mot17/{}/{}/det/{}-noshort.json'.format(split, label, MODE), + data_path + '/mot17/{}/{}/det/{}-interp.json'.format(split, label, MODE), + ]) + +for label in labels: + with open(data_path + '/mot17/{}/{}/det/{}-interp.json'.format(split, label, MODE), 'r') as f: + detections = json.load(f) + + lines = [] + for frame_idx, dlist in enumerate(detections): + if dlist is None: + continue + for d in dlist: + w = d['right'] - d['left'] + h = d['bottom'] - d['top'] + line = "{},{},{},{},{},{},-1,-1,-1,-1".format(d['frame_idx'], d['track_id']+1, d['left'], d['top'], w, h) + lines.append(line) + lines.append("") + with open(os.path.join(out_path, '{}.txt'.format(label)), 'w') as f: + f.write("\n".join(lines)) diff --git a/scripts/mot2json.py b/scripts/mot2json.py new file mode 100644 index 0000000..2903eff --- /dev/null +++ b/scripts/mot2json.py @@ -0,0 +1,49 @@ +import json +import subprocess +import sys + +data_path = sys.argv[1] +split = sys.argv[2] + +if split == 'train': + LABELS = ['02', '04', '05', '09', '10', '11', '13'] +elif split == 'test': + LABELS = ['01', '03', '06', '07', '08', '12', '14'] + +for label in LABELS: + detections = [] + with open(data_path + '/mot17/{}/MOT17-{}-SDP/det/det.txt'.format(split, label), 'r') as f: + lines = f.readlines() + for line in lines: + parts = line.strip().split(',') + if len(parts) < 7: + continue + frame_idx = int(parts[0]) + track_id = int(parts[1]) + left = int(float(parts[2])) + top = int(float(parts[3])) + right = left + int(float(parts[4])) + bottom = 
top + int(float(parts[5])) + score = float(parts[6]) + if score < 0.6: + continue + while frame_idx >= len(detections): + detections.append([]) + detections[frame_idx].append({ + 'frame_idx': frame_idx, + 'track_id': track_id, + 'left': left, + 'top': top, + 'right': right, + 'bottom': bottom, + }) + + fname = data_path + '/mot17/{}/MOT17-{}-SDP/det/det-filter60.json'.format(split, label) + with open(fname, 'w') as f: + json.dump(detections, f) + subprocess.call([ + 'python', 'scripts/filter_small.py', + fname, + data_path + '/mot17/{}/MOT17-{}-SDP/img1/'.format(split, label), + fname, + ]) diff --git a/scripts/pathtrack.py b/scripts/pathtrack.py new file mode 100644 index 0000000..07d091a --- /dev/null +++ b/scripts/pathtrack.py @@ -0,0 +1,39 @@ +import json +import os +import subprocess +import sys + +data_path = sys.argv[1] + +os.makedirs(data_path + 'pathtrack/json/', exist_ok=True) + +labels = os.listdir(data_path + '/pathtrack/train/') +for i, label in enumerate(labels): + print(label, i, len(labels)) + + detections = [] + with open(data_path + '/pathtrack/train/{}/det/det_rcnn.txt'.format(label), 'r') as f: + lines = [line.strip() for line in f.readlines() if line.strip()] + for line in lines: + parts = line.split(',') + frame_idx = int(float(parts[0])) + while len(detections) <= frame_idx: + detections.append([]) + left = int(float(parts[2])) + top = int(float(parts[3])) + right = left+int(float(parts[4])) + bottom = top+int(float(parts[5])) + score = float(parts[6]) + if score < 0.5: + continue + detections[frame_idx].append({ + 'left': left, + 'top': top, + 'right': right, + 'bottom': bottom, + 'frame_idx': frame_idx, + 'track_id': -1, + }) + + with open(data_path + '/pathtrack/json/{}.json'.format(label), 'w') as f: + json.dump(detections, f) diff --git a/scripts/preprocess-info.py b/scripts/preprocess-info.py new file mode 100644 index 0000000..78bcb71 --- /dev/null +++ b/scripts/preprocess-info.py @@ -0,0 +1,158 @@ +import json +import math +import multiprocessing +import numpy +import os, os.path +import pickle +import skimage.io +import skimage.transform +import sys + +sys.path.append('.') +import geom + +dataset = sys.argv[1] +data_path = sys.argv[2] +nthreads = int(sys.argv[3]) + +ORIG_WIDTH = 1920 +ORIG_HEIGHT = 1080 +SKIP = 1 +FRAME_SCALE = 1 +CROP_SIZE = 64 + +if dataset == 'pathtrack': + LABELS = [label for label in os.listdir(data_path + '/pathtrack/frames/')] + FRAME_PATH = data_path + '/pathtrack/frames/{}/' + DETECTION_PATH = data_path + '/pathtrack/json/{}.json' + PICKLE_PATH = data_path + '/pathtrack/pickle-info/{}.pkl' +elif dataset == 'yt-walking': + LABELS = [label for label in os.listdir(data_path + '/yt-walking/frames/')] + FRAME_PATH = data_path + '/yt-walking/frames/{}/' + DETECTION_PATH = data_path + '/yt-walking/json/{}.json' + PICKLE_PATH = data_path + '/yt-walking/pickle-info/{}.pkl' +elif dataset == 'mot17': + LABELS = [label for label in os.listdir(data_path + '/mot17/frames/')] + FRAME_PATH = data_path + '/mot17/frames/{}/' + DETECTION_PATH = data_path + '/mot17/json/{}.json' + PICKLE_PATH = data_path + '/mot17/pickle-info/{}.pkl' + +os.makedirs(os.path.dirname(PICKLE_PATH), exist_ok=True) + +def get_frame_fname(frame_idx): + s = str(frame_idx) + while len(s) < 6: + s = '0' + s + return s + '.jpg' + +def to_rect(detection): + return geom.Rectangle( + geom.Point(detection['left'], detection['top']), + geom.Point(detection['right'], detection['bottom']), + ) + +MAX_MATCH_AGE = 5 +def get_potential_matches(detections, first_frame, last_frame): + # from 
detection_idx in frame #first_idx to iterable of matching tuples (frame, det_idx) + cur_matches = {} + for idx in range(len(detections[first_frame])): + cur_matches[idx] = [(first_frame, idx)] + + for right_frame in range(first_frame+1, last_frame+1): + # list the detections we need to match + check_set = set() + for l in cur_matches.values(): + check_set.update(l) + + connections = {} + for left_frame, left_idx in check_set: + connections[(left_frame, left_idx)] = [] + + for right_idx in range(len(detections[right_frame])): + rect1 = to_rect(detections[left_frame][left_idx]) + rect2 = to_rect(detections[right_frame][right_idx]) + intersect_area = rect1.intersection(rect2).area() + if intersect_area < 0: + intersect_area = 0 + union_area = rect1.area() + rect2.area() - intersect_area + iou_score = float(intersect_area) / float(union_area) + if iou_score > 0.1: + connections[(left_frame, left_idx)].append((right_frame, right_idx)) + + for idx in cur_matches: + new_matches = set() + for left_frame, left_idx in cur_matches[idx]: + new_matches.update(connections[(left_frame, left_idx)]) + if right_frame - left_frame < MAX_MATCH_AGE: + new_matches.add((left_frame, left_idx)) + cur_matches[idx] = new_matches + + final_matches = {} + for idx, matches in cur_matches.items(): + final_matches[idx] = [right_idx for right_frame, right_idx in matches if right_frame == last_frame] + return final_matches + +def zip_frame_info(detections, label, frame_idx): + if not detections: + return [] + frame_path = FRAME_PATH.format(label) + im = skimage.io.imread('{}/{}'.format(frame_path, get_frame_fname(frame_idx))) + im_bounds = geom.Rectangle( + geom.Point(0, 0), + geom.Point(im.shape[0], im.shape[1]) + ) + info = [] + for idx, detection in enumerate(detections): + rect = geom.Rectangle( + geom.Point(detection['top']/FRAME_SCALE, detection['left']/FRAME_SCALE), + geom.Point(detection['bottom']/FRAME_SCALE, detection['right']/FRAME_SCALE) + ) + if rect.lengths().x < 4 or rect.lengths().y < 4: + continue + crop = im[rect.start.x:rect.end.x, rect.start.y:rect.end.y, :] + resize_factor = min([float(CROP_SIZE) / crop.shape[0], float(CROP_SIZE) / crop.shape[1]]) + resize_shape = [int(crop.shape[0] * resize_factor), int(crop.shape[1] * resize_factor)] + if resize_shape[0] == 0 or resize_shape[1] == 0: + continue + crop = (skimage.transform.resize(crop, resize_shape)*255).astype('uint8') + fix_crop = numpy.zeros((CROP_SIZE, CROP_SIZE, 3), dtype='uint8') + fix_crop[0:crop.shape[0], 0:crop.shape[1], :] = crop + detection['width'] = float(detection['right']-detection['left'])/ORIG_WIDTH + detection['height'] = float(detection['bottom']-detection['top'])/ORIG_HEIGHT + info.append((detection, fix_crop, idx)) + return info + +def process(label): + pickle_path = PICKLE_PATH.format(label) + if os.path.exists(pickle_path): + return + print('reading from {}'.format(label)) + with open(DETECTION_PATH.format(label), 'r') as f: + detections = json.load(f) + + if not detections: + return + + frame_infos = {} + #matches = {} + for frame_idx in range(0, len(detections), SKIP): + if frame_idx % 30000 > 10000: + continue + print(label, frame_idx) + if not detections[frame_idx]: + continue + frame_infos[frame_idx] = zip_frame_info(detections[frame_idx], label, frame_idx) + #for match_len in [10, 15, 25, 35, 45, 55, 65]: + # frame_range = range(frame_idx, frame_idx+match_len+1) + # if not all([detections[i] is not None and len(detections[i]) > 0 for i in frame_range]): + # continue + # for i in frame_range: + # frame_infos[i] = 
zip_frame_info(detections[i], label, i) + # matches[(frame_idx, frame_idx+match_len)] = get_potential_matches(detections, frame_idx, frame_idx+match_len) + + with open(pickle_path, 'wb') as f: + pickle.dump(frame_infos, f) + +p = multiprocessing.Pool(nthreads) +p.map(process, LABELS) +p.close() diff --git a/scripts/preprocess-matches.go b/scripts/preprocess-matches.go new file mode 100644 index 0000000..556af67 --- /dev/null +++ b/scripts/preprocess-matches.go @@ -0,0 +1,212 @@ +package main + +import ( + "github.com/mitroadmaps/gomapinfer/common" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "strings" +) + +type Detection struct { + Left int `json:"left"` + Top int `json:"top"` + Right int `json:"right"` + Bottom int `json:"bottom"` +} + +func (d Detection) Rect() common.Rectangle { + return common.Rectangle{ + common.Point{float64(d.Left), float64(d.Top)}, + common.Point{float64(d.Right), float64(d.Bottom)}, + } +} + +const Skip int = 1 +const MaxMatchAge int = 10 +const Padding float64 = 10 +var MatchLengths = []int{2, 4, 8, 16, 32, 64} +var MaxMatchLength = MatchLengths[len(MatchLengths) - 1] + +// Returns map idx (in frameIdx) -> (frame, det_idx) +func matchFrom(detections [][]Detection, frameIdx int) map[int]map[[2]int]bool { + // from detection_idx in frameIdx to list of matching tuples (frame, det_idx) + curMatches := make(map[int]map[[2]int]bool) + finalMatches := make(map[int]map[[2]int]bool) + for idx := range detections[frameIdx] { + curMatches[idx] = make(map[[2]int]bool) + finalMatches[idx] = make(map[[2]int]bool) + curMatches[idx][[2]int{frameIdx, idx}] = true + finalMatches[idx][[2]int{frameIdx, idx}] = true + } + + lastFrame := frameIdx + MaxMatchLength + for rightFrame := frameIdx + 1; rightFrame <= lastFrame; rightFrame++ { + // find the detections we need to match + checkSet := make(map[[2]int]bool) + for _, matches := range curMatches { + for t := range matches { + if rightFrame - t[0] > MaxMatchAge { + continue + } + checkSet[t] = true + } + } + + connections := make(map[[2]int][][2]int) + for left := range checkSet { + leftFrame, leftIdx := left[0], left[1] + for rightIdx := 0; rightIdx < len(detections[rightFrame]); rightIdx++ { + leftRect := detections[leftFrame][leftIdx].Rect().AddTol(Padding) + rightRect := detections[rightFrame][rightIdx].Rect().AddTol(Padding) + intersectArea := leftRect.Intersection(rightRect).Area() + if intersectArea < 0 { + intersectArea = 0 + } + unionArea := leftRect.Area() + rightRect.Area() - intersectArea + iouScore := intersectArea / unionArea + if iouScore < 0.1 { + continue + } + connections[left] = append(connections[left], [2]int{rightFrame, rightIdx}) + } + } + + for idx, matches := range curMatches { + for t := range matches { + if rightFrame - t[0] >= MaxMatchAge { + delete(matches, t) + } + } + for left := range matches { + for _, right := range connections[left] { + matches[right] = true + finalMatches[idx][right] = true + } + } + } + } + + return finalMatches +} + +func process(jsonPath string, matchPath string, label string) { + bytes, err := ioutil.ReadFile(fmt.Sprintf(jsonPath, label)) + if err != nil { + panic(err) + } + var detections [][]Detection + if err := json.Unmarshal(bytes, &detections); err != nil { + panic(err) + } + + // match length -> frameIdx -> det_idx in frameIdx -> list of det_idx in (frameIdx+match length) + matches := make(map[int]map[int]map[int][]int) + mlSet := make(map[int]bool) + for _, matchLength := range MatchLengths { + mlSet[matchLength] = true + matches[matchLength] = 
make(map[int]map[int][]int) + } + + n := 18 + ch := make(chan int) + donech := make(chan map[int]map[int]map[int][]int) + for i := 0; i < n; i++ { + go func() { + threadMatches := make(map[int]map[int]map[int][]int) + for _, matchLength := range MatchLengths { + threadMatches[matchLength] = make(map[int]map[int][]int) + } + for baseFrame := range ch { + ok := true + for frameIdx := baseFrame; frameIdx <= baseFrame + MaxMatchLength; frameIdx++ { + if len(detections[frameIdx]) == 0 { + ok = false + } + } + if !ok { + continue + } + frameMatches := matchFrom(detections, baseFrame) + for curIdx := range frameMatches { + for right := range frameMatches[curIdx] { + matchLength := right[0] - baseFrame + rightIdx := right[1] + if !mlSet[matchLength] { + continue + } + if threadMatches[matchLength][baseFrame] == nil { + threadMatches[matchLength][baseFrame] = make(map[int][]int) + } + threadMatches[matchLength][baseFrame][curIdx] = append(threadMatches[matchLength][baseFrame][curIdx], rightIdx) + } + } + } + donech <- threadMatches + }() + } + for baseFrame := 0; baseFrame < len(detections) - MaxMatchLength; baseFrame += Skip { + fmt.Printf("%d/%d\n", baseFrame, len(detections)) + ch <- baseFrame + } + close(ch) + for i := 0; i < n; i++ { + threadMatches := <- donech + for matchLength := range threadMatches { + for baseFrame := range threadMatches[matchLength] { + for curIdx := range threadMatches[matchLength][baseFrame] { + if matches[matchLength][baseFrame] == nil { + matches[matchLength][baseFrame] = make(map[int][]int) + } + matches[matchLength][baseFrame][curIdx] = threadMatches[matchLength][baseFrame][curIdx] + } + } + } + } + + bytes, err = json.Marshal(matches) + if err != nil { + panic(err) + } + if err := ioutil.WriteFile(fmt.Sprintf(matchPath, label), bytes, 0644); err != nil { + panic(err) + } +} + +func main() { + dataset := os.Args[1] + dataPath := os.Args[2] + + var framePath, jsonPath, matchPath string + if dataset == "pathtrack" { + framePath = dataPath + "/pathtrack/frames/" + jsonPath = dataPath + "/pathtrack/json/%s.json" + matchPath = dataPath + "/pathtrack/pickle-info/%s.matches.json" + } else if dataset == "yt-walking" { + framePath = dataPath + "/yt-walking/frames/" + jsonPath = dataPath + "/yt-walking/json/%s.json" + matchPath = dataPath + "/yt-walking/pickle-info/%s.matches.json" + } else if dataset == "mot17" { + framePath = dataPath + "/mot17/frames/" + jsonPath = dataPath + "/mot17/json/%s.json" + matchPath = dataPath + "/mot17/pickle-info/%s.matches.json" + } + + var labels []string + files, err := ioutil.ReadDir(framePath) + if err != nil { + panic(err) + } + for _, fi := range files { + if strings.Contains(framePath, "beach") && !strings.HasPrefix(fi.Name(), "2019-") { + continue + } else if fi.Name() == "json" || fi.Name() == "pickle-info" { + continue + } + labels = append(labels, fi.Name()) + } + for _, label := range labels { + process(jsonPath, matchPath, label) + } +} diff --git a/scripts/symlink.py b/scripts/symlink.py new file mode 100644 index 0000000..4d74eda --- /dev/null +++ b/scripts/symlink.py @@ -0,0 +1,21 @@ +import os +import subprocess +import sys + +dataset = sys.argv[1] +data_path = sys.argv[2] + +if dataset == 'mot17': + os.makedirs(data_path + '/mot17/json', exist_ok=True) + os.makedirs(data_path + '/mot17/frames', exist_ok=True) + for split in ['train', 'test']: + labels = os.listdir(data_path + 'mot17/{}'.format(split)) + labels = [label for label in labels if 'SDP' in label] + for label in labels: + subprocess.call(['ln', '-s', data_path + 
'/mot17/{}/{}/det/det-filter60.json'.format(split, label), data_path + '/mot17/json/{}.json'.format(label)]) + subprocess.call(['ln', '-s', data_path + '/mot17/{}/{}/img1'.format(split, label), data_path + '/mot17/frames/{}'.format(label)]) +elif dataset == 'pathtrack': + os.makedirs(data_path + '/pathtrack/frames', exist_ok=True) + labels = os.listdir(data_path + 'pathtrack/train') + for label in labels: + subprocess.call(['ln', '-s', data_path + '/pathtrack/train/{}/img1'.format(label), data_path + '/pathtrack/frames/{}'.format(label)]) diff --git a/scripts/ytw-extract.py b/scripts/ytw-extract.py new file mode 100644 index 0000000..45bafe0 --- /dev/null +++ b/scripts/ytw-extract.py @@ -0,0 +1,16 @@ +import os +import subprocess +import sys + +data_path = sys.argv[1] + +processes = [] +for fname in os.listdir(data_path+'/yt-walking/'): + if not fname.endswith('.mp4'): + continue + label = fname.split('.')[0] + os.makedirs(data_path+'/yt-walking/frames/' + label, exist_ok=True) + p = subprocess.Popen(['ffmpeg', '-threads', '3', '-i', data_path+'/yt-walking/'+fname, '-vf', 'fps=10,scale=960:540', '-q:v', '1', data_path+'/yt-walking/frames/' + label + '/%06d.jpg']) + processes.append(p) +for p in processes: + p.wait() diff --git a/scripts/ytw-maskrcnn.py b/scripts/ytw-maskrcnn.py new file mode 100644 index 0000000..be77f56 --- /dev/null +++ b/scripts/ytw-maskrcnn.py @@ -0,0 +1,84 @@ +import os +import sys +import random +import math +import numpy as np +import skimage.io +import matplotlib +import matplotlib.pyplot as plt + +ROOT_DIR = os.path.abspath("../") +sys.path.append(ROOT_DIR) # To find local version of the library +from mrcnn import utils +import mrcnn.model as modellib +from mrcnn import visualize +sys.path.append(os.path.join(ROOT_DIR, "samples/coco/")) # To find local version +import coco + +MODEL_DIR = os.path.join(ROOT_DIR, "logs") +COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5") + +class InferenceConfig(coco.CocoConfig): + GPU_COUNT = 1 + IMAGES_PER_GPU = 1 + +config = InferenceConfig() +model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config) +model.load_weights(COCO_MODEL_PATH, by_name=True) +class_names = [ + 'BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', + 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', + 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', + 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', + 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', + 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', + 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', + 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', + 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', + 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', + 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', + 'teddy bear', 'hair drier', 'toothbrush', +] + +data_path = sys.argv[1] + +import json +import os +import subprocess +FRAME_PATH = data_path + '/yt-walking/frames/' +JSON_PATH = data_path + '/yt-walking/json/' +BATCH_SIZE = 1 +labels = os.listdir(FRAME_PATH) +labels.sort() +for label in labels[0:1]: + print('processing', label) + im_path = FRAME_PATH + label + '/' + fnames = os.listdir(im_path) + detections = [] + for i in range(0, len(fnames), BATCH_SIZE): + print(label, i, len(fnames)) + batch 
= fnames[i:i+BATCH_SIZE] + ims = [skimage.io.imread(im_path + fname) for fname in batch] + results = model.detect(ims) + for j in range(len(batch)): + frame_idx = int(batch[j].split('.')[0]) + while len(detections) <= frame_idx: + detections.append([]) + for roi, class_id in zip(results[j]['rois'], results[j]['class_ids']): + if int(class_id) != 1: + continue + detections[frame_idx].append({ + 'frame_idx': frame_idx, + 'left': int(roi[1]), + 'top': int(roi[0]), + 'right': int(roi[3]), + 'bottom': int(roi[2]), + 'track_id': -1, + }) + + json_fname = JSON_PATH + label + '.json' + with open(json_fname, 'w') as f: + json.dump(detections, f) diff --git a/train.py b/train.py new file mode 100644 index 0000000..109fe03 --- /dev/null +++ b/train.py @@ -0,0 +1,307 @@ +import geom +import model + +import json +import math +import numpy +import os +import pickle +import random +import skimage.io, skimage.transform +import sys +import tensorflow as tf +import time + +data_path = sys.argv[1] +model_path = sys.argv[2] + +ORIG_WIDTH = 1920 +ORIG_HEIGHT = 1080 +FRAME_SCALE = 1 +CROP_SIZE = 64 +MATCH_LENGTHS = [4, 16] +ADD_NEGATIVES = True +MODE = 'imsp-longim' + +SKIPS = [1] +DATASETS = [ + ( + os.listdir(data_path + '/yt-walking/frames/'), + data_path + '/yt-walking/frames/{}/', + data_path + '/yt-walking/pickle-info/{}.pkl', + data_path + '/yt-walking/pickle-info/{}.matches.json', + 2.0, + [1, 2], + ), + ( + os.listdir(data_path + '/pathtrack/frames/'), + data_path + '/pathtrack/frames/{}/', + data_path + '/pathtrack/pickle-info/{}.pkl', + data_path + '/pathtrack/pickle-info/{}.matches.json', + 1.5, + [2, 4], + ), + ( + os.listdir(data_path + '/mot17/frames/'), + data_path + '/mot17/frames/{}/', + data_path + '/mot17/pickle-info/{}.pkl', + data_path + '/mot17/pickle-info/{}.matches.json', + 1.0, + [2, 4], + ), +] +val_fn = lambda example: hash(example[4]) % 20 == 0 and 'MOT' not in example[4] + +def get_frame_fname(frame_idx): + s = str(frame_idx) + while len(s) < 6: + s = '0' + s + return s + '.jpg' + +def get_loc(detection): + cx = (detection['left'] + detection['right']) / 2 + cy = (detection['top'] + detection['bottom']) / 2 + cx = float(cx) / ORIG_WIDTH + cy = float(cy) / ORIG_HEIGHT + return cx, cy + +def to_rect(detection): + return geom.Rectangle( + geom.Point(detection['left'], detection['top']), + geom.Point(detection['right'], detection['bottom']), + ) + +def get_stuff(infos, matches): + def per_info(info): + images = [] + boxes = numpy.zeros((len(info), 4), dtype='float32') + for i, (detection, crop, _) in enumerate(info): + images.append(crop) + cx, cy = get_loc(detection) + boxes[i, :] = [cx, cy, detection['width'], detection['height']] + detections = [get_loc(detection) for detection, _, _ in info] + return images, boxes, detections, len(info) + + all_images = [] + all_boxes = [] + all_detections = [] + all_counts = [] + for i, info in enumerate(infos): + images, boxes, detections, count = per_info(info) + all_images.append(images) + all_boxes.append(boxes) + all_detections.append(detections) + all_counts.append(count) + + all_masks = [] + for i, match_len in enumerate(MATCH_LENGTHS): + last_idx = match_len + mask = numpy.zeros((len(infos[0]), len(infos[last_idx])+1), dtype='float32') + mask[:, len(infos[last_idx])] = 1 + first_map = {} + for j, (_, _, orig_idx) in enumerate(infos[0]): + first_map[orig_idx] = j + last_map = {} + for j, (_, _, orig_idx) in enumerate(infos[last_idx]): + last_map[orig_idx] = j + for left_idx in matches[i]: + if left_idx not in first_map: + continue + for 
right_idx in matches[i][left_idx]: + if right_idx not in last_map: + continue + mask[first_map[left_idx], last_map[right_idx]] = 1 + all_masks.append(mask.flatten()) + + return all_images, all_boxes, all_detections, all_counts, all_masks + +print('loading infos and matches') +all_frame_data = {} +for labels, frame_tmpl, pickle_tmpl, match_tmpl, detection_scale, _ in DATASETS: + for label in labels: + pickle_path = pickle_tmpl.format(label) + match_path = match_tmpl.format(label) + print('... {} (pickle)'.format(label)) + with open(pickle_path, 'rb') as f: + frame_infos = pickle.load(f, encoding='latin1') + for info_list in frame_infos.values(): + for info in info_list: + info[0]['left'] *= detection_scale + info[0]['top'] *= detection_scale + info[0]['right'] *= detection_scale + info[0]['bottom'] *= detection_scale + info[0]['width'] *= detection_scale + info[0]['height'] *= detection_scale + print('... {} (matches)'.format(label)) + with open(match_path, 'r') as f: + raw_matches = json.load(f, encoding='latin1') + frame_matches = {} + for match_len in raw_matches: + frame_matches[int(match_len)] = {} + for frame_idx in raw_matches[match_len]: + frame_matches[int(match_len)][int(frame_idx)] = {} + for left_idx in raw_matches[match_len][frame_idx]: + frame_matches[int(match_len)][int(frame_idx)][int(left_idx)] = raw_matches[match_len][frame_idx][left_idx] + all_frame_data[label] = (frame_infos, frame_matches) + +print('preparing random info generator') +labels_and_weights = [(label, len(all_frame_data[label][0])) for label in all_frame_data.keys()] +def get_random_info(exclude_label): + labels = [label for label in all_frame_data.keys() if label != exclude_label] + weights = [len(all_frame_data[label][0]) for label in labels] + weight_sum = sum(weights) + weights = [float(x)/float(weight_sum) for x in weights] + while True: + label = numpy.random.choice(labels, p=weights) + frame_infos = all_frame_data[label][0] + frame_idx = random.choice(list(frame_infos.keys())) + if len(frame_infos[frame_idx]) > 4: + return frame_infos[frame_idx] + +# each example is tuple (images, boxes, n_image, label, frame_idx, skip) +print('extracting examples') +all_examples = [] +for labels, frame_tmpl, _, _, _, skips in DATASETS: + for label in labels: + frame_path = frame_tmpl.format(label) + frame_infos, frame_matches = all_frame_data[label] + + for i, frame_idx in enumerate(frame_infos.keys()): + print('...', label, i, len(frame_infos)) + + skip = random.choice(skips) + match_lengths = [skip*match_len for match_len in MATCH_LENGTHS] + + infos = [frame_infos.get(frame_idx+l*skip, None) for l in range(model.SEQ_LEN)] + if any([(info is None or len(info) == 0) for info in infos]): + continue + elif any([frame_idx not in frame_matches[match_len] for match_len in match_lengths]): + continue + + if ADD_NEGATIVES: + neg_info = get_random_info(label) + else: + neg_info = [] + + matches = [frame_matches[match_len][frame_idx] for match_len in match_lengths] + images, boxes, detections, counts, mask = get_stuff(infos + [neg_info], matches) + all_examples.append(( + images, boxes, counts, mask, + label, frame_idx, detections, frame_path, skip, + )) + +random.shuffle(all_examples) +val_examples = [example for example in all_examples if val_fn(example)] +if len(val_examples) > 1024: + val_examples = random.sample(val_examples, 1024) +train_examples = [example for example in all_examples if not val_fn(example) and min(example[2][:-1]) >= 6] + +best_loss = None + +def train(learning_rate, num_epochs): + global best_loss + + 
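+    # One call to train() runs num_epochs epochs at a fixed learning rate. Each epoch
+    # samples 2048 training examples in batches of model.BATCH_SIZE, drawing a match
+    # length per batch from MATCH_LENGTHS (always the longest one in 'imsp-finesp'
+    # mode), then measures the loss on val_examples and saves the checkpoint to
+    # model_path whenever the validation loss improves on best_loss.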
print('training mode={} at lr={} for {} epochs'.format(MODE, learning_rate, num_epochs)) + for epoch in range(num_epochs): + start_time = time.time() + train_losses = [] + for _ in range(2048//model.BATCH_SIZE): + if MODE == 'imsp-finesp': + match_len = max(MATCH_LENGTHS) + else: + match_len = random.choice(MATCH_LENGTHS) + + batch = [] + for example in random.sample(train_examples, model.BATCH_SIZE): + imlists = example[0][0:match_len+1] + [example[0][model.SEQ_LEN]] + boxlists = example[1][0:match_len+1] + [example[1][model.SEQ_LEN]] + counts = example[2][0:match_len+1] + [example[2][model.SEQ_LEN]] + mask = example[3][MATCH_LENGTHS.index(match_len)] + batch.append((imlists, boxlists, counts, mask)) + + imlists = [imlist for example in batch for imlist in example[0]] + boxlists = [boxlist for example in batch for boxlist in example[1]] + counts = [[] for _ in range(len(batch))] + for i, example in enumerate(batch): + counts[i] = example[2][0:match_len+1] + while len(counts[i]) < model.SEQ_LEN: + counts[i].append(0) + counts[i].append(example[2][-1]) + + images = [im for imlist in imlists for im in imlist] + boxes = [box for boxlist in boxlists for box in boxlist] + + masks = numpy.concatenate([example[3] for example in batch], axis=0) + feed_dict = { + m.raw_images: images, + m.input_boxes: boxes, + m.n_image: counts, + m.input_masks: masks, + m.match_length: match_len, + m.is_training: True, + m.learning_rate: learning_rate, + } + if MODE == 'imsp-longim': + _, loss = session.run([m.longim_optimizer, m.longim_loss], feed_dict=feed_dict) + elif MODE == 'imsp-finesp': + _, loss = session.run([m.finesp_optimizer, m.finesp_loss], feed_dict=feed_dict) + train_losses.append(loss) + train_loss = numpy.mean(train_losses) + train_time = time.time() + + val_losses = [] + for i in range(0, len(val_examples), model.BATCH_SIZE): + batch = val_examples[i:i+model.BATCH_SIZE] + images = [im for example in batch for imlist in example[0] for im in imlist] + boxes = [box for example in batch for boxlist in example[1] for box in boxlist] + counts = [example[2] for example in batch] + masks = numpy.concatenate([example[3][-1] for example in batch], axis=0) + feed_dict = { + m.raw_images: images, + m.input_boxes: boxes, + m.n_image: counts, + m.input_masks: masks, + m.match_length: model.SEQ_LEN-1, + m.is_training: False, + } + if MODE == 'imsp-longim': + loss = session.run(m.longim_loss, feed_dict=feed_dict) + elif MODE == 'imsp-finesp': + loss = session.run(m.finesp_loss, feed_dict=feed_dict) + val_losses.append(loss) + + val_loss = numpy.mean(val_losses) + val_time = time.time() + + print('iteration {}: train_time={}, val_time={}, train_loss={}, val_loss={}/{}'.format(epoch, int(train_time - start_time), int(val_time - train_time), train_loss, val_loss, best_loss)) + + if best_loss is None or val_loss < best_loss: + best_loss = val_loss + m.saver.save(session, model_path) + + +print('initializing model: longim') +m = model.Model(options={'mode': MODE}) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +session = tf.Session(config=config) +session.run(m.init_op) + +train(1e-3, 200) +train(1e-4, 200) +train(1e-5, 200) + +print('initializing model: finesp') +MODE = 'imsp-finesp' +session.close() +m = model.Model(options={'mode': MODE}) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +session = tf.Session(config=config) +m.saver.restore(session, model_path) + +train(1e-3, 200) +train(1e-4, 200) +train(1e-5, 200) + +print('done')
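+# Note on the schedule: the 'imsp-longim' stage trains from scratch with the learning
+# rate annealed 1e-3 -> 1e-4 -> 1e-5 (200 epochs each); the 'imsp-finesp' stage then
+# builds a fresh model, restores the checkpoint at model_path, and repeats the same
+# annealing. best_loss is not reset between stages, so the finesp stage only overwrites
+# the checkpoint when its validation loss improves on the longim best.
+# A minimal sketch of reloading the saved checkpoint later (same pattern as the finesp
+# initialization above; picking 'imsp-finesp' as the mode to load is an assumption):
+#
+#   m = model.Model(options={'mode': 'imsp-finesp'})
+#   session = tf.Session(config=tf.ConfigProto())
+#   m.saver.restore(session, model_path)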