diff --git a/datasetinsights/constants.py b/datasetinsights/constants.py
index 32c23604..000109eb 100644
--- a/datasetinsights/constants.py
+++ b/datasetinsights/constants.py
@@ -37,7 +37,6 @@
 # ...
 DEFAULT_DATA_ROOT = "/data"
 SYNTHETIC_SUBFOLDER = "synthetic"
-DEFAULT_PUBLIC_DATASET = "synthetic"
 
 # Default Unity Project ID where USim jobs was executed
 DEFAULT_PROJECT_ID = "474ba200-4dcc-4976-818e-0efd28efed30"
diff --git a/datasetinsights/datasets/coco.py b/datasetinsights/datasets/coco.py
index 8dde5b41..76eab7e7 100644
--- a/datasetinsights/datasets/coco.py
+++ b/datasetinsights/datasets/coco.py
@@ -1,5 +1,9 @@
+import fcntl
+import glob
+import json
 import logging
 import os
+import shutil
 import zipfile
 from pathlib import Path
 from typing import List, Tuple
@@ -13,6 +17,7 @@
 from datasetinsights.io.gcs import GCSClient
 
 from .base import Dataset
+from .exceptions import DatasetNotFoundError
 
 ANNOTATION_FILE_TEMPLATE = "{}_{}2017.json"
 COCO_GCS_PATH = "data/coco"
@@ -58,9 +63,9 @@ def convert_coco2canonical(coco_annotation):
     convert from a tuple of image and coco style dictionary describing the
     bboxes to a tuple of image, List of BBox2D
     Args:
-        coco_annotation (tuple): image and coco style dictionary
+        coco_annotation (tuple): image and coco style dictionary.
 
-    Returns: a tuple of image, List of BBox2D
+    Returns: a tuple of image, List of BBox2D.
 
     """
     image, targets = coco_annotation
@@ -74,34 +79,59 @@ def convert_coco2canonical(coco_annotation):
 
 
 class CocoDetection(Dataset):
-    """
-    http://cocodataset.org/#detection-2019
+    """COCO dataset for 2D object detection.
+
+    Before the class instantiation, it would assume that the COCO dataset is
+    downloaded.
+
+    See COCO dataset `documentation <http://cocodataset.org/#detection-2019>`_
+    for more details.
+
+    Attributes:
+        root (str): root path of the data.
+        transforms: callable transformation that applies to a pair of
+            capture, annotation. Capture is the information captured by the
+            sensor, in this case an image, and annotations, which in this
+            dataset are 2d bounding box coordinates and labels.
+        split (str): indicate split type of the dataset (train|val).
+        label_mappings (dict): a dict of {label_id: label_name} mapping.
+        coco (torchvision.datasets.CocoDetection): COCO dataset.
     """
 
     def __init__(
         self,
         *,
-        data_root=const.DEFAULT_DATA_ROOT,
+        data_path=const.DEFAULT_DATA_ROOT,
         split="train",
         transforms=None,
         remove_examples_without_boxes=True,
         **kwargs,
     ):
+        """
+        Args:
+            data_path (str): Directory of the dataset.
+            split (str): indicate split type of the dataset (train|val).
+            transforms: callable transformation that applies to a pair of
+                capture, annotation.
+            remove_examples_without_boxes (bool): whether to remove examples
+                without boxes. Defaults to True.
+        """
         # todo add test split
         self.split = split
-        self.root = os.path.join(data_root, COCO_LOCAL_PATH)
-        self.download()
+        self.root = data_path
+        self._preprocess_dataset(data_path=self.root, split=self.split)
         self.coco = self._get_coco(root=self.root, image_set=split)
         if remove_examples_without_boxes:
             self.coco = _coco_remove_images_without_annotations(
                 dataset=self.coco
             )
         self.transforms = transforms
+        self.label_mappings = self._get_label_mappings()
 
     def __getitem__(self, idx) -> Tuple[Image, List[BBox2D]]:
         """
         Args:
-            idx:
+            idx (int): index of the data.
 
         Returns:
             Image with list of bounding boxes found inside the image
@@ -143,6 +173,101 @@ def _get_local_annotations_zip(self):
     def _get_local_images_zip(self):
         return os.path.join(self.root, f"{self.split}2017.zip")
 
+    def _get_label_mappings(self):
+        """get label mappings.
+
+        Returns:
+            dict: A dict containing {label_id: label_name} mappings.
+        """
+        ann_file_name = (
+            Path(self.root) / "annotations" / f"instances_{self.split}2017.json"
+        )
+        label_mappings = {}
+        with open(ann_file_name, "r") as ann_file:
+            anns = json.load(ann_file)
+            for cat in anns["categories"]:
+                label_mappings[cat["id"]] = cat["name"]
+        return label_mappings
+
+    @staticmethod
+    def _preprocess_dataset(data_path, split):
+        """ Preprocess dataset inside data_path and un-archive if necessary.
+
+        Args:
+            data_path (str): Path where dataset is stored.
+            split (str): indicate split type of the dataset (train|val).
+
+        Returns:
+            Tuple: (unarchived img path, unarchived annotation path) when
+            both archives exist, otherwise data_path itself.
+
+        Raises:
+            DatasetNotFoundError: if the two archives are not both present
+                and no *.json and *.jpg files are found under data_path.
+        """
+
+        archive_img_file = Path(data_path) / f"{split}2017.zip"
+        archive_ann_file = Path(data_path) / "annotations_trainval2017.zip"
+        if archive_img_file.exists() and archive_ann_file.exists():
+            unarchived_img_path = CocoDetection._unarchive_data(
+                data_path, archive_img_file
+            )
+            unarchived_ann_path = CocoDetection._unarchive_data(
+                data_path, archive_ann_file
+            )
+            return (unarchived_img_path, unarchived_ann_path)
+        elif CocoDetection._is_dataset_files_present(data_path):
+            # This is for dataset generated by unity simulation.
+            return data_path
+        else:
+            raise DatasetNotFoundError(
+                f"Expecting a file {archive_img_file} and {archive_ann_file} "
+                f"under {data_path}"
+            )
+
+    @staticmethod
+    def _unarchive_data(data_path, archive_file):
+        """unarchive downloaded data.
+
+        An exclusive flock is taken on the archive file while the dataset
+        files are checked and, if they are missing, unpacked.
+
+        Args:
+            data_path (str): Path where dataset is stored.
+            archive_file (str): archived file name.
+
+        Returns:
+            pathlib.Path: unarchived path.
+        """
+        file_descriptor = os.open(archive_file, os.O_RDONLY)
+        try:
+            fcntl.flock(file_descriptor, fcntl.LOCK_EX)
+            unarchived_path = Path(data_path)
+            if not CocoDetection._is_dataset_files_present(unarchived_path):
+                shutil.unpack_archive(
+                    filename=archive_file, extract_dir=unarchived_path,
+                )
+                logger.info(f"Unpack {archive_file} to {unarchived_path}")
+        finally:
+            os.close(file_descriptor)
+        return unarchived_path
+
+    @staticmethod
+    def _is_dataset_files_present(data_path):
+        """check whether dataset files exist.
+
+        Args:
+            data_path (str): Path where dataset is stored.
+
+        Returns:
+            bool: whether dataset files exist.
+        """
+        return (
+            os.path.isdir(data_path)
+            and any(glob.glob(f"{data_path}/*.json"))
+            and any(glob.glob(f"{data_path}/*.jpg"))
+        )
+
     def download(self, cloud_path=COCO_GCS_PATH):
         path = Path(self.root)
         path.mkdir(parents=True, exist_ok=True)
diff --git a/tests/datasets/test_coco.py b/tests/datasets/test_coco.py
new file mode 100644
index 00000000..dd6b86c0
--- /dev/null
+++ b/tests/datasets/test_coco.py
@@ -0,0 +1,46 @@
+import os
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+from pytest import raises
+
+from datasetinsights.datasets.coco import CocoDetection
+from datasetinsights.datasets.exceptions import DatasetNotFoundError
+
+
+def test__is_dataset_files_present():
+    with tempfile.TemporaryDirectory() as tmp:
+        with open(os.path.join(tmp, "coco.json"), "x"):
+            with open(os.path.join(tmp, "coco.jpg"), "x"):
+                assert CocoDetection._is_dataset_files_present(tmp)
+
+    with tempfile.TemporaryDirectory() as tmp:
+        assert not CocoDetection._is_dataset_files_present(tmp)
+
+
+@patch("datasetinsights.datasets.CocoDetection._unarchive_data")
+def test__preprocess_dataset(mock_unarchive):
+    tmp_dir = tempfile.TemporaryDirectory()
+    tmp_name = tmp_dir.name
+    split = "train"
+
+    # test no dataset found
+    with raises(DatasetNotFoundError):
+        CocoDetection._preprocess_dataset(tmp_name, split)
+
+    # test dataset already exists
+    with open(os.path.join(tmp_name, "coco.json"), "x"):
+        with open(os.path.join(tmp_name, "coco.jpg"), "x"):
+            return_value = CocoDetection._preprocess_dataset(tmp_name, split)
+            assert return_value == tmp_name
+
+    # test whether it can unarchive data
+    archive_img_file = Path(tmp_name) / f"{split}2017.zip"
+    archive_ann_file = Path(tmp_name) / "annotations_trainval2017.zip"
+    with open(archive_img_file, "x"):
+        with open(archive_ann_file, "x"):
+            CocoDetection._preprocess_dataset(tmp_name, split)
+            assert mock_unarchive.call_count == 2
+
+    tmp_dir.cleanup()