From 854044de86037489dbc3435ee0aaafeaebef733b Mon Sep 17 00:00:00 2001 From: Ravindu Madawalagama <80534358+Ravindu987@users.noreply.github.com> Date: Fri, 12 May 2023 18:17:09 +0530 Subject: [PATCH] Add kolektor dataset (#983) * Add dataset create function * Add KolektorDataset class * Add Kolektor class * Add comments for dataset creation * Add licensing * Fix error message * Add train split ratio as a parameter * Add dataset link * Add docstrings * Fix warn in imageio * Add Prepare dataset method with custom download function * Code clean * Update function to check mask * Update download util with filename in DownloadInfo * Use updated download_and_extract in Kolektor * Use opencv for reading images * Updated the changelog --------- Co-authored-by: Samet Akcay --- src/anomalib/data/kolektor.py | 285 ++++++++++++++++++++++++++++ src/anomalib/data/utils/download.py | 7 +- 2 files changed, 291 insertions(+), 1 deletion(-) create mode 100644 src/anomalib/data/kolektor.py diff --git a/src/anomalib/data/kolektor.py b/src/anomalib/data/kolektor.py new file mode 100644 index 0000000000..92350c76f5 --- /dev/null +++ b/src/anomalib/data/kolektor.py @@ -0,0 +1,285 @@ +"""Kolektor Surface-Defect Dataset (CC BY-NC-SA 4.0). + +Description: + This script contains PyTorch Dataset, Dataloader and PyTorch + Lightning DataModule for the Kolektor Surface-Defect dataset. + The dataset can be found at https://www.vicos.si/resources/kolektorsdd/ + +License: + Kolektor Surface-Defect dataset is released under the Creative Commons + Attribution-NonCommercial-ShareAlike 4.0 International License + (CC BY-NC-SA 4.0)(https://creativecommons.org/licenses/by-nc-sa/4.0/). + +Reference: + - Tabernik, Domen, Samo Šela, Jure Skvarč, and Danijel Skočaj. + "Segmentation-based deep-learning approach for surface-defect detection." + Journal of Intelligent Manufacturing 31, no. 3 (2020): 759-776. +""" + + +from __future__ import annotations + +import logging +from pathlib import Path + +import albumentations as A +import numpy as np +from cv2 import imread +from pandas import DataFrame +from sklearn.model_selection import train_test_split + +from anomalib.data.base import AnomalibDataModule, AnomalibDataset +from anomalib.data.task_type import TaskType +from anomalib.data.utils import ( + DownloadInfo, + InputNormalizationMethod, + Split, + TestSplitMode, + ValSplitMode, + download_and_extract, + get_transforms, +) + +logger = logging.getLogger(__name__) + +DOWNLOAD_INFO = DownloadInfo( + name="kolektor", + url="https://go.vicos.si/kolektorsdd", + hash="2b094030343c1cd59df02203ac6c57a0", + filename="KolektorSDD.zip", +) + + +# Check if a mask shows defects +def is_mask_anomalous(path): + img_arr = imread(path) + if np.all(img_arr == 0): + return 0 + return 1 + + +def make_kolektor_dataset( + root: str | Path, + train_split_ratio: float = 0.8, + split: str | Split | None = None, +) -> DataFrame: + """Create Kolektor samples by parsing the Koelktor data file structure. + + The files are expected to follow the structure: + image files: + path/to/dataset/item/image_filename.jpg + path/to/dataset/kos01/Part0.jpg + mask files: + path/to/dataset/item/mask_filename.bmp + path/to/dataset/kos01/Part0_label.bmp + + This function creates a dataframe to store the parsed information based on the following format: + |---|--------------------|--------|-------|---------|---------------------|--------------------|-------------| + | | path | item | split | label | image_path | mask_path | label_index | + |---|--------------------|--------|-------|---------|---------------------|--------------------|-------------| + | 0 | KolektorSDD | kos01 | test | Bad | /path/to/image_file | /path/to/mask_file | 1 | + |---|--------------------|--------|-------|---------|---------------------|--------------------|-------------| + + Args: + root (Path): Path to dataset + train_split_ratio (float, optional): Ratio to split good images into train/test + Defaults to 0.8 for train. + split (str | Split | None, optional): Dataset split (Either train or test). Defaults to None. + + Examples: + The following example shows how to get training samples from Kolektor Dataset: + + >>> root = Path('./KolektorSDD/') + + >>> samples = make_kolektor_dataset(root, train_split_ratio=0.8) + >>> print(samples.head()) + path item split label image_path mask_path label_index + 0 KolektorSDD kos01 train Good KolektorSDD/kos01/Part0.jpg KolektorSDD/kos01/Part0_label.bmp 0 + 1 KolektorSDD kos01 train Good KolektorSDD/kos01/Part1.jpg KolektorSDD/kos01/Part1_label.bmp 0 + 2 KolektorSDD kos01 train Good KolektorSDD/kos01/Part2.jpg KolektorSDD/kos01/Part2_label.bmp 0 + 3 KolektorSDD kos01 test Good KolektorSDD/kos01/Part3.jpg KolektorSDD/kos01/Part3_label.bmp 0 + 4 KolektorSDD kos01 train Good KolektorSDD/kos01/Part4.jpg KolektorSDD/kos01/Part4_label.bmp 0 + + Returns: + DataFrame: an output dataframe containing the samples of the dataset. + """ + + root = Path(root) + + # Get list of images and masks + samples_list = [(str(root),) + f.parts[-2:] for f in root.glob(r"**/*") if f.suffix == ".jpg"] + masks_list = [(str(root),) + f.parts[-2:] for f in root.glob(r"**/*") if f.suffix == ".bmp"] + + if not samples_list: + raise RuntimeError(f"Found 0 images in {root}") + + # Create dataframes + samples = DataFrame(samples_list, columns=["path", "item", "image_path"]) + masks = DataFrame(masks_list, columns=["path", "item", "image_path"]) + + # Modify image_path column by converting to absolute path + samples["image_path"] = samples.path + "/" + samples.item + "/" + samples.image_path + masks["image_path"] = masks.path + "/" + masks.item + "/" + masks.image_path + + # Sort samples by image path + samples = samples.sort_values(by="image_path", ignore_index=True) + masks = masks.sort_values(by="image_path", ignore_index=True) + + # Add mask paths for sample images + samples["mask_path"] = masks.image_path.values + + # Use is_good func to configure the label_index + samples["label_index"] = samples["mask_path"].apply(is_mask_anomalous) + samples.label_index = samples.label_index.astype(int) + + # Use label indexes to label data + samples.loc[(samples.label_index == 0), "label"] = "Good" + samples.loc[(samples.label_index == 1), "label"] = "Bad" + + # Add all 'Bad' samples to test set + samples.loc[(samples.label == "Bad"), "split"] = "test" + + # Divide 'good' images to train/test on 0.8/0.2 ratio + train_samples, test_samples = train_test_split( + samples[samples.label == "Good"], train_size=train_split_ratio, random_state=42 + ) + samples.loc[train_samples.index, "split"] = "train" + samples.loc[test_samples.index, "split"] = "test" + + # Reorder columns + samples = samples[["path", "item", "split", "label", "image_path", "mask_path", "label_index"]] + + # assert that the right mask files are associated with the right test images + assert ( + samples.loc[samples.label_index == 1] + .apply(lambda x: Path(x.image_path).stem in Path(x.mask_path).stem, axis=1) + .all() + ), "Mismatch between anomalous images and ground truth masks. Make sure the mask files \ + follow the same naming convention as the anomalous images in the dataset (e.g. image: 'Part0.jpg', \ + mask: 'Part0_label.bmp')." + + # Get the dataframe for the required split + if split: + samples = samples[samples.split == split].reset_index(drop=True) + + return samples + + +class KolektorDataset(AnomalibDataset): + """Kolektor dataset class. + + Args: + task (TaskType): Task type, ``classification``, ``detection`` or ``segmentation`` + transform (A.Compose): Albumentations Compose object describing the transforms that are applied to the inputs. + root (Path | str): Path to the root of the dataset + split (str | Split | None): Split of the dataset, usually Split.TRAIN or Split.TEST + """ + + def __init__( + self, + task: TaskType, + transform: A.Compose, + root: Path | str, + split: str | Split | None = None, + ) -> None: + super().__init__(task=task, transform=transform) + + self.root = root + self.split = split + + def _setup(self) -> None: + self.samples = make_kolektor_dataset(self.root, train_split_ratio=0.8, split=self.split) + + +class Kolektor(AnomalibDataModule): + """Kolektor Datamodule. + + Args: + root (Path | str): Path to the root of the dataset + image_size (int | tuple[int, int] | None, optional): Size of the input image. + Defaults to None. + center_crop (int | tuple[int, int] | None, optional): When provided, the images will be center-cropped + to the provided dimensions. + normalize (bool): When True, the images will be normalized to the ImageNet statistics. + train_batch_size (int, optional): Training batch size. Defaults to 32. + eval_batch_size (int, optional): Test batch size. Defaults to 32. + num_workers (int, optional): Number of workers. Defaults to 8. + task TaskType): Task type, 'classification', 'detection' or 'segmentation' + transform_config_train (str | A.Compose | None, optional): Config for pre-processing + during training. + Defaults to None. + transform_config_val (str | A.Compose | None, optional): Config for pre-processing + during validation. + Defaults to None. + test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained. + test_split_ratio (float): Fraction of images from the train set that will be reserved for testing. + Defaults to 0.2 + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + Defaults to 0.5 + seed (int | None, optional): Seed which may be set to a fixed value for reproducibility. + """ + + def __init__( + self, + root: Path | str, + image_size: int | tuple[int, int] | None = None, + center_crop: int | tuple[int, int] | None = None, + normalization: str | InputNormalizationMethod = InputNormalizationMethod.IMAGENET, + train_batch_size: int = 32, + eval_batch_size: int = 32, + num_workers: int = 8, + task: TaskType = TaskType.SEGMENTATION, + transform_config_train: str | A.Compose | None = None, + transform_config_eval: str | A.Compose | None = None, + test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR, + test_split_ratio: float = 0.2, + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + val_split_ratio: float = 0.5, + seed: int | None = None, + ) -> None: + super().__init__( + train_batch_size=train_batch_size, + eval_batch_size=eval_batch_size, + num_workers=num_workers, + test_split_mode=test_split_mode, + test_split_ratio=test_split_ratio, + val_split_mode=val_split_mode, + val_split_ratio=val_split_ratio, + seed=seed, + ) + + self.root = Path(root) + + transform_train = get_transforms( + config=transform_config_train, + image_size=image_size, + center_crop=center_crop, + normalization=InputNormalizationMethod(normalization), + ) + transform_eval = get_transforms( + config=transform_config_eval, + image_size=image_size, + center_crop=center_crop, + normalization=InputNormalizationMethod(normalization), + ) + + self.train_data = KolektorDataset( + task=task, + transform=transform_train, + split=Split.TRAIN, + root=root, + ) + self.test_data = KolektorDataset( + task=task, + transform=transform_eval, + split=Split.TEST, + root=root, + ) + + def prepare_data(self) -> None: + """Download the dataset if not available.""" + if (self.root).is_dir(): + logger.info("Found the dataset.") + else: + download_and_extract(self.root, DOWNLOAD_INFO) diff --git a/src/anomalib/data/utils/download.py b/src/anomalib/data/utils/download.py index bbe9130e4a..bfc2bd3a07 100644 --- a/src/anomalib/data/utils/download.py +++ b/src/anomalib/data/utils/download.py @@ -29,6 +29,7 @@ class DownloadInfo: name: str url: str hash: str + filename: str | None = None class DownloadProgressBar(tqdm): @@ -227,7 +228,11 @@ def download_and_extract(root: Path, info: DownloadInfo) -> None: root.mkdir(parents=True, exist_ok=True) # save the compressed file in the specified root directory, using the same file name as on the server - downloaded_file_path = root / info.url.split("/")[-1] + if info.filename: + downloaded_file_path = root / info.filename + else: + downloaded_file_path = root / info.url.split("/")[-1] + if downloaded_file_path.exists(): logger.info("Existing dataset archive found. Skipping download stage.") else: