From ed20d62ac720ead4ed2cbd5992ed7016f86a8b3b Mon Sep 17 00:00:00 2001 From: youdaoyzbx Date: Mon, 30 Jan 2023 19:20:00 +0800 Subject: [PATCH 1/5] first commit for ymir --- mmyolo/engine/hooks/__init__.py | 3 +- .../hooks/ymir_training_monitor_hook.py | 75 +++++ requirements.txt | 4 +- requirements/ymir.txt | 7 + tools/train.py | 4 + ymir/Dockerfile | 33 ++ ymir/develop.md | 27 ++ ymir/img-man/infer-template.yaml | 1 + ymir/img-man/manifest.yaml | 1 + ymir/img-man/mining-template.yaml | 2 + ymir/img-man/training-template.yaml | 12 + ymir/readme.md | 66 ++++ ymir/start.py | 29 ++ ymir/utils/common.py | 288 ++++++++++++++++++ ymir/weights/download_weights.sh | 37 +++ ymir/ymir_infer.py | 135 ++++++++ ymir/ymir_mining.py | 125 ++++++++ ymir/ymir_training.py | 147 +++++++++ 18 files changed, 994 insertions(+), 2 deletions(-) create mode 100644 mmyolo/engine/hooks/ymir_training_monitor_hook.py create mode 100644 requirements/ymir.txt create mode 100644 ymir/Dockerfile create mode 100644 ymir/develop.md create mode 100644 ymir/img-man/infer-template.yaml create mode 100644 ymir/img-man/manifest.yaml create mode 100644 ymir/img-man/mining-template.yaml create mode 100644 ymir/img-man/training-template.yaml create mode 100644 ymir/readme.md create mode 100644 ymir/start.py create mode 100644 ymir/utils/common.py create mode 100644 ymir/weights/download_weights.sh create mode 100644 ymir/ymir_infer.py create mode 100644 ymir/ymir_mining.py create mode 100644 ymir/ymir_training.py diff --git a/mmyolo/engine/hooks/__init__.py b/mmyolo/engine/hooks/__init__.py index 0b8deebc8..43ef0cd4e 100644 --- a/mmyolo/engine/hooks/__init__.py +++ b/mmyolo/engine/hooks/__init__.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
@HOOKS.register_module()
class YmirTrainingMonitorHook(Hook):
    """Report training progress and checkpoints to YMIR.

    For epoch based training loops only.

    1. write the training progress (monitor.txt) every ``interval`` iterations.
    2. after each validation epoch, pass every ``*.pth`` file found in the
       models directory to ``write_ymir_training_result`` with id=last.
    3. after each validation epoch, pass the newest best checkpoint to
       ``write_ymir_training_result`` with id=best if it belongs to the
       current epoch.

    Note: the reported checkpoint files may be removed later.
    """
    # the priority should be lower than CheckpointHook (priority = VERY_LOW)
    priority = 'LOWEST'

    # prefix stripped from the metric names before they are reported
    metric_prefix = 'coco/bbox_'

    def __init__(self, interval: int = 10):
        """
        Args:
            interval: number of training iterations between two progress reports.
        """
        self.interval = interval
        self.ymir_cfg = get_merged_config()

    def after_train_iter(self,
                         runner,
                         batch_idx: int,
                         data_batch: Optional[Union[dict, tuple, list]] = None,
                         outputs: Optional[dict] = None) -> None:
        """Write the overall training progress every ``self.interval`` iterations."""
        if runner.rank in [0, -1] and self.every_n_inner_iters(batch_idx, self.interval):
            # finished epochs plus the finished fraction of the current epoch
            percent = (runner.epoch + batch_idx / len(runner.train_dataloader)) / runner.max_epochs
            write_ymir_monitor_process(self.ymir_cfg, task='training', naive_stage_percent=percent, stage='task')

    @staticmethod
    def _epoch_of(ckpt_path: str) -> Optional[int]:
        """Return the epoch encoded in a checkpoint file name, or None.

        Only the file name is inspected, so digits in the directory part of
        the path cannot be mistaken for the epoch.
        """
        name = osp.basename(ckpt_path)
        matched = re.search(r'epoch_(\d+)', name)
        if matched:
            return int(matched.group(1))
        numbers = re.findall(r'\d+', name)
        return int(numbers[0]) if numbers else None

    def after_val_epoch(self, runner, metrics: Optional[Dict[str, float]] = None) -> None:
        """Report the best and the latest checkpoints after a validation epoch.

        metrics: {'coco/bbox_mAP': 0.001, 'coco/bbox_mAP_50': 0.003, 'coco/bbox_mAP_75': 0.0, ...}

        evaluation_result: {'mAP': 0.001, 'mAP_50': 0.003, ...}
        """
        if runner.rank not in [0, -1]:
            return

        # strip the prefix only where it is present; other keys are kept unchanged
        prefix = self.metric_prefix
        evaluation_result = {(key[len(prefix):] if key.startswith(prefix) else key): value
                             for key, value in (metrics or {}).items()}
        out_dir = self.ymir_cfg.ymir.output.models_dir
        cfg_files = glob.glob(osp.join(out_dir, '*.py'))

        best_ckpts = glob.glob(osp.join(out_dir, 'best_coco', '*.pth'))
        if best_ckpts:
            newest_best_ckpt = max(best_ckpts, key=osp.getctime)
            # report the best checkpoint only if it was produced in this epoch
            if self._epoch_of(newest_best_ckpt) == runner.epoch:
                logging.info(f'epoch={runner.epoch}, save {newest_best_ckpt} to result.yaml')
                write_ymir_training_result(self.ymir_cfg,
                                           files=[newest_best_ckpt] + cfg_files,
                                           id='best',
                                           evaluation_result=evaluation_result)
        else:
            warnings.warn(f'no best checkpoint found on {runner.epoch}')

        last_ckpts = glob.glob(osp.join(out_dir, '*.pth'))
        if last_ckpts:
            # log the files that are actually saved (newest_best_ckpt may not exist here)
            logging.info(f'epoch={runner.epoch}, save {last_ckpts} to result.yaml')
            write_ymir_training_result(self.ymir_cfg,
                                       files=last_ckpts + cfg_files,
                                       id='last',
                                       evaluation_result=evaluation_result)
        else:
            warnings.warn(f'no latest checkpoint found on {runner.epoch}')
FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime

# To fix GPG key error when running apt-get update
RUN apt-get update && apt-get install -y gnupg2
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# (Optional) use mirrors for apt and pip
RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \
    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

RUN apt-get update \
    && apt-get install -y gcc ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev vim \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN pip install openmim \
    && mim install "mmengine>=0.3.1" \
    && mim install "mmcv>=2.0.0rc1,<2.1.0" \
    && mim install "mmdet>=3.0.0rc5,<3.1.0"

# Install MMYOLO
COPY . /app
RUN cd /app && \
    pip install --no-cache-dir -r requirements.txt && \
    mkdir /weights && \
    mv ymir/weights/*.pth /weights

ENV PYTHONPATH=.
WORKDIR /app

# The redirect must be outside the quotes: otherwise the whole text is only
# echoed to the build log and /usr/bin/start.sh is never created.
RUN echo "python3 ymir/start.py" > /usr/bin/start.sh
CMD bash /usr/bin/start.sh
`ymir_training.py` 调用 `bash tools/dist_train.sh ...` + +### 核心功能实现 + +- 数据格式转换 + +在 `ymir_training.py` 中首次调用 `convert_ymir_to_coco()` 进行数据格式转换,后续调用 `convert_ymir_to_coco()` 仅获得数据集信息。 + +- 加载预训练权重 + +- 加载超参数 + +- 写进度 + +- 写结果文件 diff --git a/ymir/img-man/infer-template.yaml b/ymir/img-man/infer-template.yaml new file mode 100644 index 000000000..b778c2931 --- /dev/null +++ b/ymir/img-man/infer-template.yaml @@ -0,0 +1 @@ +conf_threshold: 0.2 diff --git a/ymir/img-man/manifest.yaml b/ymir/img-man/manifest.yaml new file mode 100644 index 000000000..ec97611b4 --- /dev/null +++ b/ymir/img-man/manifest.yaml @@ -0,0 +1 @@ +object_type: 2 # object detection diff --git a/ymir/img-man/mining-template.yaml b/ymir/img-man/mining-template.yaml new file mode 100644 index 000000000..eff2135cc --- /dev/null +++ b/ymir/img-man/mining-template.yaml @@ -0,0 +1,2 @@ +mining_algorithm: 'entropy' +conf_threshold: 0.1 diff --git a/ymir/img-man/training-template.yaml b/ymir/img-man/training-template.yaml new file mode 100644 index 000000000..54c035a79 --- /dev/null +++ b/ymir/img-man/training-template.yaml @@ -0,0 +1,12 @@ +export_format: 'ark:raw' +samples_per_gpu: 16 # batch size per gpu +workers_per_gpu: 4 +max_epochs: 100 +# config_file: 'configs/yolox/yolox_tiny_8x8_300e_coco.py' +model_name: yolov5_n +args_options: '' +cfg_options: '' +metric: 'bbox' +val_interval: 1 # evaluate every val_interval epochs, <=0 means evaluate every epoch +max_keep_checkpoints: 1 # <0 means save all weight files, 1 means save last and best weight files, k means save topk best weight files and topk epoch/step weight files +ymir_saved_file_patterns: '' # custom saved files, support python regular expression, use , to split multiple patterns diff --git a/ymir/readme.md b/ymir/readme.md new file mode 100644 index 000000000..c158a3d30 --- /dev/null +++ b/ymir/readme.md @@ -0,0 +1,66 @@ +# mmyolo 镜像说明文档 + +## 仓库地址 + +> 参考[open-mmlab/mmyolo](https://github.com/open-mmlab/mmyolo) +- [modelai/ymir-mmyolo](https://github.com/modelai/ymir-mmyolo)
+## 镜像地址 + +``` +youdaoyzbx/ymir-executor:ymir2.1.0-mmyolo-cu113-tmi +``` + +## 性能表现 + +> 结果来自mmyolo官方的COCO评测, 表格外的其他结构同样支持,但镜像中没有提供相应的预训练模型 + +### yolov5-v8, yolox, ppyoloe+ + +| Backbone | Arch | size | SyncBN | AMP | Mem (GB) | box AP | Config | Download | +| :------: | :--: | :--: | :----: | :-: | :------: | :----: | :-------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv8-n | P5 | 640 | Yes | Yes | 2.8 | 37.2 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov8/yolov8_n_syncbn_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804-88c11cdb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804.log.json) | +| YOLOv8-s | P5 | 640 | Yes | Yes | 4.0 | 44.2 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov8/yolov8_s_syncbn_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101.log.json) | +| YOLOv8-m | P5 | 640 | Yes | Yes | 7.2 | 49.8 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov8/yolov8_m_syncbn_8xb16-500e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200-c22e560a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200.log.json) | +| YOLOv7-tiny | P5 | 640 | Yes | Yes | 2.7 | 37.5 | [config](https://github.com/open-mmlab/mmyolo/tree/master/configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719.log.json) | +| YOLOv7-l | P5 | 640 | Yes | Yes | 10.3 | 50.9 | [config](https://github.com/open-mmlab/mmyolo/tree/master/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601.log.json) | +| YOLOv7-x | P5 | 640 | Yes | Yes | 13.7 | 52.8 | [config](https://github.com/open-mmlab/mmyolo/tree/master/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331.log.json) | +| YOLOv6-n | P5 | 640 | Yes | Yes | 6.04 | 36.2 | [config](../yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726-d99b2e82.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726.log.json) | +| YOLOv6-t | P5 | 640 | Yes | Yes | 8.13 | 41.0 | [config](../yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755-cf0d278f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755.log.json) | +| YOLOv6-s | P5 | 640 | Yes | Yes | 8.88 | 44.0 | [config](../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | +| YOLOv6-m | P5 | 640 | Yes | Yes | 16.69 | 48.4 | [config](../yolov6/yolov6_m_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658.log.json) | +| YOLOv5-n | P5 | 640 | Yes | Yes | 1.5 | 28.0 | [config](https://github.com/open-mmlab/mmyolo/tree/master/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json) | +| YOLOv5-s | P5 | 640 | Yes | Yes | 2.7 | 37.7 | [config](https://github.com/open-mmlab/mmyolo/tree/master/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | +| YOLOv5-m | P5 | 640 | Yes | Yes | 5.0 | 45.3 | [config](https://github.com/open-mmlab/mmyolo/tree/master/configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944.log.json) | +| YOLOX-tiny | - | 416 | - | - | 2.8 | 32.7 | [config](https://github.com/open-mmlab/mmyolo/tree/master/configs/yolox/yolox_tiny_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908-0e40a6fc.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908.log.json) | +| YOLOX-s | - | 640 | - | - | 5.6 | 40.8 | [config](https://github.com/open-mmlab/mmyolo/tree/master/configs/yolox/yolox_s_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_8xb8-300e_coco/yolox_s_8xb8-300e_coco_20220917_030738-d7e60cb2.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_8xb8-300e_coco/yolox_s_8xb8-300e_coco_20220917_030738.log.json) | +| PPYOLOE_plus_s | P5 | 640 | Yes | - | 4.7 | 43.5 | [config](../ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052.log.json) | +| PPYOLOE_plus_m | P5 | 640 | Yes | - | 8.4 | 49.5 | [config](../ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132.log.json) | + + +### RTMDet + +| Model | size | box AP | Params(M) | FLOPS(G) | TRT-FP16-Latency(ms) | Config | Download | +| :---------: | :--: | :----: | :-------: | :------: | :------------------: | :-------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RTMDet-tiny | 640 | 41.0 | 4.8 | 8.1 | 0.98 | [config](./rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117.log.json) | +| RTMDet-s | 640 
| 44.6 | 8.89 | 14.8 | 1.22 | [config](./rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329.log.json) | +| RTMDet-m | 640 | 49.3 | 24.71 | 39.27 | 1.62 | [config](./rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952.log.json) | +| RTMDet-l | 640 | 51.4 | 52.3 | 80.23 | 2.44 | [config](./rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928.log.json) | + +## 训练参数 + +| 超参数 | 默认值 | 类型 | 说明 | 建议 | +| - | - | - | - | - | +| hyper-parameter | default value | type | note | advice | +| config_file | +| shm_size | 128G | 字符串| 受ymir后台处理,docker image 可用共享内存 | 建议大小:镜像占用GPU数 * 32G | +| export_format | ark:raw | 字符串| 受ymir后台处理,ymir数据集导出格式 | - | +| model_name | yolov8_n | 字符串 | 模型简写, 如yolov7_tiny, yolov5_m, yolov6_t, rtmdet_m, ppyoloe_plus_s | 支持yolov5-v8, yolox, rtmdet, ppyoloe_plus | +| samples_per_gpu | 16 | 整数 | 每张GPU一次处理的图片数量 | 建议大小:显存占用<50% 可增加2倍加快训练速度 | +| workers_per_gpu | 4 | 整数 | 每张GPU对应的数据读取进程数 | - | +| max_epochs | 100 | 整数 | 整个数据集的训练遍历次数 | 建议:必要时分析tensorboard确定是否有必要改变,一般采用默认值即可 | +| args_options | '' | 字符串 | 训练命令行参数 | 参考 
def main() -> int:
    """Start the YMIR executor with the commands for training, mining and infer.

    Training is started with plain ``python3``; mining and infer are started
    through ``torchrun`` with one process per entry of the ``gpu_id``
    hyper-parameter.

    Returns:
        0, used as the process exit code.
    """
    ymir_cfg = get_merged_config()
    # convert to str before splitting, as ymir/utils/common.py does for the
    # same key; presumably a single id such as 0 can arrive as an int
    gpu_id: str = str(ymir_cfg.param.get('gpu_id', '0'))
    gpu_count: int = len(gpu_id.split(','))
    port: int = find_free_port()

    logger = logging.getLogger()
    log_level = ymir_cfg.param.get('log_level', 'info')
    if log_level == 'debug':
        logger.setLevel(logging.DEBUG)

    torchrun_cmd = f'torchrun --standalone --nnodes 1 --nproc_per_node {gpu_count} --master_port {port}'
    apps = dict(training='python3 ymir/ymir_training.py',
                mining=f'{torchrun_cmd} ymir/ymir_mining.py',
                infer=f'{torchrun_cmd} ymir/ymir_infer.py')
    executor = Executor(apps=apps)
    executor.start()
    return 0
def modify_mmengine_config(mmengine_cfg: Config, ymir_cfg: edict) -> None:
    """Adapt an mmengine training config in place to the YMIR task settings.

    The following parts of ``mmengine_cfg`` are overwritten from ``ymir_cfg``:

    - dataloader batch size / workers and the dataset annotation files
    - model output channels (``num_classes``)
    - epochs, checkpoint, tensorboard, evaluation and logging settings
    - the ``YmirTrainingMonitorHook`` custom hook and the work dir

    Args:
        mmengine_cfg: the training config, modified in place.
        ymir_cfg: the merged YMIR config; ``ymir_cfg.param`` holds the
            hyper-parameters, ``ymir_cfg.ymir`` the input/output paths.
    """

    def recursive_modify_attribute(mmengine_cfgdict: Union[Config, ConfigDict], attribute_key: str,
                                   attribute_value: Any):
        """
        recursively set every ``attribute_key`` in the config to ``attribute_value``:
        1. cfg.attribute_key
        2. cfg.xxx.xxx.xxx.attribute_key (nested dicts)
        3. cfg.xxx[i].attribute_key (i=0, 1, 2 ...)
        4. cfg.xxx[i].xxx.xxx[j].attribute_key
        """
        for key in mmengine_cfgdict:
            if key == attribute_key:
                mmengine_cfgdict[key] = attribute_value
                logging.info(f'modify {mmengine_cfgdict}, {key} = {attribute_value}')
            elif isinstance(mmengine_cfgdict[key], (Config, ConfigDict)):
                recursive_modify_attribute(mmengine_cfgdict[key], attribute_key, attribute_value)
            elif isinstance(mmengine_cfgdict[key], Iterable):
                for cfg in mmengine_cfgdict[key]:
                    if isinstance(cfg, (Config, ConfigDict)):
                        recursive_modify_attribute(cfg, attribute_key, attribute_value)

    # dataset info, data_info[split]['ann_file'] is used below
    data_info = convert_ymir_to_coco()

    # NOTE(review): only the training dataloader is changed here, the
    # validation batch size keeps the value of the source config
    samples_per_gpu = ymir_cfg.param.samples_per_gpu
    workers_per_gpu = ymir_cfg.param.workers_per_gpu
    mmengine_cfg.train_batch_size_per_gpu = samples_per_gpu
    mmengine_cfg.train_num_workers = workers_per_gpu
    mmengine_cfg.train_dataloader.batch_size = samples_per_gpu
    mmengine_cfg.train_dataloader.num_workers = workers_per_gpu
    mmengine_cfg.optim_wrapper.optimizer.batch_size_per_gpu = samples_per_gpu

    # modify model output channel
    num_classes = len(ymir_cfg.param.class_names)
    mmengine_cfg.num_classes = num_classes
    recursive_modify_attribute(mmengine_cfg.model, 'num_classes', num_classes)

    for split in ['train', 'val']:
        ymir_dataset_cfg = dict(
            type='YOLOv5CocoDataset',
            ann_file=data_info[split]['ann_file'],
            # metainfo=dict(classes=ymir_cfg.param.class_names),
            data_root=ymir_cfg.ymir.input.root_dir)

        # modify dataset config for `split`
        mmdet_dataset_cfg = mmengine_cfg[f'{split}_dataloader']['dataset']
        if mmdet_dataset_cfg is None:
            continue

        if isinstance(mmdet_dataset_cfg, (list, tuple)):
            for x in mmdet_dataset_cfg:
                x.update(ymir_dataset_cfg)
        else:
            src_dataset_type = mmdet_dataset_cfg.type
            if src_dataset_type in ['CocoDataset', 'YOLOv5CocoDataset']:
                mmdet_dataset_cfg.update(ymir_dataset_cfg)
            else:
                # wrapper datasets (MultiImageMixDataset, RepeatDataset, ...) are not supported
                raise Exception(f'unsupported source dataset type {src_dataset_type}')

    # modify max_epochs
    if ymir_cfg.param.get('max_epochs', None):
        max_epochs = int(ymir_cfg.param.max_epochs)
        mmengine_cfg.train_cfg.max_epochs = max_epochs
        mmengine_cfg.max_epochs = max_epochs
        mmengine_cfg.default_hooks.param_scheduler.max_epochs = max_epochs

    # modify checkpoint
    mmengine_cfg.default_hooks.checkpoint['out_dir'] = ymir_cfg.ymir.output.models_dir

    # modify tensorboard
    tensorboard_logger = dict(type='TensorboardVisBackend', save_dir=ymir_cfg.ymir.output.tensorboard_dir)
    if len(mmengine_cfg.visualizer.vis_backends) <= 1:
        mmengine_cfg.visualizer.vis_backends.append(tensorboard_logger)
    else:
        mmengine_cfg.visualizer.vis_backends[1].update(tensorboard_logger)

    # TODO save only the best top-k model weight files.
    # modify evaluation and interval, a non-positive val_interval falls back to 1
    val_interval: int = int(ymir_cfg.param.get('val_interval', 1))
    if val_interval > 0:
        val_interval = min(val_interval, mmengine_cfg.train_cfg.max_epochs)
    else:
        val_interval = 1

    mmengine_cfg.save_epoch_intervals = val_interval
    mmengine_cfg.train_cfg.val_interval = val_interval
    mmengine_cfg.val_evaluator.ann_file = data_info['val']['ann_file']
    mmengine_cfg.val_evaluator.metric = ymir_cfg.param.get('metric', 'bbox')

    # save best top-k model weights files
    # max_keep_ckpts <= 0 # save all checkpoints
    max_keep_ckpts: int = int(ymir_cfg.param.get('max_keep_checkpoints', 1))
    mmengine_cfg.default_hooks.checkpoint.interval = val_interval
    mmengine_cfg.default_hooks.checkpoint.max_keep_ckpts = max_keep_ckpts

    # TODO Whether to evaluating the AP for each class
    # mmdet_cfg.evaluation.classwise = True

    # fix DDP error or make training faster?
    mmengine_cfg.find_unused_parameters = get_bool(ymir_cfg, 'find_unused_parameters', False)

    # learning rate
    if ymir_cfg.param.get('learning_rate', None):
        mmengine_cfg.base_lr = float(ymir_cfg.param.learning_rate)
        mmengine_cfg.optim_wrapper.optimizer.lr = float(ymir_cfg.param.learning_rate)

    # set training log interval (iter): at most 50, at most one epoch, at least 1
    with open(ymir_cfg.ymir.input.training_index_file, 'r') as fp:
        train_dataset_size = len(fp.readlines())
    gpu_id: str = str(ymir_cfg.param.get("gpu_id", ''))
    num_gpus = len(gpu_id.split(","))
    # max(1, ...) guards against a zero batch size
    max_interval = train_dataset_size // max(1, samples_per_gpu * num_gpus)

    log_interval = max(1, min(50, max_interval))
    mmengine_cfg.default_hooks.logger.interval = log_interval

    # add YmirTrainingMonitorHook, the source config may not define custom_hooks
    ymir_hook = dict(type='YmirTrainingMonitorHook', interval=log_interval)
    if mmengine_cfg.get('custom_hooks', None) is None:
        mmengine_cfg.custom_hooks = []
    mmengine_cfg.custom_hooks.append(ymir_hook)

    # set work dir
    mmengine_cfg.work_dir = ymir_cfg.ymir.output.models_dir

    args_options = ymir_cfg.param.get("args_options", '')
    cfg_options = ymir_cfg.param.get("cfg_options", '')

    # look for an offered weight file if the user did not set one
    user_sets_weight = ('--resume-from' in args_options or '--load-from' in args_options
                        or 'load_from' in cfg_options or 'resume_from' in cfg_options)
    if not user_sets_weight:
        # NOTE(review): the weight file is only resolved and checked here, this
        # function does not write it into mmengine_cfg - presumably the caller
        # passes load_from on the command line, TODO confirm
        weight_file = get_best_weight_file(ymir_cfg)
        if not weight_file:
            logging.warning('no weight file used for training!')


def get_best_weight_file(cfg: edict) -> str:
    """
    return the weight file path by priority, "" if no weight file is found

    1. the newest best_xxx weight file in cfg.param.pretrained_model_params
       (training) or cfg.param.model_params_path (mining and infer)
    2. the newest epoch_xxx / iter_xxx weight file in the same list
    3. for training only: the pretrained weight file under /weights whose
       path is the closest match to the config file of cfg.param.model_name

    Raises:
        KeyError: if cfg.param.model_name matches no config file (training only).
    """
    if cfg.ymir.run_training:
        model_params_path: List[str] = cfg.param.get('pretrained_model_params', [])
    else:
        model_params_path = cfg.param.get('model_params_path', [])

    model_dir = cfg.ymir.input.models_dir
    model_params_path = [
        osp.join(model_dir, p) for p in model_params_path
        if osp.exists(osp.join(model_dir, p)) and p.endswith(('.pth', '.pt'))
    ]

    # choose weight file by priority, best_xxx.pth > epoch_xxx.pth / iter_xxx.pth
    for prefixes in ('best_', ('epoch_', 'iter_')):
        candidates = [f for f in model_params_path if osp.basename(f).startswith(prefixes)]
        if candidates:
            return max(candidates, key=os.path.getctime)

    if cfg.ymir.run_training:
        weight_files = [f for f in glob.glob('/weights/**/*', recursive=True) if f.endswith(('.pth', '.pt'))]

        # load pretrained model weight for target model
        model_name = cfg.param.get("model_name") or ''
        config_files_map = get_id_for_config_files()
        config_id = model_name.lower().replace('-', '_')
        if config_id not in config_files_map:
            raise KeyError(f'unknown model_name {model_name}, no config file found for id {config_id}')
        config_file = config_files_map[config_id]
        config_file_splits = osp.basename(config_file).split('_')

        base_model = config_file_splits[0]
        assert base_model in ['yolov5', 'yolov6', 'yolov7', 'yolov8', 'yolox', 'rtmdet', 'ppyoloe']
        if len(weight_files) > 0:
            matched_weight_files = difflib.get_close_matches(config_file, weight_files)
            if len(matched_weight_files) > 0:
                logging.info(f'load {base_model} pretrained weight {matched_weight_files[0]}')
                return matched_weight_files[0]
    return ""


def get_topk_checkpoints(files: List[str], k: int) -> List[str]:
    """
    select the topk checkpoint files from `files`, no file is removed here.

    1. the k newest best checkpoints (path contains 'best_'), for ensemble
    2. the k newest epoch/iter checkpoints (file name starts with 'epoch_'
       or 'iter_'), for quantization

    k < 0 returns all best and epoch/iter checkpoints, newest first.
    """
    checkpoints_files = [f for f in files if f.endswith(('.pth', '.pt'))]

    # newest first
    best_pth_files = sorted((f for f in checkpoints_files if f.find('best_') > -1),
                            key=os.path.getctime,
                            reverse=True)
    epoch_pth_files = sorted((f for f in checkpoints_files if osp.basename(f).startswith(('epoch_', 'iter_'))),
                             key=os.path.getctime,
                             reverse=True)

    if k < 0:
        return best_pth_files + epoch_pth_files
    # slicing is safe for lists shorter than k
    return best_pth_files[0:k] + epoch_pth_files[0:k]


def get_id_for_config_files() -> dict:
    """
    use id instead of config_file:
        yolov5_l_p6_v62: configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py

    every '_'-joined prefix of the file name (at least two parts) is an id,
    and every config file path is also an id for itself.

    note: '-' will be replaced with '_'. If several config files share an
    id, the last one in sorted path order is used.
    """
    # glob order is arbitrary, sort to make shared ids resolve reproducibly
    py_files = sorted(glob.glob(osp.join('configs', '*', '*_coco.py')))

    # use the directory name instead of splitting on '/', which is OS specific
    config_files = [f for f in py_files if osp.basename(osp.dirname(f)) not in ['_base_', 'deploy']]

    id_dict: Dict[str, str] = {}
    for f in config_files:
        f_name = osp.basename(f).replace('-', '_')
        splits = f_name.split('_')

        # yolov5_n, yolov5_s, yolov5_m, ...
        for x in range(2, len(splits)):
            idx = '_'.join(splits[0:x])
            id_dict[idx] = f

        id_dict[f] = f

    return id_dict


def get_config_file(cfg):
    """
    return the first existing '.py' config file listed in
    cfg.param.pretrained_model_params (training) or cfg.param.model_params_path
    (mining and infer), relative to cfg.ymir.input.models_dir.

    Raises:
        Exception: if no config file is found.
    """
    if cfg.ymir.run_training:
        model_params_path: List = cfg.param.get('pretrained_model_params', [])  # type: ignore
    else:
        model_params_path = cfg.param.get('model_params_path', [])  # type: ignore

    model_dir = cfg.ymir.input.models_dir
    config_files = [
        osp.join(model_dir, p) for p in model_params_path if osp.exists(osp.join(model_dir, p)) and p.endswith('.py')
    ]

    if not config_files:
        raise Exception(f'no config_file found in {model_dir} and {model_params_path}')

    if len(config_files) > 1:
        warnings.warn(f'multiple config file found! use {config_files[0]}')
    return config_files[0]
https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth +wget https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth + +# yolov7 tiny, l, x +wget https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth +wget https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth +wget https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth + +# yolov8 n, s, m +wget https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804-88c11cdb.pth +wget https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth +wget https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200-c22e560a.pth + +# yolox tiny, s +wget https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908-0e40a6fc.pth +wget https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_8xb8-300e_coco/yolox_s_8xb8-300e_coco_20220917_030738-d7e60cb2.pth + +# rtmdet tiny, s, m +wget https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth +wget https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth +wget 
https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth + +# ppyoloe+ s, m, l +wget https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth +wget https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth +wget https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825-1864e7b3.pth + diff --git a/ymir/ymir_infer.py b/ymir/ymir_infer.py new file mode 100644 index 000000000..e05735355 --- /dev/null +++ b/ymir/ymir_infer.py @@ -0,0 +1,135 @@ +import argparse +import os +import os.path as osp +import sys +import warnings +from typing import Any, List + +import cv2 +import numpy as np +import torch.distributed as dist +from easydict import EasyDict as edict +from mmdet.apis import inference_detector, init_detector +from mmdet.structures.det_data_sample import DetDataSample +from mmengine.config import DictAction +from mmengine.dist import collect_results_gpu, init_dist +from tqdm import tqdm +from ymir_exc import result_writer as rw +from ymir_exc.util import (YmirStage, get_merged_config, write_ymir_monitor_process) + +from mmyolo.utils import register_all_modules +from ymir.utils.common import get_best_weight_file, get_config_file + +LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html +RANK = int(os.getenv('RANK', -1)) +WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) + + +def parse_option(cfg_options: str) -> dict: + parser = argparse.ArgumentParser(description='parse cfg options') + parser.add_argument('--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + + args = parser.parse_args(f'--cfg-options {cfg_options}'.split()) + return args.cfg_options + + +def mmdet_result_to_ymir(results: DetDataSample, class_names: List[str]) -> List[rw.Annotation]: + """ + results: DetDataSample + """ + ann_list = [] + scores = results.pred_instances.scores + bboxes = results.pred_instances.bboxes + labels = results.pred_instances.labels + for idx, result in enumerate(zip(bboxes, scores, labels)): + bbox, score, label = result + x1, y1, x2, y2 = [x.item() for x in bbox] + score = score.item() + label = label.item() + ann = rw.Annotation(class_name=class_names[label], + score=score, + box=rw.Box(x=round(x1), y=round(y1), w=round(x2 - x1), h=round(y2 - y1))) + ann_list.append(ann) + return ann_list + + +class YmirModel: + + def __init__(self, cfg: edict): + self.cfg = cfg + + # Specify the path to model config and checkpoint file + config_file = get_config_file(cfg) + checkpoint_file = get_best_weight_file(cfg) + + gpu_id = max(0, RANK) + # build the model from a config file and a checkpoint file + self.model = init_detector(config_file, checkpoint_file, device=f'cuda:{gpu_id}') + + def infer(self, img): + return inference_detector(self.model, img) + + +def main(): + register_all_modules() + + if LOCAL_RANK != -1: + init_dist(launcher='pytorch', backend="nccl" if dist.is_nccl_available() else "gloo") + + cfg = get_merged_config() + + with open(cfg.ymir.input.candidate_index_file, 'r') as f: + images = [line.strip() for line in f.readlines()] + + max_barrier_times = len(images) // WORLD_SIZE + if RANK == -1: + N = len(images) + tbar = tqdm(images) + else: + images_rank = images[RANK::WORLD_SIZE] + N = len(images_rank) + if RANK == 0: + tbar = tqdm(images_rank) + else: + tbar = images_rank 
+ infer_result_list = [] + model = YmirModel(cfg) + + # write infer result + monitor_gap = max(1, N // 100) + conf_threshold = float(cfg.param.conf_threshold) + for idx, asset_path in enumerate(tbar): + img = cv2.imread(asset_path) + result = model.infer(img) + raw_anns = mmdet_result_to_ymir(result, cfg.param.class_names) + + # batch-level sync, avoid 30min time-out error + if WORLD_SIZE > 1 and idx < max_barrier_times: + dist.barrier() + + infer_result_list.append((asset_path, [ann for ann in raw_anns if ann.score >= conf_threshold])) + + if idx % monitor_gap == 0 and RANK in [0, -1]: + write_ymir_monitor_process(cfg, task='infer', naive_stage_percent=idx / N, stage=YmirStage.TASK) + + if WORLD_SIZE > 1: + dist.barrier() + infer_result_list = collect_results_gpu(infer_result_list, len(images)) + + if RANK in [0, -1]: + infer_result_dict = {k: v for k, v in infer_result_list} + rw.write_infer_result(infer_result=infer_result_dict) + write_ymir_monitor_process(cfg, task='infer', naive_stage_percent=1.0, stage=YmirStage.POSTPROCESS) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ymir/ymir_mining.py b/ymir/ymir_mining.py new file mode 100644 index 000000000..e4a5d8462 --- /dev/null +++ b/ymir/ymir_mining.py @@ -0,0 +1,125 @@ +import os +import random +import sys + +import numpy as np +import torch +import torch.distributed as dist +from easydict import EasyDict as edict +from mmdet.apis import inference_detector, init_detector +from mmengine.dist import collect_results_gpu, init_dist +from tqdm import tqdm +from ymir_exc import result_writer as rw +from ymir_exc.util import (YmirStage, get_merged_config, + write_ymir_monitor_process) + +from mmyolo.utils import register_all_modules +from ymir.utils.common import get_best_weight_file, get_config_file + +LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html +RANK = int(os.getenv('RANK', -1)) +WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) + + +class 
RandomMiner(object): + + def __init__(self, cfg: edict): + if LOCAL_RANK != -1: + init_dist(launcher='pytorch', backend="nccl" if dist.is_nccl_available() else "gloo") + + self.cfg = cfg + gpu_id = max(0, LOCAL_RANK) + self.device = f'cuda:{gpu_id}' + + self.conf_threshold = float(cfg.param.conf_threshold) + config_file = get_config_file(cfg) + checkpoint_file = get_best_weight_file(cfg) + self.model = init_detector(config_file, checkpoint_file, device=f'cuda:{gpu_id}') + + def infer(self, img): + return inference_detector(self.model, img) + + def mining(self): + with open(self.cfg.ymir.input.candidate_index_file, 'r') as f: + images = [line.strip() for line in f.readlines()] + + max_barrier_times = len(images) // WORLD_SIZE + if RANK == -1: + N = len(images) + tbar = tqdm(images) + else: + images_rank = images[RANK::WORLD_SIZE] + N = len(images_rank) + if RANK == 0: + tbar = tqdm(images_rank) + else: + tbar = images_rank + + monitor_gap = max(1, N // 100) + + mining_result = [] + for idx, asset_path in enumerate(tbar): + if idx % monitor_gap == 0: + write_ymir_monitor_process(cfg=self.cfg, + task='mining', + naive_stage_percent=idx / N, + stage=YmirStage.TASK, + task_order='tmi') + + if WORLD_SIZE > 1 and idx < max_barrier_times: + dist.barrier() + + with torch.no_grad(): + consistency = self.compute_score(asset_path=asset_path) + mining_result.append((asset_path, consistency)) + + if WORLD_SIZE > 1: + mining_result = collect_results_gpu(mining_result, len(images)) + + if RANK in [0, -1]: + rw.write_mining_result(mining_result=mining_result) + write_ymir_monitor_process(cfg=self.cfg, + task='mining', + naive_stage_percent=1, + stage=YmirStage.POSTPROCESS, + task_order='tmi') + return mining_result + + def compute_score(self, asset_path: str) -> float: + return random.random() + + +class EntropyMiner(RandomMiner): + + def compute_score(self, asset_path: str) -> float: + results = self.infer(asset_path) + conf = results.pred_instances.scores.data.cpu().numpy() + 
conf = conf[conf > self.conf_threshold] + + # if not empty, mining_score > 0 + if len(conf) == 0: + return 0 + + mining_score = -np.sum(conf * np.log2(conf)) + return mining_score + + +def main(): + register_all_modules() + + cfg = get_merged_config() + mining_algorithm = cfg.param.mining_algorithm + supported_miner = ['random', 'entropy'] + + assert mining_algorithm in supported_miner, f'unknown mining_algorithm {mining_algorithm}, not in {supported_miner}' + if mining_algorithm == 'random': + miner = RandomMiner(cfg) + elif mining_algorithm == 'entropy': + miner = EntropyMiner(cfg) + + miner.mining() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ymir/ymir_training.py b/ymir/ymir_training.py new file mode 100644 index 000000000..946633224 --- /dev/null +++ b/ymir/ymir_training.py @@ -0,0 +1,147 @@ +import glob +import logging +import os +import os.path as osp +import re +import subprocess +import sys + +from easydict import EasyDict as edict +from ymir_exc.dataset_convert import convert_ymir_to_coco +from ymir_exc.util import (YmirStage, find_free_port, get_merged_config, + write_ymir_monitor_process, + write_ymir_training_result) + +from ymir.utils.common import (get_best_weight_file, get_id_for_config_files, + get_topk_checkpoints) + + +def _parse_log_line(line): + """ + 2023/01/29 09:35:58 - mmengine - INFO - Epoch(val) [10][654/654] + coco/bbox_mAP: 0.0020 coco/bbox_mAP_50: 0.0080 coco/bbox_mAP_75: 0.0000 + coco/bbox_mAP_s: 0.0000 coco/bbox_mAP_m: 0.0000 coco/bbox_mAP_l: 0.0020 + """ + epoch = int(re.findall(r'\[\d+\]', line)[0][1:-1]) + maps = [float(x) for x in re.findall(r'\d+\.\d+', line)] + N = len('coco/bbox_') + keys = [x[N:-1] for x in re.findall(r'coco/bbox_mAP\w*:', line)] + + info = {key: map for key, map in zip(keys, maps)} + return epoch, info + + +def write_mmyolo_training_result(cfg: edict) -> None: + """ + save the best checkpoint for ymir after training. 
+ keep the same id with YmirTrainingMonitorHook, make sure the saved checkpoint exist. + """ + out_dir = cfg.ymir.output.models_dir + log_files = glob.glob(osp.join(out_dir, '*', '*.log')) + + assert len(log_files) > 0 + if len(log_files) > 1: + logging.info('too many log files found!!!') + + # only one log file + with open(log_files[-1], 'r') as fp: + lines = fp.readlines() + + log_info_dict = {} + for line in lines: + if line.find('coco/bbox_mAP:') > -1: + epoch, info = _parse_log_line(line) + log_info_dict[epoch] = info + + # for the best files + cfg_files = glob.glob(osp.join(out_dir, '*.py')) + best_ckpts = glob.glob(osp.join(out_dir, 'best_coco', '*.pth')) + + topk = cfg.param.max_keep_checkpoints + # skip the newest ckpt, note YmirTrainingMonitorHook will save newest ckpt. + topk_best_ckpts = get_topk_checkpoints(best_ckpts, topk)[1:] + + for ckpt in topk_best_ckpts: + epoch = int(re.findall(r'\d+', ckpt)[0]) + write_ymir_training_result(cfg, + files=[ckpt] + cfg_files, + id=f'best_{epoch}', + evaluation_result=log_info_dict[epoch]) + + # save the last ckpt only, note YmirTrainingMonitorHook will save the last ckpt too. 
+ last_ckpt = max(glob.glob(osp.join(out_dir, 'epoch_*.pth')), key=osp.getctime) + last_epoch = int(re.findall(r'\d+', last_ckpt)[0]) + write_ymir_training_result(cfg, + files=[last_ckpt] + cfg_files, + id='last', + evaluation_result=log_info_dict[last_epoch]) + + +def main(cfg: edict) -> int: + # default ymir config + gpu_id: str = str(cfg.param.get("gpu_id", '')) + num_gpus = len(gpu_id.split(",")) + + classes = cfg.param.class_names + num_classes = len(classes) + logging.info(f'num_classes = {num_classes}') + + # convert dataset before ddp + data_info = convert_ymir_to_coco() + logging.info(f'convert dataset to {data_info}') + + # mmcv args config + model_name = cfg.param.get("model_name") + config_files_map = get_id_for_config_files() + config_id = model_name.lower().replace('-', '_') + config_file = config_files_map[config_id] + # config_file = cfg.param.get("config_file") + args_options = cfg.param.get("args_options", '') + cfg_options = cfg.param.get("cfg_options", '') + + # auto load offered weight file if not set by user! 
+ if (args_options.find('--resume-from') == -1) and ((cfg_options.find('load_from') == -1 + and cfg_options.find('resume_from') == -1)): + + weight_file = get_best_weight_file(cfg) + if weight_file: + if cfg_options: + cfg_options += f' load_from={weight_file}' + else: + cfg_options = f'load_from={weight_file}' + else: + logging.warning('no weight file used for training!') + + write_ymir_monitor_process(cfg, task='training', naive_stage_percent=0.2, stage=YmirStage.POSTPROCESS) + + work_dir = cfg.ymir.output.models_dir + if num_gpus == 0: + # view https://mmdetection.readthedocs.io/en/stable/1_exist_data_model.html#training-on-cpu + os.environ.setdefault('CUDA_VISIBLE_DEVICES', "-1") + cmd = f"python3 tools/train.py {config_file} " + \ + f"--work-dir {work_dir}" + else: + os.environ.setdefault('CUDA_VISIBLE_DEVICES', gpu_id) + port = find_free_port() + os.environ.setdefault('PORT', str(port)) + cmd = f"bash ./tools/dist_train.sh {config_file} {num_gpus} " + \ + f"--work-dir {work_dir}" + + if args_options: + cmd += f" {args_options}" + + if cfg_options: + cmd += f" --cfg-options {cfg_options}" + + logging.info(f"training command: {cmd}") + subprocess.run(cmd.split(), check=True) + + # save the last checkpoint + write_mmyolo_training_result(cfg) + return 0 + + +if __name__ == '__main__': + cfg = get_merged_config() + os.environ.setdefault('PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION', 'python') + sys.exit(main(cfg)) From 17e7b5994660537171b3b2832428cbf890d18dc8 Mon Sep 17 00:00:00 2001 From: youdaoyzbx Date: Mon, 30 Jan 2023 19:46:06 +0800 Subject: [PATCH 2/5] update doc --- ymir/develop.md | 40 ++++++++++++++++++++++++++++++++++++++++ ymir/utils/common.py | 5 ++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/ymir/develop.md b/ymir/develop.md index 237a8447e..bc5353d0b 100644 --- a/ymir/develop.md +++ b/ymir/develop.md @@ -20,8 +20,48 @@ - 加载预训练权重 + - 参考 `get_best_weight_file()` + + - 如果用户提供预训练权重, 则先其中找带 `best_` 或 `bbox_mAP_` 的权重,其次找带 `epoch_` 的权重, 
最后选择其中最新的。 + + - 如果用户没有提供预训练权重,则在镜像的 `/weights` 目录下, 通过超参数 `model_name` 获得 `config_file` 再通过相似度找到最相似的权重文件。 + - 加载超参数 + - 参考 `modify_mmengine_config()`, 将ymir超参数覆盖 `mmengine.config.Config` + - 写进度 + - 参考 `YmirTrainingMonitorHook`, 该 hook 可实时返回进度信息, 并保存最新的权重文件到ymir中,以支持提前终止训练的功能。 + - 写结果文件 + + - 参考 `YmirTrainingMonitorHook` 与 `write_mmyolo_training_result()`, 其中后者支持依据超参数 `max_keep_checkpoints` 保存多个权重文件。 + +## 推理 + +1. 启动镜像时调用 `bash /usr/bin/start.sh` + +2. `start.sh` 调用 `python3 ymir/start.py` + +3. `start.py` 调用 `python3 ymir/ymir_infer.py` + + - 调用 `init_detector()` 与 `inference_detector()` 获取推理结果 + + - 调用 `mmdet_result_to_ymir()` 将mmdet推理结果转换为ymir格式 + + - 调用 `rw.write_infer_result()` 保存推理结果 + +## 挖掘 + +1. 启动镜像时调用 `bash /usr/bin/start.sh` + +2. `start.sh` 调用 `python3 ymir/start.py` + +3. `start.py` 调用 `python3 ymir/ymir_mining.py` + + - 调用 `init_detector()` 与 `inference_detector()` 获取推理结果 + + - 调用 `compute_score()` 计算挖掘分数 + + - 调用 `rw.write_mining_result()` 保存挖掘结果 diff --git a/ymir/utils/common.py b/ymir/utils/common.py index 45187b6a3..82b56a8dd 100644 --- a/ymir/utils/common.py +++ b/ymir/utils/common.py @@ -185,7 +185,7 @@ def get_best_weight_file(cfg: edict) -> str: ] # choose weight file by priority, best_xxx.pth > latest.pth > epoch_xxx.pth - best_pth_files = [f for f in model_params_path if osp.basename(f).startswith('best_')] + best_pth_files = [f for f in model_params_path if osp.basename(f).startswith(('best_', 'bbox_mAP_'))] if len(best_pth_files) > 0: return max(best_pth_files, key=os.path.getctime) @@ -193,6 +193,9 @@ def get_best_weight_file(cfg: edict) -> str: if len(epoch_pth_files) > 0: return max(epoch_pth_files, key=os.path.getctime) + if len(model_params_path) > 0: + return max(model_params_path, key=os.path.getctime) + if cfg.ymir.run_training: weight_files = [f for f in glob.glob('/weights/**/*', recursive=True) if f.endswith(('.pth', '.pt'))] From 0926cc15be912f8f7de43b6a127ac75bc1f3ee8b Mon Sep 17 00:00:00 2001 From: youdaoyzbx Date: Tue, 31 
Jan 2023 14:45:56 +0800 Subject: [PATCH 3/5] update docker file and training template --- ymir/Dockerfile | 4 +++- ymir/img-man/training-template.yaml | 9 ++++----- ymir/readme.md | 18 +++++++++++++++++- ymir/start.py | 6 +++--- ymir/ymir_infer.py | 8 +++----- 5 files changed, 30 insertions(+), 15 deletions(-) diff --git a/ymir/Dockerfile b/ymir/Dockerfile index bcc1e55a8..868006284 100644 --- a/ymir/Dockerfile +++ b/ymir/Dockerfile @@ -23,11 +23,13 @@ RUN pip install openmim \ COPY . /app RUN cd /app && \ pip install --no-cache-dir -r requirements.txt && \ + mkdir -p /img-man && \ + mv ymir/img-man/*.yaml /img-man && \ mkdir /weights && \ mv ymir/weights/*.pth /weights ENV PYTHONPATH=. WORKDIR /app -RUN echo "python3 ymir/start.py > /usr/bin/start.sh" +RUN echo "python3 ymir/start.py" > /usr/bin/start.sh CMD bash /usr/bin/start.sh diff --git a/ymir/img-man/training-template.yaml b/ymir/img-man/training-template.yaml index 54c035a79..5d6b67b5b 100644 --- a/ymir/img-man/training-template.yaml +++ b/ymir/img-man/training-template.yaml @@ -1,12 +1,11 @@ export_format: 'ark:raw' -samples_per_gpu: 16 # batch size per gpu +samples_per_gpu: 8 # batch size per gpu workers_per_gpu: 4 max_epochs: 100 -# config_file: 'configs/yolox/yolox_tiny_8x8_300e_coco.py' -model_name: yolov5_n +model_name: yolov8_n args_options: '' cfg_options: '' -metric: 'bbox' +# metric: 'bbox' val_interval: 1 # <0 means evaluation every interval max_keep_checkpoints: 1 # <0 means save all weight file, 1 means save last and best weight files, k means save topk best weight files and topk epoch/step weigth files -ymir_saved_file_patterns: '' # custom saved files, support python regular expression, use , to split multiple pattern +# ymir_saved_file_patterns: '' # custom saved files, support python regular expression, use , to split multiple pattern diff --git a/ymir/readme.md b/ymir/readme.md index c158a3d30..bd0af5c7e 100644 --- a/ymir/readme.md +++ b/ymir/readme.md @@ -56,7 +56,7 @@
youdaoyzbx/ymir-executor:ymir2.1.0-mmyolo-cu113-tmi | shm_size | 128G | 字符串| 受ymir后台处理,docker image 可用共享内存 | 建议大小:镜像占用GPU数 * 32G | | export_format | ark:raw | 字符串| 受ymir后台处理,ymir数据集导出格式 | - | | model_name | yolov8_n | 字符串 | 模型简写, 如yolov7_tiny, yolov5_m, yolov6_t, rtmdet_m, ppyoloe_plus_s | 支持yolov5-v8, yolox, rtmdet, ppyoloe_plus | -| samples_per_gpu | 16 | 整数 | 每张GPU一次处理的图片数量 | 建议大小:显存占用<50% 可增加2倍加快训练速度 | +| samples_per_gpu | 8 | 整数 | 每张GPU一次处理的图片数量 | 建议大小:显存占用<50% 可增加2倍加快训练速度 | | workers_per_gpu | 4 | 整数 | 每张GPU对应的数据读取进程数 | - | | max_epochs | 100 | 整数 | 整个数据集的训练遍历次数 | 建议:必要时分析tensorboard确定是否有必要改变,一般采用默认值即可 | | args_options | '' | 字符串 | 训练命令行参数 | 参考 [ymir-mmyolo/tools/train.py](https://github.com/modelai/ymir-mmyolo/blob/ymir/tools/train.py) @@ -64,3 +64,19 @@ youdaoyzbx/ymir-executor:ymir2.1.0-mmyolo-cu113-tmi | metric | bbox | 字符串 | 模型评测方式 | 采用默认值即可 | | val_interval | 1 | 整数 | 模型在验证集上评测的周期, 以epoch为单位 | 设置为1,每个epoch可评测一次 | | max_keep_checkpoints | 1 | 整数 | 最多保存的权重文件数量 | 设置为k, 可保存k个最优权重和k个最新的权重文件,设置为-1可保存所有权重文件。 + + +## 推理参数 + +| 超参数 | 默认值 | 类型 | 说明 | 建议 | +| - | - | - | - | - | +| hyper-parameter | default value | type | note | advice | +| conf_threshold | 0.2 | 浮点数 | 推理结果置信度过滤阈值 | 设置为0可保存所有结果,设置为0.6可过滤大量结果 | + +## 挖掘参数 + +| 超参数 | 默认值 | 类型 | 说明 | 建议 | +| - | - | - | - | - | +| hyper-parameter | default value | type | note | advice | +| mining_algorithm | entropy | 字符串 | 挖掘算法可选 entropy 和 random | 建议采用entropy | +| conf_threshold | 0.1 | 浮点数 | 推理结果置信度过滤阈值 | 设置为0可保存所有结果,设置为0.1可过滤一些推理结果,避免挖掘算法受低置信度结果影响 | diff --git a/ymir/start.py b/ymir/start.py index e33f05b9f..0de101cf7 100644 --- a/ymir/start.py +++ b/ymir/start.py @@ -7,9 +7,9 @@ def main(): ymir_cfg = get_merged_config() - gpu_id: str = ymir_cfg.param.get('gpu_id', '0') - gpu_count: int = len(gpu_id.split(',')) - port: int = find_free_port() + gpu_id = ymir_cfg.param.get('gpu_id', '0') + gpu_count = len(gpu_id.split(',')) + port = find_free_port() logger = logging.getLogger() log_level = 
ymir_cfg.param.get('log_level', 'info') diff --git a/ymir/ymir_infer.py b/ymir/ymir_infer.py index e05735355..a367a867e 100644 --- a/ymir/ymir_infer.py +++ b/ymir/ymir_infer.py @@ -1,12 +1,9 @@ import argparse import os -import os.path as osp import sys -import warnings -from typing import Any, List +from typing import List import cv2 -import numpy as np import torch.distributed as dist from easydict import EasyDict as edict from mmdet.apis import inference_detector, init_detector @@ -15,7 +12,8 @@ from mmengine.dist import collect_results_gpu, init_dist from tqdm import tqdm from ymir_exc import result_writer as rw -from ymir_exc.util import (YmirStage, get_merged_config, write_ymir_monitor_process) +from ymir_exc.util import (YmirStage, get_merged_config, + write_ymir_monitor_process) from mmyolo.utils import register_all_modules from ymir.utils.common import get_best_weight_file, get_config_file From ce78026fd851de3f83eb8e789c4a99f290c83c9c Mon Sep 17 00:00:00 2001 From: youdaoyzbx Date: Tue, 31 Jan 2023 19:00:08 +0800 Subject: [PATCH 4/5] fix bug for rtmdet --- mmyolo/engine/hooks/ymir_training_monitor_hook.py | 14 ++++++++------ ymir/Dockerfile | 4 +++- ymir/utils/common.py | 10 ++++++++-- ymir/weights/download_weights.sh | 4 ++++ 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/mmyolo/engine/hooks/ymir_training_monitor_hook.py b/mmyolo/engine/hooks/ymir_training_monitor_hook.py index 1ad79873c..4e0e5123e 100644 --- a/mmyolo/engine/hooks/ymir_training_monitor_hook.py +++ b/mmyolo/engine/hooks/ymir_training_monitor_hook.py @@ -10,7 +10,8 @@ from mmengine.hooks import Hook from mmengine.registry import HOOKS -from ymir_exc.util import (get_merged_config, write_ymir_monitor_process, write_ymir_training_result) +from ymir_exc.util import (get_merged_config, write_ymir_monitor_process, + write_ymir_training_result) @HOOKS.register_module() @@ -46,7 +47,7 @@ def after_val_epoch(self, runner, metrics: Optional[Dict[str, float]] = None) -> """ if 
runner.rank in [0, -1]: N = len('coco/bbox_') - evaluation_result = {key[N:]: value for key, value in metrics.items()} + evaluation_result = {key[N:]: value for key, value in metrics.items()} # type: ignore out_dir = self.ymir_cfg.ymir.output.models_dir cfg_files = glob.glob(osp.join(out_dir, '*.py')) @@ -64,11 +65,12 @@ def after_val_epoch(self, runner, metrics: Optional[Dict[str, float]] = None) -> else: warnings.warn(f'no best checkpoint found on {runner.epoch}') - last_ckpts = glob.glob(osp.join(out_dir, '*.pth')) - if len(last_ckpts) > 0: - logging.info(f'epoch={runner.epoch}, save {newest_best_ckpt} to result.yaml') + latest_ckpts = glob.glob(osp.join(out_dir, '*.pth')) + if len(latest_ckpts) > 0: + last_ckpt = max(latest_ckpts, key=osp.getctime) + logging.info(f'epoch={runner.epoch}, save {last_ckpt} to result.yaml') write_ymir_training_result(self.ymir_cfg, - files=last_ckpts + cfg_files, + files=[last_ckpt] + cfg_files, id='last', evaluation_result=evaluation_result) else: diff --git a/ymir/Dockerfile b/ymir/Dockerfile index 868006284..af0af8e95 100644 --- a/ymir/Dockerfile +++ b/ymir/Dockerfile @@ -26,7 +26,9 @@ RUN cd /app && \ mkdir -p /img-man && \ mv ymir/img-man/*.yaml /img-man && \ mkdir /weights && \ - mv ymir/weights/*.pth /weights + mv ymir/weights/*.pth /weights && \ + mkdir -p /root/.cache/torch/hub/checkpoints && \ + mv ymir/weights/imagenet/*.pth /root/.cache/torch/hub/checkpoints ENV PYTHONPATH=. 
WORKDIR /app diff --git a/ymir/utils/common.py b/ymir/utils/common.py index 82b56a8dd..e50d5d6ae 100644 --- a/ymir/utils/common.py +++ b/ymir/utils/common.py @@ -54,7 +54,9 @@ def recursive_modify_attribute(mmengine_cfgdict: Union[Config, ConfigDict], attr mmengine_cfg.train_num_workers = workers_per_gpu mmengine_cfg.train_dataloader.batch_size = samples_per_gpu mmengine_cfg.train_dataloader.num_workers = workers_per_gpu - mmengine_cfg.optim_wrapper.optimizer.batch_size_per_gpu = samples_per_gpu + + if 'batch_size_per_gpu' in mmengine_cfg.optim_wrapper.optimizer: + mmengine_cfg.optim_wrapper.optimizer.batch_size_per_gpu = samples_per_gpu # modify model output channel num_classes = len(ymir_cfg.param.class_names) @@ -91,8 +93,12 @@ def recursive_modify_attribute(mmengine_cfgdict: Union[Config, ConfigDict], attr max_epochs = int(ymir_cfg.param.max_epochs) mmengine_cfg.train_cfg.max_epochs = max_epochs mmengine_cfg.max_epochs = max_epochs - mmengine_cfg.default_hooks.param_scheduler.max_epochs = max_epochs + param_scheduler_type = mmengine_cfg.default_hooks.param_scheduler.type + if param_scheduler_type == 'YOLOv5ParamSchedulerHook': + mmengine_cfg.default_hooks.param_scheduler.max_epochs = max_epochs + elif param_scheduler_type == 'PPYOLOEParamSchedulerHook': + mmengine_cfg.default_hooks.param_scheduler.total_epochs = max_epochs # modify checkpoint mmengine_cfg.default_hooks.checkpoint['out_dir'] = ymir_cfg.ymir.output.models_dir diff --git a/ymir/weights/download_weights.sh b/ymir/weights/download_weights.sh index 5ec20bad1..b3581596a 100644 --- a/ymir/weights/download_weights.sh +++ b/ymir/weights/download_weights.sh @@ -35,3 +35,7 @@ wget https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-8 wget https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth wget 
https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825-1864e7b3.pth +# rtmdet imagenet pretrain, copy to /root/.cache/torch/hub/checkpoints +cd imagenet +wget https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth +wget https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth From 9191f525da436c166228991e30c257de091a80e2 Mon Sep 17 00:00:00 2001 From: youdaoyzbx Date: Tue, 31 Jan 2023 19:08:24 +0800 Subject: [PATCH 5/5] fix best checkpoint save bug for rtmdet --- .dockerignore | 226 +++++++++++++++++++++++++++++++++++++++++++ ymir/utils/common.py | 1 + 2 files changed, 227 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..840863a57 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,226 @@ +# Repo-specific DockerIgnore ------------------------------------------------------------------------------------------- +#.git +.cache +.idea +runs +output +coco +storage.googleapis.com + +data/samples/* +**/results*.csv +*.jpg + +ymir/tensorrt/build +ymir/tensorrt/pt_result +ymir/tensorrt/trt_result +# Neural Network weights ----------------------------------------------------------------------------------------------- +#**/*.pt +#**/*.pth +**/*.pkl +**/*.onnx +**/*.engine +**/*.mlmodel +**/*.torchscript +**/*.torchscript.pt +**/*.tflite +**/*.h5 +**/*.pb +*_saved_model/ +*_web_model/ +*_openvino_model/ + +# Below Copied From .gitignore ----------------------------------------------------------------------------------------- +# Below Copied From .gitignore ----------------------------------------------------------------------------------------- + + +# GitHub Python GitIgnore ---------------------------------------------------------------------------------------------- +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] 
+*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +wandb/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv* +venv*/ +ENV*/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + + +# https://github.com/github/gitignore/blob/master/Global/macOS.gitignore ----------------------------------------------- + +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon +Icon? 
+ +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/* +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries +.html # Bokeh Plots +.pg # TensorFlow Frozen Graphs +.avi # videos + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ +cmake-build-release/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties diff --git a/ymir/utils/common.py b/ymir/utils/common.py index e50d5d6ae..c1795e705 100644 --- a/ymir/utils/common.py +++ b/ymir/utils/common.py @@ -101,6 +101,7 @@ def recursive_modify_attribute(mmengine_cfgdict: Union[Config, ConfigDict], attr mmengine_cfg.default_hooks.param_scheduler.total_epochs = max_epochs # modify checkpoint mmengine_cfg.default_hooks.checkpoint['out_dir'] = ymir_cfg.ymir.output.models_dir + 
mmengine_cfg.default_hooks.checkpoint['save_best'] = 'auto' # modify tensorboard tensorboard_logger = dict(type='TensorboardVisBackend', save_dir=ymir_cfg.ymir.output.tensorboard_dir)