Skip to content

Commit

Permalink
Merge pull request #1 from modelai/ymir-dev
Browse files Browse the repository at this point in the history
first commit for ymir
  • Loading branch information
yzbx committed Feb 1, 2023
2 parents e62c8c4 + 9191f52 commit 24371ae
Show file tree
Hide file tree
Showing 19 changed files with 1,293 additions and 2 deletions.
226 changes: 226 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
# Repo-specific DockerIgnore -------------------------------------------------------------------------------------------
#.git
.cache
.idea
runs
output
coco
storage.googleapis.com

data/samples/*
**/results*.csv
*.jpg

ymir/tensorrt/build
ymir/tensorrt/pt_result
ymir/tensorrt/trt_result
# Neural Network weights -----------------------------------------------------------------------------------------------
#**/*.pt
#**/*.pth
**/*.pkl
**/*.onnx
**/*.engine
**/*.mlmodel
**/*.torchscript
**/*.torchscript.pt
**/*.tflite
**/*.h5
**/*.pb
*_saved_model/
*_web_model/
*_openvino_model/

# Below Copied From .gitignore -----------------------------------------------------------------------------------------


# GitHub Python GitIgnore ----------------------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
wandb/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv*
venv*/
ENV*/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/


# https://github.com/github/gitignore/blob/master/Global/macOS.gitignore -----------------------------------------------

# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon
Icon?

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk


# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff:
.idea/*
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries
# Bokeh Plots
.html
# TensorFlow Frozen Graphs
.pg
# videos
.avi

# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml

# Gradle:
.idea/**/gradle.xml
.idea/**/libraries

# CMake
cmake-build-debug/
cmake-build-release/

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
3 changes: 2 additions & 1 deletion mmyolo/engine/hooks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .ppyoloe_param_scheduler_hook import PPYOLOEParamSchedulerHook
from .switch_to_deploy_hook import SwitchToDeployHook
from .ymir_training_monitor_hook import YmirTrainingMonitorHook
from .yolov5_param_scheduler_hook import YOLOv5ParamSchedulerHook
from .yolox_mode_switch_hook import YOLOXModeSwitchHook

# Names exported by ``from mmyolo.engine.hooks import *``.
# Every entry needs its own comma: adjacent string literals are silently
# concatenated by Python, which would produce a single bogus export name.
__all__ = [
    'YOLOv5ParamSchedulerHook', 'YOLOXModeSwitchHook', 'SwitchToDeployHook',
    'PPYOLOEParamSchedulerHook', 'YmirTrainingMonitorHook'
]
77 changes: 77 additions & 0 deletions mmyolo/engine/hooks/ymir_training_monitor_hook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
hook for ymir training process, write the monitor.txt, save the latest model
"""
import glob
import logging
import os.path as osp
import re
import warnings
from typing import Dict, Optional, Union

from mmengine.hooks import Hook
from mmengine.registry import HOOKS
from ymir_exc.util import (get_merged_config, write_ymir_monitor_process,
write_ymir_training_result)


@HOOKS.register_module()
class YmirTrainingMonitorHook(Hook):
    """Report training progress and checkpoints to ymir.

    For epoch based training loops only. On rank 0 (or -1) the hook:

    1. writes the training progress via ``write_ymir_monitor_process`` every
       ``interval`` inner iterations;
    2. after each validation epoch, registers the newest ``*.pth`` file in the
       models directory with ``id='last'``, if one exists;
    3. after each validation epoch, registers the newest ``*.pth`` file in the
       ``best_coco`` sub-directory with ``id='best'``, if it belongs to the
       current epoch.

    Note that a registered checkpoint file may be cleaned up later.

    Args:
        interval (int): Number of inner iterations between two progress
            reports. Defaults to 10.
    """
    # The priority must be lower than CheckpointHook (priority = VERY_LOW),
    # because this hook looks for checkpoint files on disk.
    priority = 'LOWEST'

    # Prefix removed from metric names before they are reported to ymir.
    _METRIC_PREFIX = 'coco/bbox_'

    def __init__(self, interval: int = 10):
        self.interval = interval
        self.ymir_cfg = get_merged_config()

    @classmethod
    def _strip_metric_prefix(cls, metrics: Optional[Dict[str, float]]) -> Dict[str, float]:
        """Return ``metrics`` with the ``coco/bbox_`` prefix removed from keys.

        Keys without the prefix are kept unchanged instead of being truncated,
        and a missing (``None``) metrics dict yields an empty result.
        """
        if not metrics:
            return {}
        prefix = cls._METRIC_PREFIX
        return {(key[len(prefix):] if key.startswith(prefix) else key): value for key, value in metrics.items()}

    @staticmethod
    def _parse_epoch(ckpt_path: str) -> Optional[int]:
        """Extract the epoch number from a checkpoint file name.

        Only the base name is inspected, so digits in the directory part of
        the path cannot be mistaken for the epoch. Returns ``None`` when the
        file name contains no number.
        """
        file_name = osp.basename(ckpt_path)
        match = re.search(r'epoch_(\d+)', file_name)
        if match is not None:
            return int(match.group(1))
        # Fall back to the last number in the file name.
        numbers = re.findall(r'\d+', file_name)
        return int(numbers[-1]) if numbers else None

    def after_train_iter(self,
                         runner,
                         batch_idx: int,
                         data_batch: Optional[Union[dict, tuple, list]] = None,
                         outputs: Optional[dict] = None) -> None:
        """Write the overall training progress every ``interval`` iterations.

        The progress is the fraction of finished epochs plus the fraction of
        the current epoch, divided by ``runner.max_epochs``.
        """
        if runner.rank in [0, -1] and self.every_n_inner_iters(batch_idx, self.interval):
            percent = (runner.epoch + batch_idx / len(runner.train_dataloader)) / runner.max_epochs
            write_ymir_monitor_process(self.ymir_cfg, task='training', naive_stage_percent=percent, stage='task')

    def after_val_epoch(self, runner, metrics: Optional[Dict[str, float]] = None) -> None:
        """Register the best and the latest checkpoint with ymir.

        metrics: {'coco/bbox_mAP': 0.001, 'coco/bbox_mAP_50': 0.003, 'coco/bbox_mAP_75': 0.0, ...}
        evaluation_result: {'mAP': 0.001, 'mAP_50': 0.003, ...}
        """
        if runner.rank not in [0, -1]:
            return

        evaluation_result = self._strip_metric_prefix(metrics)
        out_dir = self.ymir_cfg.ymir.output.models_dir
        # Config files are saved together with every registered checkpoint.
        cfg_files = glob.glob(osp.join(out_dir, '*.py'))

        best_ckpts = glob.glob(osp.join(out_dir, 'best_coco', '*.pth'))
        if best_ckpts:
            newest_best_ckpt = max(best_ckpts, key=osp.getctime)
            best_epoch = self._parse_epoch(newest_best_ckpt)
            if best_epoch is None:
                warnings.warn(f'cannot parse epoch from best checkpoint {newest_best_ckpt}')
            elif best_epoch == runner.epoch:
                # if current checkpoint is the newest checkpoint, keep it
                logging.info(f'epoch={runner.epoch}, save {newest_best_ckpt} to result.yaml')
                write_ymir_training_result(self.ymir_cfg,
                                           files=[newest_best_ckpt] + cfg_files,
                                           id='best',
                                           evaluation_result=evaluation_result)
        else:
            warnings.warn(f'no best checkpoint found on {runner.epoch}')

        latest_ckpts = glob.glob(osp.join(out_dir, '*.pth'))
        if latest_ckpts:
            last_ckpt = max(latest_ckpts, key=osp.getctime)
            logging.info(f'epoch={runner.epoch}, save {last_ckpt} to result.yaml')
            write_ymir_training_result(self.ymir_cfg,
                                       files=[last_ckpt] + cfg_files,
                                       id='last',
                                       evaluation_result=evaluation_result)
        else:
            warnings.warn(f'no latest checkpoint found on {runner.epoch}')
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
-r requirements/build.txt
-r requirements/runtime.txt
-r requirements/tests.txt
# -r requirements/tests.txt
-r requirements/albu.txt
-r requirements/ymir.txt
7 changes: 7 additions & 0 deletions requirements/ymir.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
opencv-contrib-python>=4.0
easydict
tqdm
imagesize
nptyping
tensorboard
-e git+https://github.com/modelai/ymir-executor-sdk.git@ymir2.1.0#egg=ymir-exc
4 changes: 4 additions & 0 deletions tools/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from mmengine.config import Config, DictAction
from mmengine.logging import print_log
from mmengine.runner import Runner
from ymir_exc.util import get_merged_config

from mmyolo.registry import RUNNERS
from mmyolo.utils import register_all_modules
from ymir.utils.common import modify_mmengine_config


def parse_args():
Expand Down Expand Up @@ -61,6 +63,8 @@ def main():

# load config
cfg = Config.fromfile(args.config)
ymir_cfg = get_merged_config()
modify_mmengine_config(cfg, ymir_cfg)
# replace the ${key} with the value of cfg.key
# cfg = replace_cfg_vals(cfg)
cfg.launcher = args.launcher
Expand Down
37 changes: 37 additions & 0 deletions ymir/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Image for running MMYOLO as a ymir executor.
# Base image: PyTorch 1.11.0 with CUDA 11.3 and cuDNN 8 (runtime variant).
FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime

# To fix GPG key error when running apt-get update:
# install gnupg2, then fetch the NVIDIA CUDA and machine-learning repository keys.
RUN apt-get update && apt-get install -y gnupg2
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# (Optional) Point apt at the Aliyun Ubuntu mirror and pip at the Tsinghua PyPI mirror.
RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

# Install the required system packages, then remove the apt caches and package lists.
RUN apt-get update \
&& apt-get install -y gcc ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev vim \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Install OpenMIM and use it to install mmengine, mmcv and mmdet within the given version ranges.
RUN pip install openmim \
&& mim install "mmengine>=0.3.1" \
&& mim install "mmcv>=2.0.0rc1,<2.1.0" \
&& mim install "mmdet>=3.0.0rc5,<3.1.0"

# Install MMYOLO: copy the build context to /app and install its Python requirements,
# then move the ymir/img-man YAML files to /img-man, the ymir/weights *.pth files to /weights
# and the ymir/weights/imagenet *.pth files to the torch hub checkpoint directory.
# NOTE: the build fails if any of the three globs below matches no file.
COPY . /app
RUN cd /app && \
pip install --no-cache-dir -r requirements.txt && \
mkdir -p /img-man && \
mv ymir/img-man/*.yaml /img-man && \
mkdir /weights && \
mv ymir/weights/*.pth /weights && \
mkdir -p /root/.cache/torch/hub/checkpoints && \
mv ymir/weights/imagenet/*.pth /root/.cache/torch/hub/checkpoints

# Add the current working directory (/app, set below) to the Python import path.
ENV PYTHONPATH=.
WORKDIR /app

# Create the start script that launches ymir/start.py and use it as the default command.
RUN echo "python3 ymir/start.py" > /usr/bin/start.sh
CMD bash /usr/bin/start.sh
Loading

0 comments on commit 24371ae

Please sign in to comment.