train_stage2.yaml
model:
  cldm:
    target: model.cldm.ControlLDM
    params:
      latent_scale_factor: 0.18215
      unet_cfg:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False
      vae_cfg:
        embed_dim: 4
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
      clip_cfg:
        embed_dim: 1024
        vision_cfg:
          image_size: 224
          layers: 32
          width: 1280
          head_width: 80
          patch_size: 14
        text_cfg:
          context_length: 77
          vocab_size: 49408
          width: 1024
          heads: 16
          layers: 24
        layer: "penultimate"
      controlnet_cfg:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        hint_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

  swinir:
    target: model.swinir.SwinIR
    params:
      img_size: 64
      patch_size: 1
      in_chans: 3
      embed_dim: 180
      depths: [6, 6, 6, 6, 6, 6, 6, 6]
      num_heads: [6, 6, 6, 6, 6, 6, 6, 6]
      window_size: 8
      mlp_ratio: 2
      sf: 8
      img_range: 1.0
      upsampler: "nearest+conv"
      resi_connection: "1conv"
      unshuffle: True
      unshuffle_scale: 8

  diffusion:
    target: model.gaussian_diffusion.Diffusion
    params:
      linear_start: 0.00085
      linear_end: 0.0120
      timesteps: 1000

dataset:
  train:
    target: dataset.codeformer.CodeformerDataset
    params:
      # training file list path
      file_list:
      file_backend_cfg:
        target: dataset.file_backend.HardDiskBackend
      out_size: 512
      crop_type: center
      blur_kernel_size: 41
      kernel_list: ['iso', 'aniso']
      kernel_prob: [0.5, 0.5]
      blur_sigma: [0.1, 12]
      downsample_range: [1, 12]
      noise_range: [0, 15]
      jpeg_range: [30, 100]

train:
  # pretrained sd v2.1 path
  sd_path:
  # experiment directory path
  exp_dir:
  # stage 1 swinir path
  swinir_path:
  learning_rate: 1e-4
  # ImageNet 1k (1.3M images)
  # batch size = 192, lr = 1e-4, total training steps = 25k
  # Our filtered laion2b-en (15M images)
  # batch size = 256, lr = 1e-4 (first 30k), 1e-5 (next 50k), total training steps = 80k
  batch_size: 256
  num_workers:
  train_steps: 30000
  log_every: 50
  ckpt_every: 10000
  image_every: 1000
  resume: ~
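
For reference, a config in this shape is normally loaded with OmegaConf, and each `target`/`params` pair is instantiated by importing the dotted path and calling it with the params as keyword arguments, following the usual latent-diffusion-style convention. The sketch below spells that convention out; the helper function, the config path, and the assumption that the repository's `model.*` modules are importable from the working directory are illustrative and not taken from this file.

```python
# Minimal sketch (assumptions noted above): load the YAML and build the
# objects declared under `model:` via their `target`/`params` entries.
import importlib

from omegaconf import OmegaConf


def instantiate_from_config(cfg):
    # Import cfg["target"] (a dotted module.Class path) and call it with cfg["params"].
    module_path, cls_name = cfg["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    return cls(**cfg.get("params", {}))


cfg = OmegaConf.load("configs/train_stage2.yaml")  # hypothetical path

# The three components declared under `model:`.
cldm = instantiate_from_config(cfg.model.cldm)
swinir = instantiate_from_config(cfg.model.swinir)
diffusion = instantiate_from_config(cfg.model.diffusion)

# Training hyperparameters come from the flat `train:` block.
print(cfg.train.batch_size, cfg.train.learning_rate, cfg.train.train_steps)
```

The blank values (`file_list`, `sd_path`, `exp_dir`, `swinir_path`, `num_workers`) are meant to be filled in before training and are left empty here as in the original file.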