From f37702cbceea9b9ef2f39705642e53d638595af6 Mon Sep 17 00:00:00 2001
From: Josh Holmer <jholmer.in@gmail.com>
Date: Sun, 23 Oct 2022 23:54:58 -0400
Subject: [PATCH] Add developer CLI flags for tuning encoder internals

Currently adds the following:
- `--deblock-strength`
- `--deblock-sharpness`
- `--ip-qidx-ratio`
- `--pb-qidx-ratio`
- `--b-qidx-ratio`
- `--temporal-rdo-strength`

These flags will be available only if the `devel` feature flag
is active, as they are intended to be used to ease development
and not to be tweaked by end users. rav1e continues to subscribe
to the philosophy that end users should not need to be cargo culting
their command lines to get optimal results, and we would prefer to
tune the encoder internals so that users do not have to.
---
 .github/workflows/rav1e.yml |  3 +-
 Cargo.toml                  |  3 ++
 src/api/config/encoder.rs   | 45 ++++++++++++++++++++
 src/api/test.rs             |  2 +
 src/bin/common.rs           | 62 +++++++++++++++++++++++++++-
 src/encoder.rs              | 20 ++++++++-
 src/fuzzing.rs              |  1 +
 src/rate.rs                 | 82 ++++++++++++++++++++++++++++---------
 src/rdo.rs                  | 18 ++++++++
 9 files changed, 212 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml
index f59560b84f..3bd52c3560 100644
--- a/.github/workflows/rav1e.yml
+++ b/.github/workflows/rav1e.yml
@@ -12,7 +12,6 @@ on:
 
 jobs:
   rustfmt-clippy:
-
     runs-on: ubuntu-22.04
 
     steps:
@@ -193,7 +192,7 @@ jobs:
       - name: Check extra features
         if: matrix.toolchain == 'stable' && matrix.conf == 'check-extra-feats'
         run: |
-          cargo check --features=check_asm,capi,dump_lookahead_data,serialize,bench --all-targets
+          cargo check --features=check_asm,capi,dump_lookahead_data,serialize,bench,devel --all-targets
       - name: Check extra features
         if: matrix.toolchain == 'stable' && matrix.conf == 'check-unstable-feats'
         run: |
diff --git a/Cargo.toml b/Cargo.toml
index 320e7aafa4..29791822c6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,9 @@ default-run = "rav1e"
 
 [features]
 unstable = []
+# Exposes extra flags for tuning encoder internals.
+# Intended to be used by developers to find ideal internal settings.
+devel = []
 channel-api = ["crossbeam"]
 decode_test = ["aom-sys"]
 decode_test_dav1d = ["dav1d-sys"]
diff --git a/src/api/config/encoder.rs b/src/api/config/encoder.rs
index 7f84d5a081..17caf5c652 100644
--- a/src/api/config/encoder.rs
+++ b/src/api/config/encoder.rs
@@ -108,6 +108,50 @@ pub struct EncoderConfig {
 
   /// Settings which affect the encoding speed vs. quality trade-off.
   pub speed_settings: SpeedSettings,
+
+  /// Advanced settings which are intended for use by developers.
+  /// Non-developers should use the default values.
+  pub advanced_flags: AdvancedTuning,
+}
+
+/// Advanced settings that are intended for use by developers
+/// for tuning encoder internals.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct AdvancedTuning {
+  /// Controls the strength of the deblock filter, as a multiplier to the default. Default is 1.0.
+  pub deblock_strength: f32,
+  /// Controls the sharpness of the deblock filter. Accepts a value from 0-7. Default is 0.
+  pub deblock_sharpness: u8,
+  /// Controls the ratio between intra frame and inter frame quantizers, as a multiplier.
+  /// Default is 1.0. Higher values create a higher quantizer difference, while lower values
+  /// create a lower quantizer difference. A value of 0.0 would mean that I and P quantizers
+  /// are the same.
+  pub ip_qidx_ratio: f32,
+  /// Controls the ratio between "P"-frame and "B"-frame quantizers, as a multiplier.
+  /// Default is 1.0. Higher values create a higher quantizer difference, while lower values
+  /// create a lower quantizer difference. A value of 0.0 would mean that P and B quantizers
+  /// are the same.
+  pub pb_qidx_ratio: f32,
+  /// Controls the ratio between frame quantizers in the levels of the pyramid between "B"-frames,
+  /// as a multiplier. Default is 1.0. Higher values create a higher quantizer difference,
+  /// while lower values create a lower quantizer difference. A value of 0.0 would mean that
+  /// B0 and B1 quantizers are the same.
+  pub b_qidx_ratio: f32,
+  /// Controls the strength of temporal RDO, as a multiplier to the default. Default is 1.0.
+  pub temporal_rdo_strength: f32,
+}
+
+impl Default for AdvancedTuning {
+  fn default() -> Self {
+    Self {
+      deblock_strength: 1.0, // multiplier applied to the deblock levels of inter frames
+      deblock_sharpness: 0, // copied as-is into `DeblockState::sharpness`
+      ip_qidx_ratio: 1.0, // same value `dqp_q57` hardcodes when `devel` is off
+      pb_qidx_ratio: 1.0, // same value `dqp_q57` hardcodes when `devel` is off
+      b_qidx_ratio: 1.0, // same value `dqp_q57` hardcodes when `devel` is off
+      temporal_rdo_strength: 1.0, // handed to `DistortionScale::strength_adjusted`
+    }
+  }
 }
 
 /// Default preset for `EncoderConfig`: it is a balance between quality and
@@ -163,6 +207,7 @@ impl EncoderConfig {
       tile_rows: 0,
       tiles: 0,
       speed_settings: SpeedSettings::from_preset(speed),
+      advanced_flags: Default::default(),
     }
   }
 
diff --git a/src/api/test.rs b/src/api/test.rs
index 072562631f..b82f3311fb 100644
--- a/src/api/test.rs
+++ b/src/api/test.rs
@@ -2164,6 +2164,7 @@ fn log_q_exp_overflow() {
       },
       ..Default::default()
     },
+    advanced_flags: Default::default(),
   };
   let config = Config::new().with_encoder_config(enc).with_threads(1);
 
@@ -2240,6 +2241,7 @@ fn guess_frame_subtypes_assert() {
       },
       ..Default::default()
     },
+    advanced_flags: Default::default(),
   };
   let config = Config::new().with_encoder_config(enc).with_threads(1);
 
diff --git a/src/bin/common.rs b/src/bin/common.rs
index 3e0d5aa8c4..f541e737ae 100644
--- a/src/bin/common.rs
+++ b/src/bin/common.rs
@@ -242,10 +242,53 @@ pub struct CliOptions {
   #[clap(long, short, value_parser, help_heading = "DEBUGGING")]
   pub reconstruction: Option<PathBuf>,
 
+  /// Controls the strength of the deblock filter, as a multiplier to the default.
+  #[cfg(feature = "devel")]
+  #[clap(long, value_parser = positive_float, default_value_t=1.0f32, help_heading = "ADVANCED")]
+  pub deblock_strength: f32,
+  /// Controls the sharpness of the deblock filter. Accepts a value from 0-7.
+  #[cfg(feature = "devel")]
+  #[clap(long, value_parser = clap::value_parser!(u8).range(0..=7), default_value_t=0, help_heading = "ADVANCED")]
+  pub deblock_sharpness: u8,
+  /// Controls the ratio between intra frame and inter frame quantizers, as a multiplier.
+  /// Higher values create a higher quantizer difference, while lower values
+  /// create a lower quantizer difference. A value of 0.0 would mean that I and P quantizers
+  /// are the same.
+  #[cfg(feature = "devel")]
+  #[clap(long, value_parser = positive_float, default_value_t=1.0f32, help_heading = "ADVANCED")]
+  pub ip_qidx_ratio: f32,
+  /// Controls the ratio between "P"-frame and "B"-frame quantizers, as a multiplier.
+  /// Default is 1.0. Higher values create a higher quantizer difference, while lower values
+  /// create a lower quantizer difference. A value of 0.0 would mean that P and B quantizers
+  /// are the same.
+  #[cfg(feature = "devel")]
+  #[clap(long, value_parser = positive_float, default_value_t=1.0f32, help_heading = "ADVANCED")]
+  pub pb_qidx_ratio: f32,
+  /// Controls the ratio between frame quantizers in the levels of the pyramid between "B"-frames,
+  /// as a multiplier. Default is 1.0. Higher values create a higher quantizer difference,
+  /// while lower values create a lower quantizer difference. A value of 0.0 would mean that
+  /// B0 and B1 quantizers are the same.
+  #[cfg(feature = "devel")]
+  #[clap(long, value_parser = positive_float, default_value_t=1.0f32, help_heading = "ADVANCED")]
+  pub b_qidx_ratio: f32,
+  /// Controls the strength of temporal RDO, as a multiplier to the default.
+  #[cfg(feature = "devel")]
+  #[clap(long, value_parser = positive_float, default_value_t=1.0f32, help_heading = "ADVANCED")]
+  pub temporal_rdo_strength: f32,
+
   #[clap(subcommand)]
   pub command: Option<Commands>,
 }
 
+#[cfg(feature = "devel")]
+fn positive_float(input: &str) -> Result<f32, String> {
+  // "NaN"/"inf" parse as `f32` and `NaN < 0.0` is false, so check finiteness as well as sign.
+  match input.parse::<f32>().map_err(|e| e.to_string())? {
+    value if value.is_finite() && value >= 0.0 => Ok(value),
+    _ => Err("Value must be a finite, non-negative number".to_string()),
+  }
+}
+
 fn get_version() -> &'static str {
   static VERSION_STR: Lazy<String> = Lazy::new(|| {
     format!(
@@ -299,7 +342,7 @@ pub enum Commands {
     #[clap(long, short, value_parser)]
     save_config: Option<PathBuf>,
     /// Load the encoder configuration from a toml file
-    #[clap(long, short, value_parser, conflicts_with = "save-config")]
+    #[clap(long, short, value_parser, conflicts_with = "save_config")]
     load_config: Option<PathBuf>,
   },
 }
@@ -484,6 +527,18 @@ pub fn parse_cli() -> Result<ParsedCliOptions, CliError> {
   })
 }
 
+#[cfg(feature = "devel")] // the `CliOptions` fields read below only exist with this feature
+const fn parse_advanced_flags(cli: &CliOptions) -> AdvancedTuning {
+  AdvancedTuning { // field-for-field copy of the developer flags; no further validation here
+    deblock_strength: cli.deblock_strength,
+    deblock_sharpness: cli.deblock_sharpness,
+    ip_qidx_ratio: cli.ip_qidx_ratio,
+    pb_qidx_ratio: cli.pb_qidx_ratio,
+    b_qidx_ratio: cli.b_qidx_ratio,
+    temporal_rdo_strength: cli.temporal_rdo_strength,
+  }
+}
+
 fn parse_config(matches: &CliOptions) -> Result<EncoderConfig, CliError> {
   let maybe_quantizer = matches.quantizer;
   let maybe_bitrate = matches.bitrate;
@@ -674,5 +729,10 @@ fn parse_config(matches: &CliOptions) -> Result<EncoderConfig, CliError> {
     cfg.speed_settings.scene_detection_mode = SceneDetectionSpeed::None;
   }
 
+  #[cfg(feature = "devel")]
+  {
+    cfg.advanced_flags = parse_advanced_flags(matches);
+  }
+
   Ok(cfg)
 }
diff --git a/src/encoder.rs b/src/encoder.rs
index dbf8fd97a4..9255178f65 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -470,7 +470,7 @@ impl<T: Pixel> FrameState<T> {
       cdfs: CDFContext::new(0),
       context_update_tile_id: 0,
       max_tile_size_bytes: 0,
-      deblock: Default::default(),
+      deblock: DeblockState::new(&fi.config, fi.frame_type),
       segmentation: Default::default(),
       restoration: rs,
       frame_me_stats: me_stats,
@@ -501,7 +501,7 @@ impl<T: Pixel> FrameState<T> {
       cdfs: CDFContext::new(0),
       context_update_tile_id: 0,
       max_tile_size_bytes: 0,
-      deblock: Default::default(),
+      deblock: DeblockState::new(&fi.config, fi.frame_type),
       segmentation: Default::default(),
       restoration: rs,
       frame_me_stats: FrameMEStats::new_arc_array(fi.w_in_b, fi.h_in_b),
@@ -543,6 +543,22 @@ pub struct DeblockState {
   pub block_delta_multi: bool,
 }
 
+impl DeblockState {
+  /// Builds the initial deblock state; the strength multiplier only scales inter-frame levels.
+  pub fn new(config: &EncoderConfig, frame_type: FrameType) -> Self {
+    let tuning = &config.advanced_flags;
+    let mut state =
+      Self { sharpness: tuning.deblock_sharpness, ..Default::default() };
+    if frame_type == FrameType::INTER {
+      state.levels.iter_mut().for_each(|level| {
+        let scaled = f32::from(*level) * tuning.deblock_strength;
+        *level = scaled.min(MAX_LOOP_FILTER as f32).round() as u8;
+      });
+    }
+    state
+  }
+}
+
 impl Default for DeblockState {
   fn default() -> Self {
     DeblockState {
diff --git a/src/fuzzing.rs b/src/fuzzing.rs
index aab9abe059..2d767da8a2 100644
--- a/src/fuzzing.rs
+++ b/src/fuzzing.rs
@@ -257,6 +257,7 @@ impl Arbitrary for ArbitraryEncoder {
       switch_frame_interval: u.int_in_range(0..=3)?,
       tune: *u.choose(&[Tune::Psnr, Tune::Psychovisual])?,
       film_grain_params: None,
+      advanced_flags: Default::default(),
     };
 
     let frame_count =
diff --git a/src/rate.rs b/src/rate.rs
index e7633777a1..2df8359aa6 100644
--- a/src/rate.rs
+++ b/src/rate.rs
@@ -14,6 +14,7 @@ use crate::quantize::{ac_q, dc_q, select_ac_qi, select_dc_qi};
 use crate::util::{
   bexp64, bexp_q24, blog64, clamp, q24_to_q57, q57, q57_to_q24, Pixel,
 };
+use debug_unreachable::debug_unreachable;
 use std::cmp;
 
 // The number of frame sub-types for which we track distinct parameters.
@@ -71,14 +72,30 @@ const MQP_Q12: &[i32; FRAME_NSUBTYPES] = &[
   (1.0 * (1 << 12) as f64) as i32,
 ];
 
-// The ratio 33_810_170.0 / 86_043_287.0 was derived by approximating the median
-// of a change of 15 quantizer steps in the quantizer tables.
-const DQP_Q57: &[i64; FRAME_NSUBTYPES] = &[
-  (-(33_810_170.0 / 86_043_287.0) * (1i64 << 57) as f64) as i64,
-  (0.0 * (1i64 << 57) as f64) as i64,
-  ((33_810_170.0 / 86_043_287.0) * (1i64 << 57) as f64) as i64,
-  (2.0 * (33_810_170.0 / 86_043_287.0) * (1i64 << 57) as f64) as i64,
-];
+#[cfg_attr(not(feature = "devel"), allow(unused_variables))]
+fn dqp_q57(fti: usize, ip_ratio: f64, pb_ratio: f64, b_ratio: f64) -> i64 {
+  // The ratio 33_810_170.0 / 86_043_287.0 was derived by approximating the median
+  // of a change of 15 quantizer steps in the quantizer tables.
+  const BASE: f64 = (33_810_170.0 / 86_043_287.0) * (1i64 << 57) as f64;
+
+  // If we are not in devel mode, hardcode these as constants so the compiler
+  // can optimize better. The shadowing below leaves the parameters unused.
+  #[cfg(not(feature = "devel"))]
+  let ip_ratio = 1.0;
+  #[cfg(not(feature = "devel"))]
+  let pb_ratio = 1.0;
+  #[cfg(not(feature = "devel"))]
+  let b_ratio = 1.0;
+
+  match fti { // per-subtype offset that callers add to the Q57 log-quantizer
+    FRAME_SUBTYPE_I => (-ip_ratio * BASE) as i64, // negated: I frames sit below the P baseline
+    FRAME_SUBTYPE_P => 0i64, // P frames are the reference point and get no offset
+    FRAME_SUBTYPE_B0 => (pb_ratio * BASE) as i64,
+    FRAME_SUBTYPE_B1 => ((pb_ratio + b_ratio) * BASE) as i64, // B0 offset plus one more B step
+    // SAFETY: `fti` must be one of the four subtypes above; debug builds catch violations. NOTE(review): visible callers index `MQP_Q12[fti]` first, which presumably bounds it - confirm.
+    _ => unsafe { debug_unreachable!("Unsupported frame subtype") },
+  }
+}
 
 // For 8-bit-depth inter frames, log_q_y is derived from log_target_q with a
 //  linear model:
@@ -703,11 +720,12 @@ impl RCState {
 
   pub(crate) fn select_first_pass_qi(
     &self, bit_depth: usize, fti: usize, chroma_sampling: ChromaSampling,
+    ip_ratio: f64, pb_ratio: f64, b_ratio: f64,
   ) -> QuantizerParameters {
     // Adjust the quantizer for the frame type, result is Q57:
     let log_q = ((self.pass1_log_base_q + (1i64 << 11)) >> 12)
       * (MQP_Q12[fti] as i64)
-      + DQP_Q57[fti];
+      + dqp_q57(fti, ip_ratio, pb_ratio, b_ratio);
     QuantizerParameters::new_from_log_q(
       self.pass1_log_base_q,
       log_q,
@@ -723,14 +741,24 @@ impl RCState {
     &self, ctx: &ContextInner<T>, output_frameno: u64, fti: usize,
     maybe_prev_log_base_q: Option<i64>, log_isqrt_mean_scale: i64,
   ) -> QuantizerParameters {
+    let ip_ratio = ctx.config.advanced_flags.ip_qidx_ratio as f64;
+    let pb_ratio = ctx.config.advanced_flags.pb_qidx_ratio as f64;
+    let b_ratio = ctx.config.advanced_flags.b_qidx_ratio as f64;
+
     // Is rate control active?
     if self.target_bitrate <= 0 {
       // Rate control is not active.
       // Derive quantizer directly from frame type.
       let bit_depth = ctx.config.bit_depth;
       let chroma_sampling = ctx.config.chroma_sampling;
-      let (log_base_q, log_q) =
-        Self::calc_flat_quantizer(ctx.config.quantizer as u8, bit_depth, fti);
+      let (log_base_q, log_q) = Self::calc_flat_quantizer(
+        ctx.config.quantizer as u8,
+        bit_depth,
+        fti,
+        ip_ratio,
+        pb_ratio,
+        b_ratio,
+      );
       QuantizerParameters::new_from_log_q(
         log_base_q,
         log_q,
@@ -752,6 +780,9 @@ impl RCState {
             ctx.config.bit_depth,
             fti,
             ctx.config.chroma_sampling,
+            ip_ratio,
+            pb_ratio,
+            b_ratio,
           );
         }
         // Second pass of 2-pass mode: we know exactly how much of each frame
@@ -925,7 +956,7 @@ impl RCState {
           // Modulate base quantizer by frame type.
           let log_q = ((log_base_q + (1i64 << 11)) >> 12)
             * (MQP_Q12[ftj] as i64)
-            + DQP_Q57[ftj];
+            + dqp_q57(ftj, ip_ratio, pb_ratio, b_ratio);
           // All the fields here are Q57 except for the exponent, which is
           //  Q6.
           bits += (nframes[ftj] as i64)
@@ -959,7 +990,7 @@ impl RCState {
       // Modulate base quantizer by frame type.
       let mut log_q = ((log_base_q + (1i64 << 11)) >> 12)
         * (MQP_Q12[fti] as i64)
-        + DQP_Q57[fti];
+        + dqp_q57(fti, ip_ratio, pb_ratio, b_ratio);
       // The above allocation looks only at the total rate we'll accumulate
       //  in the next reservoir_frame_delay frames.
       // However, we could overflow the bit reservoir on the very next
@@ -1019,14 +1050,26 @@ impl RCState {
       }
 
       if let Some(qi_max) = self.maybe_ac_qi_max {
-        let (max_log_base_q, max_log_q) =
-          Self::calc_flat_quantizer(qi_max, ctx.config.bit_depth, fti);
+        let (max_log_base_q, max_log_q) = Self::calc_flat_quantizer(
+          qi_max,
+          ctx.config.bit_depth,
+          fti,
+          ip_ratio,
+          pb_ratio,
+          b_ratio,
+        );
         log_base_q = cmp::min(log_base_q, max_log_base_q);
         log_q = cmp::min(log_q, max_log_q);
       }
       if self.ac_qi_min > 0 {
-        let (min_log_base_q, min_log_q) =
-          Self::calc_flat_quantizer(self.ac_qi_min, ctx.config.bit_depth, fti);
+        let (min_log_base_q, min_log_q) = Self::calc_flat_quantizer(
+          self.ac_qi_min,
+          ctx.config.bit_depth,
+          fti,
+          ip_ratio,
+          pb_ratio,
+          b_ratio,
+        );
         log_base_q = cmp::max(log_base_q, min_log_base_q);
         log_q = cmp::max(log_q, min_log_q);
       }
@@ -1044,7 +1087,8 @@ impl RCState {
   // Computes a quantizer directly from the frame type and base quantizer index,
   // without consideration for rate control.
   fn calc_flat_quantizer(
-    base_qi: u8, bit_depth: usize, fti: usize,
+    base_qi: u8, bit_depth: usize, fti: usize, ip_ratio: f64, pb_ratio: f64,
+    b_ratio: f64,
   ) -> (i64, i64) {
     // TODO: Rename "quantizer" something that indicates it is a quantizer
     //  index, and move it somewhere more sensible (or choose a better way to
@@ -1063,7 +1107,7 @@ impl RCState {
     let log_base_q = (log_ac_q + log_dc_q + 1) >> 1;
     // Adjust the quantizer for the frame type, result is Q57:
     let log_q = ((log_base_q + (1i64 << 11)) >> 12) * (MQP_Q12[fti] as i64)
-      + DQP_Q57[fti];
+      + dqp_q57(fti, ip_ratio, pb_ratio, b_ratio);
     (log_base_q, log_q)
   }
 
diff --git a/src/rdo.rs b/src/rdo.rs
index 553d6f9d75..aa0a33a4b6 100644
--- a/src/rdo.rs
+++ b/src/rdo.rs
@@ -455,6 +455,7 @@ pub fn distortion_scale<T: Pixel>(
 
   let coded_data = fi.coded_frame_data.as_ref().unwrap();
   coded_data.distortion_scales[y * coded_data.w_in_imp_b + x]
+    .strength_adjusted(fi.config.advanced_flags.temporal_rdo_strength as f64)
 }
 
 /// # Panics
@@ -504,6 +505,7 @@ pub fn spatiotemporal_scale<T: Pixel>(
     .sum::<u64>();
   }
   DistortionScale(((sum + (den >> 1)) / den) as u32)
+    .strength_adjusted(fi.config.advanced_flags.temporal_rdo_strength as f64)
 }
 
 pub fn distortion_scale_for(
@@ -617,6 +619,22 @@ impl DistortionScale {
   pub const fn mul_u64(self, dist: u64) -> u64 {
     (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT
   }
+
+  #[inline]
+  #[cfg(feature = "devel")]
+  pub fn strength_adjusted(self, strength: f64) -> Self {
+    // Scale the offset of `self` from 1.0: strength 0.0 yields a scale of 1.0, and strength
+    // 1.0 reproduces `self` (up to conversion rounding), matching the non-devel build.
+    DistortionScale::from((1.0 + (f64::from(self) - 1.0) * strength).max(0.0))
+  }
+
+  #[inline(always)]
+  #[cfg(not(feature = "devel"))]
+  pub fn strength_adjusted(self, _strength: f64) -> Self {
+    // If we aren't using a devel build, just return self
+    // so we do not add any performance cost.
+    self
+  }
 }
 
 impl std::ops::Mul for DistortionScale {