diff --git a/SUMMARY.md b/SUMMARY.md
index aa30609b6281c..79a381d1e3ee4 100644
--- a/SUMMARY.md
+++ b/SUMMARY.md
@@ -28,6 +28,8 @@
     - [Device Interface Functions](./sw/device/lib/dif/dif_rv_plic.h)
     - [Checklist](./hw/top_earlgrey/ip_autogen/rv_plic/doc/checklist.md)
   - [Sensor Control](./hw/top_earlgrey/ip/sensor_ctrl/README.md)
+    - [Theory of Operation](./hw/top_earlgrey/ip/sensor_ctrl/doc/theory_of_operation.md)
+    - [Programmer's Guide](./hw/top_earlgrey/ip/sensor_ctrl/doc/programmers_guide.md)
     - [Interface and Registers](./hw/top_earlgrey/ip/sensor_ctrl/data/sensor_ctrl.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_sensor_ctrl.h)
     - [Checklist](./hw/top_earlgrey/ip/sensor_ctrl/doc/checklist.md)
@@ -59,127 +61,169 @@
 
 - [Hardware IP Blocks](./hw/ip/README.md)
   - [Analog to Digital Converter Control](./hw/ip/adc_ctrl/README.md)
+    - [Theory of Operation](./hw/ip/adc_ctrl/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/adc_ctrl/dv/README.md)
       - [Testplan](./hw/ip/adc_ctrl/data/adc_ctrl_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/adc_ctrl/doc/programmers_guide.md)
     - [Checklist](./hw/ip/adc_ctrl/doc/checklist.md)
     - [Interface and Registers](./hw/ip/adc_ctrl/data/adc_ctrl.hjson)
   - [AES](./hw/ip/aes/README.md)
+    - [Theory of Operation](./hw/ip/aes/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/aes/dv/README.md)
       - [Testplan](./hw/ip/aes/data/aes_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/aes/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/aes/data/aes.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_aes.h)
     - [Checklist](./hw/ip/aes/doc/checklist.md)
   - [AON Timer](./hw/ip/aon_timer/README.md)
+    - [Theory of Operation](./hw/ip/aon_timer/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/aon_timer/dv/README.md)
       - [Testplan](./hw/ip/aon_timer/data/aon_timer_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/aon_timer/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/aon_timer/data/aon_timer.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_aon_timer.h)
     - [Checklist](./hw/ip/aon_timer/doc/checklist.md)
   - [Clock Manager](./hw/ip/clkmgr/README.md)
+    - [Theory of Operation](./hw/ip/clkmgr/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/clkmgr/dv/README.md)
       - [Testplan](./hw/ip/clkmgr/data/clkmgr_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/clkmgr/doc/programmers_guide.md)
     - [Interface and Registers](./hw/top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_clkmgr.h)
     - [Checklist](./hw/ip/clkmgr/doc/checklist.md)
   - [CSRNG](./hw/ip/csrng/README.md)
+    - [Theory of Operation](./hw/ip/csrng/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/csrng/dv/README.md)
       - [Testplan](./hw/ip/csrng/data/csrng_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/csrng/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/csrng/data/csrng.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_csrng.h)
     - [Checklist](./hw/ip/csrng/doc/checklist.md)
   - [EDN](./hw/ip/edn/README.md)
+    - [Theory of Operation](./hw/ip/edn/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/edn/dv/README.md)
       - [Testplan](./hw/ip/edn/data/edn_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/edn/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/edn/data/edn.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_edn.h)
     - [Checklist](./hw/ip/edn/doc/checklist.md)
   - [Entropy Source](./hw/ip/entropy_src/README.md)
+    - [Theory of Operation](./hw/ip/entropy_src/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/entropy_src/dv/README.md)
       - [Testplan](./hw/ip/entropy_src/data/entropy_src_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/entropy_src/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/entropy_src/data/entropy_src.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_entropy_src.h)
     - [Checklist](./hw/ip/entropy_src/doc/checklist.md)
   - [Flash Controller](./hw/ip/flash_ctrl/README.md)
+    - [Theory of Operation](./hw/ip/flash_ctrl/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/flash_ctrl/dv/README.md)
       - [Testplan](./hw/ip/flash_ctrl/data/flash_ctrl_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/flash_ctrl/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/flash_ctrl/data/flash_ctrl.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_flash_ctrl.h)
     - [Checklist](./hw/ip/flash_ctrl/doc/checklist.md)
   - [GPIO](./hw/ip/gpio/README.md)
+    - [Theory of Operation](./hw/ip/gpio/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/gpio/dv/README.md)
       - [Testplan](./hw/ip/gpio/data/gpio_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/gpio/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/gpio/data/gpio.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_gpio.h)
     - [Checklist](./hw/ip/gpio/doc/checklist.md)
   - [HMAC](./hw/ip/hmac/README.md)
+    - [Theory of Operation](./hw/ip/hmac/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/hmac/dv/README.md)
       - [Testplan](./hw/ip/hmac/data/hmac_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/hmac/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/hmac/data/hmac.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_hmac.h)
     - [Checklist](./hw/ip/hmac/doc/checklist.md)
   - [I2C](./hw/ip/i2c/README.md)
+    - [Theory of Operation](./hw/ip/i2c/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/i2c/dv/README.md)
       - [Testplan](./hw/ip/i2c/data/i2c_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/i2c/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/i2c/data/i2c.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_i2c.h)
     - [Checklist](./hw/ip/i2c/doc/checklist.md)
   - [Key Manager](./hw/ip/keymgr/README.md)
+    - [Theory of Operation](./hw/ip/keymgr/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/keymgr/dv/README.md)
       - [Testplan](./hw/ip/keymgr/data/keymgr_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/keymgr/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/keymgr/data/keymgr.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_keymgr.h)
     - [Checklist](./hw/ip/keymgr/doc/checklist.md)
   - [KMAC](./hw/ip/kmac/README.md)
+    - [Theory of Operation](./hw/ip/kmac/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/kmac/dv/README.md)
       - [Testplan](./hw/ip/kmac/data/kmac_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/kmac/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/kmac/data/kmac.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_kmac.h)
     - [Checklist](./hw/ip/kmac/doc/checklist.md)
   - [Life Cycle Controller](./hw/ip/lc_ctrl/README.md)
+    - [Theory of Operation](./hw/ip/lc_ctrl/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/lc_ctrl/dv/README.md)
       - [Testplan](./hw/ip/lc_ctrl/data/lc_ctrl_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/lc_ctrl/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/lc_ctrl/data/lc_ctrl.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_lc_ctrl.h)
     - [Checklist](./hw/ip/lc_ctrl/doc/checklist.md)
   - [OTP Controller](./hw/ip/otp_ctrl/README.md)
+    - [Theory of Operation](./hw/ip/otp_ctrl/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/otp_ctrl/dv/README.md)
       - [Testplan](./hw/ip/otp_ctrl/data/otp_ctrl_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/otp_ctrl/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/otp_ctrl/data/otp_ctrl.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_otp_ctrl.h)
     - [Checklist](./hw/ip/otp_ctrl/doc/checklist.md)
   - [Pattern Generator](./hw/ip/pattgen/README.md)
+    - [Theory of Operation](./hw/ip/pattgen/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/pattgen/dv/README.md)
       - [Testplan](./hw/ip/pattgen/data/pattgen_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/pattgen/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/pattgen/data/pattgen.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_pattgen.h)
     - [Checklist](./hw/ip/pattgen/doc/checklist.md)
   - [Pinmux](./hw/ip/pinmux/README.md)
+    - [Theory of Operation](./hw/ip/pinmux/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/pinmux/doc/dv/README.md)
       - [Testplan](./hw/ip/pinmux/data/pinmux_fpv_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/pinmux/doc/programmers_guide.md)
     - [Interface and Registers](./hw/top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_pinmux.h)
     - [Checklist](./hw/ip/pinmux/doc/checklist.md)
   - [Pulse Width Modulator](./hw/ip/pwm/README.md)
+    - [Theory of Operation](./hw/ip/pwm/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/pwm/dv/README.md)
       - [Testplan](./hw/ip/pwm/data/pwm_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/pwm/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/pwm/data/pwm.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_pwm.h)
     - [Checklist](./hw/ip/pwm/doc/checklist.md)
   - [Power Management](./hw/ip/pwrmgr/README.md)
+    - [Theory of Operation](./hw/ip/pwrmgr/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/pwrmgr/dv/README.md)
       - [Testplan](./hw/ip/pwrmgr/data/pwrmgr_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/pwrmgr/doc/programmers_guide.md)
     - [Interface and Registers](./hw/top_earlgrey/ip/pwrmgr/data/autogen/pwrmgr.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_pwrmgr.h)
     - [Checklist](./hw/ip/pwrmgr/doc/checklist.md)
   - [ROM Control](./hw/ip/rom_ctrl/README.md)
+    - [Theory of Operation](./hw/ip/rom_ctrl/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/rom_ctrl/dv/README.md)
       - [Testplan](./hw/ip/rom_ctrl/data/rom_ctrl_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/rom_ctrl/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/rom_ctrl/data/rom_ctrl.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_rom_ctrl.h)
     - [Checklist](./hw/ip/rom_ctrl/doc/checklist.md)
   - [Reset Manager](./hw/ip/rstmgr/README.md)
+    - [Theory of Operation](./hw/ip/rstmgr/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/rstmgr/dv/README.md)
       - [Testplan](./hw/ip/rstmgr/data/rstmgr_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/rstmgr/doc/programmers_guide.md)
     - [Interface and Registers](./hw/top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_rstmgr.h)
     - [Checklist](./hw/ip/rstmgr/doc/checklist.md)
@@ -189,20 +233,26 @@
     - [Interface and Registers](./hw/ip/rv_dm/data/rv_dm.hjson)
     - [Checklist](./hw/ip/rv_dm/doc/checklist.md)
   - [SPI Device](./hw/ip/spi_device/README.md)
+    - [Theory of Operation](./hw/ip/spi_device/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/spi_device/dv/README.md)
       - [Testplan](./hw/ip/spi_device/data/spi_device_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/spi_device/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/spi_device/data/spi_device.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_spi_device.h)
     - [Checklist](./hw/ip/spi_device/doc/checklist.md)
   - [SPI Host](./hw/ip/spi_host/README.md)
+    - [Theory of Operation](./hw/ip/spi_host/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/spi_host/dv/README.md)
       - [Testplan](./hw/ip/spi_host/data/spi_host_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/spi_host/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/spi_host/data/spi_host.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_spi_host.h)
     - [Checklist](./hw/ip/spi_host/doc/checklist.md)
   - [SRAM Controller](./hw/ip/sram_ctrl/README.md)
+    - [Theory of Operation](./hw/ip/sram_ctrl/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/sram_ctrl/dv/README.md)
       - [Testplan](./hw/ip/sram_ctrl/data/sram_ctrl_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/sram_ctrl/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/sram_ctrl/data/sram_ctrl.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_sram_ctrl.h)
     - [Checklist](./hw/ip/sram_ctrl/doc/checklist.md)
@@ -213,8 +263,10 @@
     - [Device Interface Functions](./sw/device/lib/dif/dif_sysrst_ctrl.h)
     - [Checklist](./hw/ip/sysrst_ctrl/doc/checklist.md)
   - [Timer](./hw/ip/rv_timer/README.md)
+    - [Theory of Operation](./hw/ip/rv_timer/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/rv_timer/dv/README.md)
       - [Testplan](./hw/ip/rv_timer/data/rv_timer_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/rv_timer/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/rv_timer/data/rv_timer.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_rv_timer.h)
     - [Checklist](./hw/ip/rv_timer/doc/checklist.md)
@@ -223,14 +275,18 @@
       - [Testplan](./hw/ip/tlul/data/tlul_testplan.hjson)
       - [Protocol Checker](./hw/ip/tlul/doc/TlulProtocolChecker.md)
   - [UART](./hw/ip/uart/README.md)
+    - [Theory of Operation](./hw/ip/uart/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/uart/dv/README.md)
       - [Testplan](./hw/ip/uart/data/uart_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/uart/doc/programmers_guide.md)
     - [Interface and Registers](./hw/ip/uart/data/uart.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_uart.h)
     - [Checklist](./hw/ip/uart/doc/checklist.md)
   - [USB 2.0](./hw/ip/usbdev/README.md)
+    - [Theory of Operation](./hw/ip/usbdev/doc/theory_of_operation.md)
     - [Design Verification](./hw/ip/usbdev/dv/README.md)
       - [Testplan](./hw/ip/usbdev/data/usbdev_testplan.hjson)
+    - [Programmer's Guide](./hw/ip/usbdev/doc/programmers_guide.md)
     - [Suspending and Resuming](./hw/ip/usbdev/doc/wake_resume.md)
     - [Interface and Registers](./hw/ip/usbdev/data/usbdev.hjson)
     - [Device Interface Functions](./sw/device/lib/dif/dif_usbdev.h)
diff --git a/hw/ip/adc_ctrl/README.md b/hw/ip/adc_ctrl/README.md
index d8cef20360024..6db879471ab40 100644
--- a/hw/ip/adc_ctrl/README.md
+++ b/hw/ip/adc_ctrl/README.md
@@ -28,240 +28,3 @@ The ADC controller is a simple front-end to an analog block that allows filterin
 ## Compatibility
 
 The ADC controller programming interface is not based on any existing interface.
-
-# Theory of Operation
-
-The block diagram shows a conceptual view of the ADC controller state machine and filters.
-
-## Block Diagram
-
-![ADC_CTRL Block Diagram](doc/adc_overview.svg)
-
-
-## Hardware Interface
-
-* [Interface Tables](data/adc_ctrl.hjson#interfaces)
-
-### Signals
-
-In addition to the interrupts and bus signals, the tables below lists additional IOs.
-
-Signal                  | Direction | Description
-------------------------|-----------|---------------
-`adc_o`                 | `output`  | Output controls to the actual `AST ADC` module.  Contains signals such as power down control and ADC channel select.
-`adc_i`                 | `input`   | Input data from `AST ADC` module. Contains ADC data output as well as data valid indication.
-
-
-## Design Details
-
-### Sampling state machine
-
-The state machine that takes ADC samples follows a very simple pattern:
-
-1. *Power up ADC*: The controller issues the power up command to the ADC.
-
-2. *Wait for ADC turn on*: The controller waits for the number of clock cycles programmed in [`adc_pd_ctl.pwrup_time`](data/adc_ctrl.hjson#adc_pd_ctl) which should be set to match the ADC power up delay.
-
-3. *Take sample Channel 0*: The ADC is requested to sample channel 0.
-When the ADC signals complete the value is stored in [`adc_chn_val[0].adc_chn_value`](data/adc_ctrl.hjson#adc_chn_val_0).
-Note that the time taken in this step depends on the properties of the ADC.
-
-4. *Take sample Channel 1*: The ADC is requested to sample channel 1.
-When the ADC signals complete the value is stored in [`adc_chn_val[1].adc_chn_value`](data/adc_ctrl.hjson#adc_chn_val_1).
-Note that the time taken in this step depends on the properties of the ADC.
-
-5. *Evaluate Filters*: The filters are evaluated and debounce logic applied (see [next section](#filters-and-debounce)).
-
-6. *Scan type check*: At this point if the [`adc_pd_ctl.lp_mode`](data/adc_ctrl.hjson#adc_pd_ctl) bit is clear scanning continues at step (3).
-   If the bit is set the next step depends on how many samples have hit the filters.
-   If more than [`adc_lp_sample_ctl.lp_sample_cnt`](data/adc_ctrl.hjson#adc_lp_sample_ctl) samples have hit then continuous scanning continues at step (3) else periodic scanning will continue at the next step (7).
-
-7. *Power off ADC*: The controller issues the power down command to the ADC.
-
-8. *Wait sleep time*: The controller will wait for the next sample timer to time out before restarting at step (1).
-
-In active operation the controller is in continuous scanning mode:
-* The ADC is continually powered on.
-* The sampling cycle time is the time taken for the ADC to take two samples (450us) plus internal processing time (4 clock cycles) from the ADC controller.
-* The debounce timer will trigger the [`filter_status`](data/adc_ctrl.hjson#filter_status) and interrupt after a configurable number of matching ADC samples have been seen, as determined by [`adc_sample_ctl`](data/adc_ctrl.hjson#adc_sample_ctl).
-
-For low power operation the periodic scanning mode can be used.
-In this mode samples are taken using a slower periodic sampling cycle time with the ADC powered down most of the time.
-Once a small number of cycles have hit the filter with periodic scanning then the controller switches to continuous scanning in order to more accurately debounce the signal.
-In low power mode:
-* The ADC is periodically powered up to take samples; this interval is determined by [`adc_pd_ctl.wakeup_time`](data/adc_ctrl.hjson#adc_pd_ctl).
-* Similar to normal operation, the ADC power-up delay is controlled by [`adc_pd_ctl.pwrup_time`](data/adc_ctrl.hjson#adc_pd_ctl).
-* Once the ADC is powered up, two samples are taken and compared to the filter thresholds.
-* If a configurable number of matches, as determined by [`adc_lp_sample_ctl`](data/adc_ctrl.hjson#adc_lp_sample_ctl), are seen, the ADC controller transitions to normal operation for continuous sampling.
-
-Although it can be used at any time, the periodic operation mode and use of the slow clock allows the ADC controller to continue to scan when most of the chip is in sleep or power-down modes.
-The controller can be configured to issue a wakeup to the rest of the chip.
-
-If a filter is enabled for wakeup in [`adc_wakeup_ctl`](data/adc_ctrl.hjson#adc_wakeup_ctl) and [`filter_status`](data/adc_ctrl.hjson#filter_status) indicates a match, a wakeup is generated to the system power manager.
-
-
-## Filters and debounce
-
-There are two reserved bits in ADC filter control registers for future use.
-In the current implementation, ADC has 10-bit granularity.
-Each step is 2.148mV.
-It covers 0-2.2V.
-
-The ADC controller implements eight pairs of filters that feed the debounce logic.
-Each pair has a filter for channel 0 and a filter for channel 1.
-
-A filter consists of a `max` value, a `min` value and a `cond` flag indicating if the filter is hit by a sample inside or outside the range.
-* *Inside the range*: the filter is hit if `min` &le; `value` &le; `max`.
-* *Outside the range*: inverse of inside, so the filter is hit if `value` &lt; `min` or `value` &gt; `max`.
-
-Some example filters:
-* Inside `min=7`, `max=23`: any value between and including 7 and 23 will hit.
-* Outside `min=7`, `max=23`: any value less than 7 or greater than 23 will hit.
-* Inside `min=7`, `max=7`: the value must be exactly 7 to hit (sample noise may make an exact hit unlikely).
-* Inside `min=0`, `max=7`: the value must be less than 8 to hit.
-* Outside `min=8`, `max=0xFFF`: the value must be less than 8 to hit (alternate method).
-* Inside `min=0`, `max=0xFFF`: any value will hit. This may be useful to exclude one channel from the filter.
-* Outside `min=0`, `max=0xFFF`: no value will hit. If set for either channel the filter is effectively disabled.
-
-All pairs of filters that are enabled in [`adc_chn0_filter_ctl[7:0]`](data/adc_ctrl.hjson#adc_chn0_filter_ctl_0) and [`adc_chn1_filter_ctl[7:0]`](data/adc_ctrl.hjson#adc_chn1_filter_ctl_0) are evaluated after each pair of samples has been taken.
-The filter result is passed to the periodic scan counter if enabled and not at its limit otherwise the result is passed to the debounce counter.
-The list below describes how the counters interpret the filter results:
-* If no filters are hit then the counter will reset to zero.
-* If one or more filters are hit but the set hit differs from the previous evaluation the counter resets to zero.
-* If one or more filters are hit and either none was hit in the previous evaluation or the same set was hit in the previous evaluation and the counter is not at its threshold then the counter will increment.
-* If one or more filters are hit and the same set was hit in the previous evaluation and the counter is at its threshold then the counter stays at the threshold.
-* If the counter is the periodic scan counter and it reaches its threshold, as defined by [`adc_lp_sample_ctl.lp_sample_cnt`](data/adc_ctrl.hjson#adc_lp_sample_ctl), then continuous scanning is enabled and the debounce counter will be used for future evaluations.
-* If the counter is the debounce counter and it reaches its threshold, as defined by [`adc_sample_ctl.np_sample_cnt`](data/adc_ctrl.hjson#adc_sample_ctl), then:
-  * An interrupt is raised if the threshold is met for the first time.
-  * The current sample values are latched into [`adc_chn_val[0].adc_chn_value_intr`](data/adc_ctrl.hjson#adc_chn_val_0) and  [`adc_chn_val[1].adc_chn_value_intr`](data/adc_ctrl.hjson#adc_chn_val_1).
-    *  If a series of interrupts and matches are seen, these registers only record the value of the last debounced hit.
-  * The [`adc_intr_status`](data/adc_ctrl.hjson#adc_intr_status) register is updated by setting the bits corresponding to filters that are hit (note that bits that are already set will not be cleared).
-    This will cause the block to raise an interrupt if it was not already doing so.
-  * If a filter is a hit and is also enabled in [`adc_wakeup_ctl`](data/adc_ctrl.hjson#adc_wakeup_ctl) the corresponding filter generates a wakeup.
-  * Note that the debounce counter will remain at its threshold until the set of filters are changed by software to debounce a different event or if the current match changes.
-    *  This implies that a stable matching event continuously matches until some condition in the system (changed filter settings, changed ADC output, software issued fsm reset in [`adc_fsm_rst`](data/adc_ctrl.hjson#adc_fsm_rst)) alters the result.
-
-
-Because scanning continues the [`adc_intr_status`](data/adc_ctrl.hjson#adc_intr_status) register will reflect any debounced events that are detected between the controller raising an interrupt and the status bits being cleared (by having 1 written to them).
-However, the [`adc_chn_val[0].adc_chn_value_intr`](data/adc_ctrl.hjson#adc_chn_val_0) and [`adc_chn_val[1].adc_chn_value_intr`](data/adc_ctrl.hjson#adc_chn_val_1) registers record the value at the time the interrupt was first raised and thus reflect the filter state from that point.
-
-### ADC_CTRL and ADC Interface
-
-The interface between the ADC controller and the ADC is diagrammed below.
-The interface is from the perspective of the ADC controller.
-Before operation can begin, the ADC controller first powers on the ADC by setting `adc_o.pd` to 0.
-The controller then waits for the ADC to fully power up, as determined by [`adc_pd_ctl.pwrup_time`](data/adc_ctrl.hjson#adc_pd_ctl).
-
-Once the ADC is ready to go, the controller then selects the channel it wishes to sample by setting `adc_o.channel_sel`.
-The controller holds this value until the ADC responds with `adc_i.data_valid` and `adc_i.data`.
-
-Since there is no request sample signal between the controller and the ADC, the ADC takes a new sample when `adc_o.channel_sel` is changed from 0 to a valid channel.
-To take a new sample then, the controller actively sets `adc_o.channel_sel` to 0, before setting it to another valid channel.
-
-```wavejson
-{
-  signal: [
-    {node: '.a..b........', phase:0.2},
-    {name: 'clk_aon_i',         wave: 'p.|..|.....|....|...'},
-    {name: 'adc_o.pd',          wave: '10|..|.....|....|..1'},
-    {name: 'adc_o.channel_sel', wave: '0.|.3|..04.|....|0..'},
-    {name: 'adc_i.data_valid',  wave: '0.|..|.1.0.|.1..|.0.'},
-    {name: 'adc_i.data',        wave: 'x.|..|.3.x.|.4..|.x.', data: ['ch0', 'ch1', 'ch1']},
-  ],
-  edge: [  'a<->b wakeup time',   ]
-}
-```
-
-# Programmers Guide
-
-## Initialization
-
-The controller should be initialized with the properties of the ADC and scan times.
-* The ADC power up delay must be set in [`adc_pd_ctl.pwrup_time`](data/adc_ctrl.hjson#adc_pd_ctl).
-* The time to delay between samples in a slow scan should be set in [`adc_pd_ctl.wakeup_time`](data/adc_ctrl.hjson#adc_pd_ctl).
-* The number of samples to cause transition from slow to fast scan should be set in [`adc_lp_sample_ctl`](data/adc_ctrl.hjson#adc_lp_sample_ctl).
-* The number of samples for debounce should be set in [`adc_sample_ctl`](data/adc_ctrl.hjson#adc_sample_ctl).
-* The filter registers [`adc_chnX_filter_ctlN`](data/adc_ctrl.hjson#adc_chnx_filter_ctln) should be programmed.
-* The interrupt [`adc_intr_ctl`](data/adc_ctrl.hjson#adc_intr_ctl) and wakeup [`adc_wakeup_ctl`](data/adc_ctrl.hjson#adc_wakeup_ctl) enables should be configured.
-* All ones should be written to [`adc_intr_status`](data/adc_ctrl.hjson#adc_intr_status) and  [`filter_status`](data/adc_ctrl.hjson#filter_status) to ensure there are no spurious pending triggers.
-* Optionally, the low-power mode should be set in [`adc_pd_ctl.lp_mode`](data/adc_ctrl.hjson#adc_pd_ctl) if the system is going to the low-power mode.
-* The state machine will only start running when [`adc_en_ctl`](data/adc_ctrl.hjson#adc_en_ctl) is set.
-
-## Running in normal mode
-
-If fast sampling is always required then the [`adc_pd_ctl.lp_mode`](data/adc_ctrl.hjson#adc_pd_ctl) bit should be clear.
-In this case the values in the [`adc_lp_sample_ctl`](data/adc_ctrl.hjson#adc_lp_sample_ctl) are not used.
-The ADC will always be enabled and consuming power.
-
-If power saving is required then the controller can be set to operate in low power mode by setting [`adc_pd_ctl.lp_mode`](data/adc_ctrl.hjson#adc_pd_ctl).
-The [`adc_lp_sample_ctl`](data/adc_ctrl.hjson#adc_lp_sample_ctl) must be programmed prior to setting this bit.
-
-## Running with the rest of the chip in sleep
-
-Once programmed the controller and ADC can run when the rest of the chip is in low power state and the main clocks are stopped.
-This allows the chip to be woken when appropriate values are detected on the two ADC channels.
-The fast sampling mode can be used but will usually consume more power than desired when the chip is in sleep.
-So it is expected that [`adc_lp_sample_ctl`](data/adc_ctrl.hjson#adc_lp_sample_ctl) is configured and low power mode enabled by setting [`adc_pd_ctl.lp_mode`](data/adc_ctrl.hjson#adc_pd_ctl) prior to the sleep being initiated.
-
-If the ADC wakeup is not required then the controller and ADC should both be disabled by clearing [`adc_en_ctl`](data/adc_ctrl.hjson#adc_en_ctl) prior to the sleep being initiated.
-
-## Use Case
-
-While the ADC controller is meant to be used generically, it can be configured to satisfy more complex use cases.
-As an illustrative example, the programmers guide uses the [Chrome OS Hardware Debug](https://chromium.googlesource.com/chromiumos/third_party/hdctools/+/HEAD/docs/ccd.md) as an example of how the ADC controller can be used.
-
-The debug setup referred to uses a USB-C debug accessory.
-This insertion of this debug accessory into a system, can be detected by the ADC controller.
-
-The debug accessory voltage range of interest is shown in the diagram below:
-![Debug Cable Regions](doc/debug_cable_regions.svg)
-
-The ADC can be used to detect debug cable connection / disconnection in the non-overlapping regions.
-As an example use case of the two channel filters they can be used for detection of a USB-C debug accessory.
-The ADC must meet some minimum specifications:
-* Full scale range is 0.0V to 2.2V
-* If the signal is below 0.0V the ADC value will be zero.
-* If the signal is above 2.2V the ADC value will be maximum (i.e. same as 2.2V)
-* Absolute maximum error +/- 15 mV in the 0.25 - 0.45 V range
-* Absolute maximum error +/- 30 mV in the rest of the 0.0 - 2.2 V range
-
-The following assumes:
-* The slow clock runs at 200kHz or 5 us.
-* The ADC requires 30 us to power on.
-* The ADC takes a single sample in 44 clocks (220 us)
-
-The controller should be initialized with the properties of the ADC and scan times.
-* The ADC power up delay must be set in [`adc_pd_ctl.pwrup_time`](data/adc_ctrl.hjson#adc_pd_ctl) to `6` (30 us).
-* The time to delay between samples in a slow scan should be set in [`adc_pd_ctl.wakeup_time`](data/adc_ctrl.hjson#adc_pd_ctl) to `1600` (8ms).
-* The number of samples to cause transition from slow to fast scan should be set in [`adc_lp_sample_ctl`](data/adc_ctrl.hjson#adc_lp_sample_ctl) to `4` (causing slow scan to be 4*8ms = 32ms of debounce time).
-* The number of samples for debounce should be set in [`adc_sample_ctl`](data/adc_ctrl.hjson#adc_sample_ctl) to `155` (causing the total debounce time to be 32ms (slow scan) + 220us * 2 * 155 = 100ms, at the low end of the USB-C spec window).
-
-* For the 10-bit ADC granularity, the filter registers [`adc_chnX_filter_ctlN`](data/adc_ctrl.hjson#adc_chnx_filter_ctln) should be programmed to:
-
-| Filter | Ch0 Min      | Ch0 Max      | Ch1 Min      | Ch1 Max      | Device connected            |
-|--------|--------------|--------------|--------------|--------------|-----------------------------|
-| 0 IN   |  149 (0.32V) |  279 (0.60V) |  149 (0.32V) |  279 (0.60V) | Debug Sink (local RpUSB)    |
-| 1 IN   |  391 (0.84V) |  524 (1.125V)|  391 (0.84V) |  524 (1.125V)| Debug Sink (local Rp1.5A)   |
-| 2 IN   |  712 (1.53V) |  931 (2.00V) |  712 (1.53V) |  931 (2.00V) | Debug Sink (local Rp3A)     |
-| 3 IN   |  712 (1.53V) |  847 (1.82V) |  405 (0.87V) |  503 (1.08V) | Debug Source with RpUSB     |
-| 4 IN   |  349 (0.75V) |  512 (1.12V) |  186 (0.40V) |  279 (0.60V) | Debug Source with Rp1.5A    |
-| 5 IN   |  405 (0.87V) |  503 (1.08V) |  712 (1.53V) |  841 (1.82V) | Debug Source RpUSB Flipped  |
-| 6 IN   |  186 (0.40V) |  279 (0.60V) |  349 (0.75V) |  521 (1.12V) | Debug Source Rp1.5A Flipped |
-| 7 OUT  |  116 (0.25V) |  954 (2.05V) |  116 (0.25V) |  954 (2.05V) | Disconnect                  |
-
-
-* The interrupt [`adc_intr_ctl`](data/adc_ctrl.hjson#adc_intr_ctl) and wakeup [`adc_wakeup_ctl`](data/adc_ctrl.hjson#adc_wakeup_ctl) enables should be configured.
-* All ones should be written to [`adc_intr_status`](data/adc_ctrl.hjson#adc_intr_status) and  [`filter_status`](data/adc_ctrl.hjson#filter_status) to ensure there are no spurious pending triggers.
-* The state machine will only start running when [`adc_en_ctl`](data/adc_ctrl.hjson#adc_en_ctl) is set.
-
-Note that for the debug controller (DTS in USB-C specification) as a power source the filter that is hit will indicate the orientation of the connector.
-If the debug controller is acting as a power sink then the orientation cannot be known unless the debug controller supports the optional behavior of converting one of its pulldowns to an Ra (rather than Rp) to indicate CC2 (the CC that is not used for communication).
-This would not be detected by the filters since it happens later than connection detection and debounce in the USB-C protocol state machine, but could be detected by monitoring the current ADC value.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_adc_ctrl.h)
-
-## Registers
-
-* [Register Table](data/adc_ctrl.hjson#registers)
diff --git a/hw/ip/adc_ctrl/doc/programmers_guide.md b/hw/ip/adc_ctrl/doc/programmers_guide.md
new file mode 100644
index 0000000000000..f9aefed75cf3d
--- /dev/null
+++ b/hw/ip/adc_ctrl/doc/programmers_guide.md
@@ -0,0 +1,93 @@
+# Programmer's Guide
+
+## Initialization
+
+The controller should be initialized with the properties of the ADC and scan times.
+* The ADC power up delay must be set in [`adc_pd_ctl.pwrup_time`](../data/adc_ctrl.hjson#adc_pd_ctl).
+* The time to delay between samples in a slow scan should be set in [`adc_pd_ctl.wakeup_time`](../data/adc_ctrl.hjson#adc_pd_ctl).
+* The number of samples to cause transition from slow to fast scan should be set in [`adc_lp_sample_ctl`](../data/adc_ctrl.hjson#adc_lp_sample_ctl).
+* The number of samples for debounce should be set in [`adc_sample_ctl`](../data/adc_ctrl.hjson#adc_sample_ctl).
+* The filter registers [`adc_chnX_filter_ctlN`](../data/adc_ctrl.hjson#adc_chnx_filter_ctln) should be programmed.
+* The interrupt [`adc_intr_ctl`](../data/adc_ctrl.hjson#adc_intr_ctl) and wakeup [`adc_wakeup_ctl`](../data/adc_ctrl.hjson#adc_wakeup_ctl) enables should be configured.
+* All ones should be written to [`adc_intr_status`](../data/adc_ctrl.hjson#adc_intr_status) and  [`filter_status`](../data/adc_ctrl.hjson#filter_status) to ensure there are no spurious pending triggers.
+* Optionally, the low-power mode should be set in [`adc_pd_ctl.lp_mode`](../data/adc_ctrl.hjson#adc_pd_ctl) if the system is going to the low-power mode.
+* The state machine will only start running when [`adc_en_ctl`](../data/adc_ctrl.hjson#adc_en_ctl) is set.
+
+## Running in normal mode
+
+If fast sampling is always required then the [`adc_pd_ctl.lp_mode`](../data/adc_ctrl.hjson#adc_pd_ctl) bit should be clear.
+In this case the values in the [`adc_lp_sample_ctl`](../data/adc_ctrl.hjson#adc_lp_sample_ctl) are not used.
+The ADC will always be enabled and consuming power.
+
+If power saving is required then the controller can be set to operate in low power mode by setting [`adc_pd_ctl.lp_mode`](../data/adc_ctrl.hjson#adc_pd_ctl).
+The [`adc_lp_sample_ctl`](../data/adc_ctrl.hjson#adc_lp_sample_ctl) must be programmed prior to setting this bit.
+
+## Running with the rest of the chip in sleep
+
+Once programmed the controller and ADC can run when the rest of the chip is in low power state and the main clocks are stopped.
+This allows the chip to be woken when appropriate values are detected on the two ADC channels.
+The fast sampling mode can be used but will usually consume more power than desired when the chip is in sleep.
+So it is expected that [`adc_lp_sample_ctl`](../data/adc_ctrl.hjson#adc_lp_sample_ctl) is configured and low power mode enabled by setting [`adc_pd_ctl.lp_mode`](../data/adc_ctrl.hjson#adc_pd_ctl) prior to the sleep being initiated.
+
+If the ADC wakeup is not required then the controller and ADC should both be disabled by clearing [`adc_en_ctl`](../data/adc_ctrl.hjson#adc_en_ctl) prior to the sleep being initiated.
+
+## Use Case
+
+While the ADC controller is meant to be used generically, it can be configured to satisfy more complex use cases.
+As an illustrative example, this programmer's guide uses the [Chrome OS Hardware Debug](https://chromium.googlesource.com/chromiumos/third_party/hdctools/+/HEAD/docs/ccd.md) setup to show how the ADC controller can be used.
+
+The debug setup referred to uses a USB-C debug accessory.
+The insertion of this debug accessory into a system can be detected by the ADC controller.
+
+The debug accessory voltage range of interest is shown in the diagram below:
+![Debug Cable Regions](../doc/debug_cable_regions.svg)
+
+The ADC can be used to detect debug cable connection / disconnection in the non-overlapping regions.
+As an example use case, the two channel filters can be used to detect a USB-C debug accessory.
+The ADC must meet some minimum specifications:
+* Full scale range is 0.0V to 2.2V
+* If the signal is below 0.0V the ADC value will be zero.
+* If the signal is above 2.2V the ADC value will be maximum (i.e. same as 2.2V)
+* Absolute maximum error +/- 15 mV in the 0.25 - 0.45 V range
+* Absolute maximum error +/- 30 mV in the rest of the 0.0 - 2.2 V range
+
+The following assumes:
+* The slow clock runs at 200kHz, i.e. with a period of 5 us.
+* The ADC requires 30 us to power on.
+* The ADC takes a single sample in 44 clocks (220 us)
+
+The controller should be initialized with the properties of the ADC and scan times.
+* The ADC power up delay must be set in [`adc_pd_ctl.pwrup_time`](../data/adc_ctrl.hjson#adc_pd_ctl) to `6` (30 us).
+* The time to delay between samples in a slow scan should be set in [`adc_pd_ctl.wakeup_time`](../data/adc_ctrl.hjson#adc_pd_ctl) to `1600` (8ms).
+* The number of samples to cause transition from slow to fast scan should be set in [`adc_lp_sample_ctl`](../data/adc_ctrl.hjson#adc_lp_sample_ctl) to `4` (causing slow scan to be 4*8ms = 32ms of debounce time).
+* The number of samples for debounce should be set in [`adc_sample_ctl`](../data/adc_ctrl.hjson#adc_sample_ctl) to `155` (causing the total debounce time to be 32ms (slow scan) + 220us * 2 * 155 = 100ms, at the low end of the USB-C spec window).
+
+* For the 10-bit ADC granularity, the filter registers [`adc_chnX_filter_ctlN`](../data/adc_ctrl.hjson#adc_chnx_filter_ctln) should be programmed to:
+
+| Filter | Ch0 Min      | Ch0 Max      | Ch1 Min      | Ch1 Max      | Device connected            |
+|--------|--------------|--------------|--------------|--------------|-----------------------------|
+| 0 IN   |  149 (0.32V) |  279 (0.60V) |  149 (0.32V) |  279 (0.60V) | Debug Sink (local RpUSB)    |
+| 1 IN   |  391 (0.84V) |  524 (1.125V)|  391 (0.84V) |  524 (1.125V)| Debug Sink (local Rp1.5A)   |
+| 2 IN   |  712 (1.53V) |  931 (2.00V) |  712 (1.53V) |  931 (2.00V) | Debug Sink (local Rp3A)     |
+| 3 IN   |  712 (1.53V) |  847 (1.82V) |  405 (0.87V) |  503 (1.08V) | Debug Source with RpUSB     |
+| 4 IN   |  349 (0.75V) |  521 (1.12V) |  186 (0.40V) |  279 (0.60V) | Debug Source with Rp1.5A    |
+| 5 IN   |  405 (0.87V) |  503 (1.08V) |  712 (1.53V) |  847 (1.82V) | Debug Source RpUSB Flipped  |
+| 6 IN   |  186 (0.40V) |  279 (0.60V) |  349 (0.75V) |  521 (1.12V) | Debug Source Rp1.5A Flipped |
+| 7 OUT  |  116 (0.25V) |  954 (2.05V) |  116 (0.25V) |  954 (2.05V) | Disconnect                  |
+
+
+* The interrupt [`adc_intr_ctl`](../data/adc_ctrl.hjson#adc_intr_ctl) and wakeup [`adc_wakeup_ctl`](../data/adc_ctrl.hjson#adc_wakeup_ctl) enables should be configured.
+* All ones should be written to [`adc_intr_status`](../data/adc_ctrl.hjson#adc_intr_status) and  [`filter_status`](../data/adc_ctrl.hjson#filter_status) to ensure there are no spurious pending triggers.
+* The state machine will only start running when [`adc_en_ctl`](../data/adc_ctrl.hjson#adc_en_ctl) is set.
+
+Note that for the debug controller (DTS in USB-C specification) as a power source the filter that is hit will indicate the orientation of the connector.
+If the debug controller is acting as a power sink then the orientation cannot be known unless the debug controller supports the optional behavior of converting one of its pulldowns to an Ra (rather than Rp) to indicate CC2 (the CC that is not used for communication).
+This would not be detected by the filters since it happens later than connection detection and debounce in the USB-C protocol state machine, but could be detected by monitoring the current ADC value.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_adc_ctrl.h)
+
+## Registers
+
+* [Register Table](../data/adc_ctrl.hjson#registers)
diff --git a/hw/ip/adc_ctrl/doc/theory_of_operation.md b/hw/ip/adc_ctrl/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..ae7be90bb81fb
--- /dev/null
+++ b/hw/ip/adc_ctrl/doc/theory_of_operation.md
@@ -0,0 +1,142 @@
+# Theory of Operation
+
+The block diagram shows a conceptual view of the ADC controller state machine and filters.
+
+## Block Diagram
+
+![ADC_CTRL Block Diagram](../doc/adc_overview.svg)
+
+
+## Hardware Interface
+
+* [Interface Tables](../data/adc_ctrl.hjson#interfaces)
+
+### Signals
+
+In addition to the interrupts and bus signals, the table below lists additional IOs.
+
+Signal                  | Direction | Description
+------------------------|-----------|---------------
+`adc_o`                 | `output`  | Output controls to the actual `AST ADC` module.  Contains signals such as power down control and ADC channel select.
+`adc_i`                 | `input`   | Input data from `AST ADC` module. Contains ADC data output as well as data valid indication.
+
+
+## Design Details
+
+### Sampling state machine
+
+The state machine that takes ADC samples follows a very simple pattern:
+
+1. *Power up ADC*: The controller issues the power up command to the ADC.
+
+2. *Wait for ADC turn on*: The controller waits for the number of clock cycles programmed in [`adc_pd_ctl.pwrup_time`](../data/adc_ctrl.hjson#adc_pd_ctl) which should be set to match the ADC power up delay.
+
+3. *Take sample Channel 0*: The ADC is requested to sample channel 0.
+When the ADC signals completion, the value is stored in [`adc_chn_val[0].adc_chn_value`](../data/adc_ctrl.hjson#adc_chn_val_0).
+Note that the time taken in this step depends on the properties of the ADC.
+
+4. *Take sample Channel 1*: The ADC is requested to sample channel 1.
+When the ADC signals completion, the value is stored in [`adc_chn_val[1].adc_chn_value`](../data/adc_ctrl.hjson#adc_chn_val_1).
+Note that the time taken in this step depends on the properties of the ADC.
+
+5. *Evaluate Filters*: The filters are evaluated and debounce logic applied (see [next section](#filters-and-debounce)).
+
+6. *Scan type check*: At this point if the [`adc_pd_ctl.lp_mode`](../data/adc_ctrl.hjson#adc_pd_ctl) bit is clear scanning continues at step (3).
+   If the bit is set the next step depends on how many samples have hit the filters.
+   If more than [`adc_lp_sample_ctl.lp_sample_cnt`](../data/adc_ctrl.hjson#adc_lp_sample_ctl) samples have hit then continuous scanning continues at step (3) else periodic scanning will continue at the next step (7).
+
+7. *Power off ADC*: The controller issues the power down command to the ADC.
+
+8. *Wait sleep time*: The controller will wait for the next sample timer to time out before restarting at step (1).
+
+In active operation the controller is in continuous scanning mode:
+* The ADC is continually powered on.
+* The sampling cycle time is the time taken for the ADC to take two samples (450us) plus internal processing time (4 clock cycles) from the ADC controller.
+* The debounce timer will trigger the [`filter_status`](../data/adc_ctrl.hjson#filter_status) and interrupt after a configurable number of matching ADC samples have been seen, as determined by [`adc_sample_ctl`](../data/adc_ctrl.hjson#adc_sample_ctl).
+
+For low power operation the periodic scanning mode can be used.
+In this mode samples are taken using a slower periodic sampling cycle time with the ADC powered down most of the time.
+Once a small number of cycles have hit the filter with periodic scanning then the controller switches to continuous scanning in order to more accurately debounce the signal.
+In low power mode:
+* The ADC is periodically powered up to take samples; this interval is determined by [`adc_pd_ctl.wakeup_time`](../data/adc_ctrl.hjson#adc_pd_ctl).
+* Similar to normal operation, the ADC power-up delay is controlled by [`adc_pd_ctl.pwrup_time`](../data/adc_ctrl.hjson#adc_pd_ctl).
+* Once the ADC is powered up, two samples are taken and compared to the filter thresholds.
+* If a configurable number of matches, as determined by [`adc_lp_sample_ctl`](../data/adc_ctrl.hjson#adc_lp_sample_ctl), are seen, the ADC controller transitions to normal operation for continuous sampling.
+
+Although it can be used at any time, the periodic operation mode and use of the slow clock allows the ADC controller to continue to scan when most of the chip is in sleep or power-down modes.
+The controller can be configured to issue a wakeup to the rest of the chip.
+
+If a filter is enabled for wakeup in [`adc_wakeup_ctl`](../data/adc_ctrl.hjson#adc_wakeup_ctl) and [`filter_status`](../data/adc_ctrl.hjson#filter_status) indicates a match, a wakeup is generated to the system power manager.
+
+
+## Filters and debounce
+
+There are two reserved bits in ADC filter control registers for future use.
+In the current implementation, ADC has 10-bit granularity.
+Each step is 2.148mV.
+It covers 0-2.2V.
+
+The ADC controller implements eight pairs of filters that feed the debounce logic.
+Each pair has a filter for channel 0 and a filter for channel 1.
+
+A filter consists of a `max` value, a `min` value and a `cond` flag indicating if the filter is hit by a sample inside or outside the range.
+* *Inside the range*: the filter is hit if `min` &le; `value` &le; `max`.
+* *Outside the range*: inverse of inside, so the filter is hit if `value` &lt; `min` or `value` &gt; `max`.
+
+Some example filters:
+* Inside `min=7`, `max=23`: any value between and including 7 and 23 will hit.
+* Outside `min=7`, `max=23`: any value less than 7 or greater than 23 will hit.
+* Inside `min=7`, `max=7`: the value must be exactly 7 to hit (sample noise may make an exact hit unlikely).
+* Inside `min=0`, `max=7`: the value must be less than 8 to hit.
+* Outside `min=8`, `max=0xFFF`: the value must be less than 8 to hit (alternate method).
+* Inside `min=0`, `max=0xFFF`: any value will hit. This may be useful to exclude one channel from the filter.
+* Outside `min=0`, `max=0xFFF`: no value will hit. If set for either channel the filter is effectively disabled.
+
+All pairs of filters that are enabled in [`adc_chn0_filter_ctl[7:0]`](../data/adc_ctrl.hjson#adc_chn0_filter_ctl_0) and [`adc_chn1_filter_ctl[7:0]`](../data/adc_ctrl.hjson#adc_chn1_filter_ctl_0) are evaluated after each pair of samples has been taken.
+The filter result is passed to the periodic scan counter if it is enabled and not at its limit; otherwise, the result is passed to the debounce counter.
+The list below describes how the counters interpret the filter results:
+* If no filters are hit then the counter will reset to zero.
+* If one or more filters are hit but the set hit differs from the previous evaluation the counter resets to zero.
+* If one or more filters are hit and either none was hit in the previous evaluation or the same set was hit in the previous evaluation and the counter is not at its threshold then the counter will increment.
+* If one or more filters are hit and the same set was hit in the previous evaluation and the counter is at its threshold then the counter stays at the threshold.
+* If the counter is the periodic scan counter and it reaches its threshold, as defined by [`adc_lp_sample_ctl.lp_sample_cnt`](../data/adc_ctrl.hjson#adc_lp_sample_ctl), then continuous scanning is enabled and the debounce counter will be used for future evaluations.
+* If the counter is the debounce counter and it reaches its threshold, as defined by [`adc_sample_ctl.np_sample_cnt`](../data/adc_ctrl.hjson#adc_sample_ctl), then:
+  * An interrupt is raised if the threshold is met for the first time.
+  * The current sample values are latched into [`adc_chn_val[0].adc_chn_value_intr`](../data/adc_ctrl.hjson#adc_chn_val_0) and  [`adc_chn_val[1].adc_chn_value_intr`](../data/adc_ctrl.hjson#adc_chn_val_1).
+    *  If a series of interrupts and matches are seen, these registers only record the value of the last debounced hit.
+  * The [`adc_intr_status`](../data/adc_ctrl.hjson#adc_intr_status) register is updated by setting the bits corresponding to filters that are hit (note that bits that are already set will not be cleared).
+    This will cause the block to raise an interrupt if it was not already doing so.
+  * If a filter is a hit and is also enabled in [`adc_wakeup_ctl`](../data/adc_ctrl.hjson#adc_wakeup_ctl) the corresponding filter generates a wakeup.
+  * Note that the debounce counter will remain at its threshold until the set of filters is changed by software to debounce a different event or until the current match changes.
+    *  This implies that a stable matching event continuously matches until some condition in the system (changed filter settings, changed ADC output, software issued fsm reset in [`adc_fsm_rst`](../data/adc_ctrl.hjson#adc_fsm_rst)) alters the result.
+
+
+Because scanning continues, the [`adc_intr_status`](../data/adc_ctrl.hjson#adc_intr_status) register will reflect any debounced events that are detected between the controller raising an interrupt and the status bits being cleared (by having 1 written to them).
+However, the [`adc_chn_val[0].adc_chn_value_intr`](../data/adc_ctrl.hjson#adc_chn_val_0) and [`adc_chn_val[1].adc_chn_value_intr`](../data/adc_ctrl.hjson#adc_chn_val_1) registers record the value at the time the interrupt was first raised and thus reflect the filter state from that point.
+
+### ADC_CTRL and ADC Interface
+
+The interface between the ADC controller and the ADC is diagrammed below.
+The interface is from the perspective of the ADC controller.
+Before operation can begin, the ADC controller first powers on the ADC by setting `adc_o.pd` to 0.
+The controller then waits for the ADC to fully power up, as determined by [`adc_pd_ctl.pwrup_time`](../data/adc_ctrl.hjson#adc_pd_ctl).
+
+Once the ADC is ready to go, the controller then selects the channel it wishes to sample by setting `adc_o.channel_sel`.
+The controller holds this value until the ADC responds with `adc_i.data_valid` and `adc_i.data`.
+
+Since there is no request sample signal between the controller and the ADC, the ADC takes a new sample when `adc_o.channel_sel` is changed from 0 to a valid channel.
+To take a new sample then, the controller actively sets `adc_o.channel_sel` to 0, before setting it to another valid channel.
+
+```wavejson
+{
+  signal: [
+    {node: '.a..b........', phase:0.2},
+    {name: 'clk_aon_i',         wave: 'p.|..|.....|....|...'},
+    {name: 'adc_o.pd',          wave: '10|..|.....|....|..1'},
+    {name: 'adc_o.channel_sel', wave: '0.|.3|..04.|....|0..'},
+    {name: 'adc_i.data_valid',  wave: '0.|..|.1.0.|.1..|.0.'},
+    {name: 'adc_i.data',        wave: 'x.|..|.3.x.|.4..|.x.', data: ['ch0', 'ch1', 'ch1']},
+  ],
+  edge: [  'a<->b power-up time',   ]
+}
+```
diff --git a/hw/ip/aes/README.md b/hw/ip/aes/README.md
index 56129cf58c97e..12d5e93f7a0e1 100644
--- a/hw/ip/aes/README.md
+++ b/hw/ip/aes/README.md
@@ -46,596 +46,3 @@ The AES unit is attached to the chip interconnect bus as a peripheral module.
 Communication with the processor happens through a set of control and status registers (CSRs).
 This includes input/output data and key, as well as status and control information.
 Future versions of the AES unit might include a separate interface through which a possible system key manager can provide the key without exposing it to the processor or other hosts attached to the system bus interconnect.
-
-
-# Theory of Operations
-
-The AES unit supports both encryption and decryption for AES-128/192/256 in ECB, CBC, CFB, OFB and CTR modes using a single, shared data path.
-That is, it can either do encryption or decryption but not both at the same time.
-
-The AES unit features a key expanding mechanism to generate the required round keys on-the-fly from a single initial key provided through the register interface.
-This means the processor needs to provide just the initial encryption key to the AES unit via register interface.
-The AES unit then uses this key to generate all round keys as they are needed in parallel to the actual encryption/decryption.
-The benefits of this design compared to passing all round keys via register interface include:
-
-- Reduced storage requirements and smaller circuit area: Instead of storing 15 128-bit round keys, only 3 256-bit key registers are required for AES-256:
-  - one set of registers to which the processor writes the initial key, i.e., the start key for encryption,
-  - one set of registers to hold the current full key, and
-  - one set of registers to hold the full key of the last encryption round, i.e., the start key for decryption.
-- Faster re-configuration and key switching: The core just needs to perform 8 write operations instead of 60 write operations for AES-256.
-
-On-the-fly round-key generation comes however at the price of an initial delay whenever the key is changed by the processor before the AES unit can perform ECB/CBC **decryption** using this new key.
-During this phase, the key expanding mechanism iteratively computes the start key for the decryption.
-The duration of this delay phase corresponds to the latency required for encrypting one 16B block (i.e., 12/14/16 cycles for AES-128/192/256).
-Once the start key for decryption has been computed, it is stored in a dedicated internal register for later use.
-The AES unit can then switch between decryption and encryption without additional overhead.
-
-For encryption or if the mode is set to CFB, OFB or CTR, there is no such initial delay upon changing the key.
-If the next operation after a key switch is ECB or CBC **decryption**, the AES unit automatically initiates a key expansion using the key schedule first (to generate the start key for decryption, the actual data path remains idle during that phase).
-
-The AES unit uses a status register to indicate to the processor when ready to receive the next input data block via the register interface.
-While the AES unit is performing encryption/decryption of a data block, it is safe for the processor to provide the next input data block.
-The AES unit automatically starts the encryption/decryption of the next data block once the previous encryption/decryption is finished and new input data is available.
-The order in which the input registers are written does not matter.
-Every input register must be written at least once for the AES unit to automatically start encryption/decryption.
-This is the default behavior.
-It can be disabled by setting the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) to `1`.
-In this case, the AES unit only starts the encryption/decryption once the START bit in [`TRIGGER`](data/aes.hjson#trigger) is set to `1` (automatically cleared to `0` once the next encryption/decryption is started).
-
-Similarly, the AES unit indicates via a status register when having new output data available to be read by the processor.
-Also, there is a back-pressure mechanism for the output data.
-If the AES unit wants to finish the encryption/decryption of a data block but the previous output data has not yet been read by the processor, the AES unit is stalled.
-It hangs and does not drop data.
-It only continues once the previous output data has been read and the corresponding registers can be safely overwritten.
-The order in which the output registers are read does not matter.
-Every output register must be read at least once for the AES unit to continue.
-This is the default behavior.
-It can be disabled by setting the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) to `1`.
-In this case, the AES unit never stalls and just overwrites previous output data, independent of whether it has been read or not.
-
-
-## Block Diagram
-
-This AES unit targets medium performance (\~1 cycle per round for the unmasked implementation).
-High-speed, single-cycle operation for high-bandwidth data streaming is not required.
-
-Therefore, the AES unit uses an iterative cipher core architecture with a 128-bit wide data path as shown in the figure below.
-Note that for the sake of simplicity, the figure shows the unmasked implementation.
-For details on the masked implementation of the cipher core refer to [Security Hardening below](#security-hardening)).
-Using an iterative architecture allows for a smaller circuit area at the cost of throughput.
-Employing a 128-bit wide data path allows to achieve the latency requirements of 12/14/16 clock cycles per 16B data block in AES-128/192/256 mode in the unmasked implementation, respectively.
-
-![AES unit block diagram (unmasked implementation) with shared data paths for encryption and decryption (using the Equivalent Inverse Cipher).](./doc/aes_block_diagram.svg)
-
-Inside the cipher core, both the data paths for the actual cipher (left) and the round key generation (right) are shared between encryption and decryption.
-Consequently, the blocks shown in the diagram always implement the forward and backward (inverse) version of the corresponding operation.
-For example, SubBytes implements both SubBytes and InvSubBytes.
-
-Besides the actual AES cipher core, the AES unit features a set of control and status registers (CSRs) accessible by the processor via TL-UL bus interface, and a counter module (used in CTR mode only).
-This counter module implements the Standard Incrementing Function according to [Recommendation for Block Cipher Modes of Operation (Appendix B.1)](https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf) with a fixed parameter m = 128.
-Note that for AES, parameter b = 128 and the counter increment is big-endian.
-CFB mode is supported with a fixed parameter s = 128 (CFB-128).
-Support for data segment sizes other than 128 bits would require a substantial amount of additional muxing resources and is thus not provided.
-The initialization vector (IV) register and the register to hold the previous input data are used in CBC, CFB, OFB and CTR modes only.
-
-
-## Hardware Interfaces
-
-* [Interface Tables](data/aes.hjson#interfaces)
-
-The table below lists other signals of the AES unit.
-
-Signal             | Direction        | Type                   | Description
--------------------|------------------|------------------------|---------------
-`idle_o`           | `output`         | `logic`                | Idle indication signal for clock manager.
-`lc_escalate_en_i` | `input`          | `lc_ctrl_pkg::lc_tx_t` | Life cycle escalation enable coming from [life cycle controller](../lc_ctrl/README.md). This signal moves the main controller FSM within the AES unit into the terminal error state. The AES unit needs to be reset.
-`edn_o`            | `output`         | `edn_pkg::edn_req_t`   | Entropy request to [entropy distribution network (EDN)](../edn/README.md) for reseeding internal pseudo-random number generators (PRNGs) used for register clearing and masking.
-`edn_i`            | `input`          | `edn_pkg::edn_rsp_t`   | [EDN](../edn/README.md) acknowledgment and entropy input for reseeding internal PRNGs.
-`keymgr_key_i`     | `input`          | `keymgr_pgk::hw_key_req_t` | Key sideload request coming from [key manager](../keymgr/README.md).
-
-Note that the `edn_o` and `edn_i` signals used to interface [EDN](../edn/README.md) follow a REQ/ACK protocol.
-The entropy distributed by EDN is obtained from the [cryptographically secure random number generator (CSRNG)](../csrng/README.md).
-
-## Design Details
-
-This section discusses different design details of the AES module.
-
-
-### Datapath Architecture and Operation
-
-The AES unit implements the Equivalent Inverse Cipher described in the [AES specification](https://csrc.nist.gov/csrc/media/publications/fips/197/final/documents/fips-197.pdf).
-This allows for more efficient cipher data path sharing between encryption/decryption as the operations are applied in the same order (less muxes, simpler control), but requires the round key during decryption to be transformed using an inverse MixColumns in all rounds except for the first and the last one.
-
-This architectural choice targets at efficient cipher data path sharing and low area footprint.
-Depending on the application scenario, other architectures might offer a more suitable area/performance tradeoff.
-For example if only CFB, OFB or CTR modes are ever used, the inverse cipher is not used at all.
-Moreover, if the key is changed extremely rarely (as for example in the case of bulk decryption), it may pay off to store all round keys instead of generating them on the fly.
-Future versions of the AES unit might offer compile-time parameters to selectively instantiate the forward/inverse cipher part only to allow for dedicated encryption/decryption-only units.
-
-All submodules in the data path are purely combinational.
-The only sequential logic in the cipher and round key generation are the State, Full Key and Decryption Key registers.
-
-The following description explains how the AES unit operates, i.e., how the operation of the AES cipher is mapped to the datapath architecture of the AES unit.
-Phrases in italics apply to peculiarities of different block cipher modes.
-For a general introduction into these cipher modes, refer to [Recommendation for Block Cipher Modes of Operation](https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf).
-
-1. The configuration and initial key is provided to the AES unit via a set of control and status registers (CSRs) accessible by the processor via TL-UL bus interface.
-   The processor must first provide the configuration to the [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) register.
-   Then follows the initial key.
-   Each key register must be written at least once.
-   The order in which the registers are written does not matter.
-1. _The processor provides the initialization vector (IV) or initial counter value to the four IV registers via TL-UL bus interface in CBC, CFB and OFB modes, or CTR mode, respectively.
-   Each IV register must be written at least once.
-   The order in which the registers are written does not matter.
-   Note that while operating, the AES unit automatically updates the IV registers after having consumed the current IV value.
-   Whenever a new message is started, the processor must provide the corresponding IV value via TL-UL bus interface.
-   In ECB mode, no IV needs to be provided.
-   The content of the IV registers is ignored in ECB mode._
-1. The input data is provided to the AES unit via four CSRs.
-   Each input register must be written at least once.
-   The order in which the registers are written does not matter.
-1. If new input data is available, the AES unit automatically starts encryption/decryption by performing the following actions.
-    1. The AES unit loads initial state into the State register inside the cipher core.
-
-       _Depending on the cipher mode, the initial state is a combination of input data as well as IV._
-       _Note, if CBC decryption is performed, or if running in CFB, OFB or CTR mode, the input data is also registered (Data In Prev in the block diagram)._
-    2. The initial key is loaded into the Full Key register inside the cipher core.
-
-       _Note, if the ECB/CBC decryption is performed, the Full Key register is loaded with the value stored in the Decryption Key register._
-
-    _Note, for the AES unit to automatically start in CBC, CFB, OFB or CTR mode, also the IV must be ready.
-    The IV is ready if -- since the last IV update (either done by the processor or the AES unit itself) -- all IV registers have been written at least once or none of them.
-    The AES unit will not automatically start the next encryption/decryption with a partially updated IV._
-
-    By setting the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) to `1`, the AES unit can be operated in manual mode.
-    In manual mode, the AES unit starts encryption/decryption whenever the START bit in [`TRIGGER`](data/aes.hjson#trigger) is set to `1`, irrespective of the status of the IV and input data registers.
-
-1. Once the State and Full Key registers have been loaded, the AES cipher core starts the encryption/decryption by adding the first round key to the initial state (all blocks in both data paths are bypassed).
-   The result is stored back in the State register.
-1. Then, the AES cipher core performs 9/11/13 rounds of encryption/decryption when using a 128/192/256-bit key, respectively.
-   In every round, the cipher data path performs the four following transformations.
-   For more details, refer to the [AES specification](https://csrc.nist.gov/csrc/media/publications/fips/197/final/documents/fips-197.pdf).
-    1. SubBytes Transformation: A non-linear byte substitution that operates independently on each byte of the state using a substitution table (S-Box).
-    2. ShiftRows Transformation: The bytes of the last three rows of the state are cyclically shifted over different offsets.
-    3. MixColumns Transformation: Each of the four columns of the state are considered as polynomials over GF(2^8) and individually multiplied with another fixed polynomial.
-    4. AddRoundKey Transformation: The round key is XORed with the output of the MixColumns operation and stored back into the State register.
-       The 128-bit round key itself is extracted from the current value in the Full Key register.
-
-    In parallel, the full key used for the next round is computed on the fly using the key expand module.
-
-    _If running in CTR mode, the counter module iteratively updates the IV in parallel to the cipher core performing encryption/decryption.
-    Internally, the counter module uses one 16-bit counter, meaning it requires 8 clock cycles to increment the 128-bit counter value stored in the IV register.
-    Since the counter value is used in the first round only, and since the encryption/decryption of a single block takes 12/14/16 cycles, the iterative counter implementation does not affect the throughput of the AES unit._
-1. Finally, the AES cipher core performs the final encryption/decryption round in which the MixColumns operation is skipped.
-   The output is forwarded to the output register in the CSRs but not stored back into the State register.
-   The internal State register is cleared with pseudo-random data.
-
-   _Depending on the cipher mode, the output of the final round is potentially XORed with either the value in the IV registers (CBC decryption) or the value stored in the previous input data register (CFB, OFB, CTR modes), before being forwarded to the output register in the CSRs.
-   If running in CBC mode, the IV registers are updated with the output data (encryption) or the value stored in the previous input data register (decryption).
-   If running in CFB or OFB mode, the IV registers are updated with the output data or the output of the final cipher round (before XORing with the previous input data), respectively._
-
-Having separate registers for input, output and internal state prevents the extraction of intermediate state via TL-UL bus interface and allows to overlap reconfiguration with operation.
-While the AES unit is performing encryption/decryption, the processor can safely write the next input data block into the CSRs or read the previous output data block from the CSRs.
-The State register is internal to the AES unit and not exposed via the TL-UL bus interface.
-If the AES unit wants to finish the encryption/decryption of an output data block but the previous one has not yet been read by the processor, the AES unit is stalled.
-It hangs and does not drop data.
-It only continues once the previous output data has been read and the corresponding registers can be safely overwritten.
-The order in which the output registers are read does not matter.
-Every output register must be read at least once for the AES unit to continue.
-In contrast, the initial key, and control register can only be updated if the AES unit is idle, which eases design verification (DV).
-Similarly, the initialization vector (IV) register can only be updated by the processor if the AES unit is idle.
-If the AES unit is busy and running in CBC or CTR mode, the AES unit itself updates the IV register.
-
-The cipher core architecture of the AES unit is derived from the architecture proposed by Satoh et al.: ["A compact Rijndael Hardware Architecture with S-Box Optimization"](https://link.springer.com/chapter/10.1007%2F3-540-45682-1_15).
-The expected circuit area in a 110nm CMOS technology is in the order of 12 - 22 kGE (unmasked implementation, AES-128 only).
-The expected circuit area of the entire AES unit with masking enabled is around 110 kGE.
-
-For a description of the various sub modules, see the following sections.
-
-
-### SubBytes / S-Box
-
-The SubBytes operation is a non-linear byte substitution that operates independently on each byte of the state using a substitution table (S-Box).
-It is both used for the cipher data path and the key expand data path.
-In total, the AES unit instantiates 20 S-Boxes in parallel (16 for SubBytes, 4 for KeyExpand), each having 8-bit input and output.
-In combination with the 128-bit wide data path, this allows to perform one AES round per iteration.
-
-The design of this S-Box and its inverse can have a big impact on circuit area, timing critical path, robustness and power leakage, and is itself its own research topic.
-
-The S-Boxes are decoupled from the rest of the AES unit with a handshake protocol, allowing them to be easily replaced by different implementations if required.
-The AES unit comes with the following S-Box implementations that can be selected by a compile-time Verilog parameter:
-- Domain-oriented masking (DOM) S-Box: default, see [Gross et al.: "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary Protection Order"](https://eprint.iacr.org/2016/486.pdf)
-- Masked Canright S-Box: provided for reference, usage discouraged, a version w/ and w/o mask re-use is provided, see [Canright and Batina: "A very compact "perfectly masked" S-Box for AES (corrected)"](https://eprint.iacr.org/2009/011.pdf)
-- Canright S-Box: only use when disabling masking, recommended when targeting ASIC implementation, see [Canright: "A very compact Rijndael S-Box"](https://hdl.handle.net/10945/25608)
-- LUT-based S-Box: only use when disabling masking, recommended when targeting FPGA implementation
-
-The DOM S-Box has a latency of 5 clock cycles.
-All other implementations are fully combinational (one S-Box evaluation every clock cycle).
-See also [Security Hardening below.](#1st-order-masking-of-the-cipher-core)
-
-### ShiftRows
-
-The ShiftRows operation simply performs a cyclic shift of Rows 1, 2 and 3 of the state matrix.
-Consequently, it can be implemented using 3\*4 32-bit 2-input muxes (encryption/decryption).
-
-
-### MixColumns
-
-Each of the four columns of the state are considered as polynomials over GF(2^8) and individually multiplied with another fixed polynomial.
-The whole operation can be implemented using 36 2-input XORs and 16 4-input XORs (all 8-bit), 8 2-input muxes (8-bit), as well as 78 2-input and 24 3-input XOR gates.
-
-
-### KeyExpand
-
-The key expand module (KEM) integrated in the AES unit is responsible for generating the various round keys from the initial key for both encryption and decryption.
-The KEM generates the next 128/192/256-bit full key in parallel to the actual encryption/decryption based on the current full key or the initial key (for the first encryption round).
-The actual 128-bit round key is then extracted from this full key.
-
-Generating the keys on-the-fly allows for lower storage requirements and smaller circuit area but comes at the price of an initial delay before doing ECB/CBC **decryption** whenever the key is changed.
-During this phase, the KEM cycles through all full keys to obtain the start key for decryption (equals the key for final round of encryption).
-The duration of this delay phase corresponds to the latency required for encrypting one 16B block.
-During this initial phase, the cipher data path is kept idle.
-
-The timing diagram below visualizes this process.
-
-```wavejson
-{
-  signal: [
-    {    name: 'clk',       wave: 'p........|.......'},
-    ['TL-UL IF',
-      {  name: 'write',     wave: '01...0...|.......'},
-      {  name: 'addr',      wave: 'x2345xxxx|xxxxxxx', data: 'K0 K1 K2 K3'},
-      {  name: 'wdata',     wave: 'x2345xxxx|xxxxxxx', data: 'K0 K1 K2 K3'},
-    ],
-    {},
-    ['AES Unit',
-      {  name: 'Config op', wave: 'x4...............', data: 'DECRYPT'},
-      {  name: 'AES op',    wave: '2........|.4.....', data: 'IDLE DECRYPT'},
-      {  name: 'KEM op',    wave: '2....3...|.4.....', data: 'IDLE ENCRYPT DECRYPT'},
-      {  name: 'round',     wave: 'xxxxx2.22|22.2222', data: '0 1 2 9 0 1 2 3 4'},
-      {  name: 'key_init',  wave: 'xxxx5....|.......', data: 'K0-3'},
-      {  name: 'key_full',  wave: 'xxxxx5222|4.22222', data: 'K0-3 f(K) f(K) f(K) K0-3\' f(K) f(K) f(K) f(K) f(K)'},
-      {  name: 'key_dec',   wave: 'xxxxxxxxx|4......', data: 'K0-3\''},
-    ]
-  ]
-}
-```
-
-The AES unit is configured to do decryption (`Config op` = DECRYPT).
-Once the new key has been provided via the control and status registers (top), this new key is loaded into the Full Key register (`key_full` = K0-3) and the KEM starts performing encryption (`KEM op`=ENCRYPT).
-The cipher data path remains idle (`AES op`=IDLE).
-In every round, the value in `key_full` is updated.
-After 10 encryption rounds, the value in `key_full` equals the start key for decryption.
-This value is stored into the Decryption Key register (`key_dec` = K0-3' at the very bottom).
-Now the AES unit can switch between encryption/decryption without overhead as both the start key for encryption (`key_init`) and decryption (`key_dec`) can be loaded into `full_key`.
-
-For details on the KeyExpand operation refer to the [AES specification, Section 5.2](https://csrc.nist.gov/csrc/media/publications/fips/197/final/documents/fips-197.pdf).
-
-Key expanding is the only operation in the AES unit for which the functionality depends on the selected key length.
-Having a KEM that supports 128-bit key expansion, support for the 256-bit mode can be added at low overhead.
-In contrast, the 192-bit mode requires much larger muxes.
-Support for this mode is thus optional and can be enabled/disabled via a design-time parameter.
-
-Once we have cost estimates in terms of gate count increase for 192-bit mode, we can decide on whether or not to use it in OpenTitan.
-Typically, systems requiring security above AES-128 go directly for AES-256.
-
-### System Key-Manager Interface
-
-By default, the AES unit is controlled entirely by the processor.
-The processor writes both input data as well as the initial key to dedicated registers via the system bus interconnect.
-
-Alternatively, the processor can configure the AES unit to use an initial key provided by the [key manager](../keymgr/README.md) via key sideload interface without exposing the key to the processor or other hosts attached to the system bus interconnect.
-To this end, the processor has to set the SIDELOAD bit in [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) to `1`.
-Any write operations of the processor to the Initial Key registers [`KEY_SHARE0_0`](data/aes.hjson#key_share0_0) - [`KEY_SHARE1_7`](data/aes.hjson#key_share1_7) are then ignored.
-In normal/automatic mode, the AES unit only starts encryption/decryption if the sideload key is marked as valid.
-To update the sideload key, the processor has to 1) wait for the AES unit to become idle, 2) wait for the key manager to update the sideload key and assert the valid signal, and 3) write to the [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) register to start a new message.
-After using a sideload key, the processor has to trigger the clearing of all key registers inside the AES unit (see [De-Initialization](#de-initialization) below).
-
-
-# Security Hardening
-
-The AES unit employs different means at architectural, micro-architectural and physical levels for security hardening against side-channel analysis and fault injection.
-
-## Side-Channel Analysis
-
-To aggravate side-channel analysis (SCA), the AES unit implements the following countermeasures.
-
-### 1st-order Masking of the Cipher Core
-
-The AES unit employs 1st-order masking of the AES cipher core.
-More precisely, both the cipher and the key expand data path use two shares.
-As shown in the block diagram below, the width of all registers and data paths basically doubles.
-
-![Block diagram of the masked AES cipher core.](./doc/aes_block_diagram_cipher_core_masked.svg)
-
-The initial key is provided in two shares via the register interface.
-The input data is provided in unmasked form and masked outside of the cipher core to obtain the two shares of the initial state.
-The pseudo-random data (PRD) required for masking the input data is provided by the pseudo-random number generator (PRNG) of the cipher core.
-Similarly, the two shares of the output state are combined outside the cipher core to obtain the output data.
-
-The same PRNG also generates the fresh randomness required by the masked SubBytes (16 masked S-Boxes) and the masked KeyExpand (4 masked S-Boxes).
-The masking scheme selected for the S-Box can have a high impact on SCA resistance, circuit area, number of PRD bits consumed per cycle and per S-Box evaluation, and throughput.
-The selection of the masked S-Box implementation can be controlled via compile-time Verilog parameter.
-By default, the AES unit uses domain-oriented masking (DOM) for the S-Boxes as proposed by [Gross et al.: "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary Protection Order".](https://eprint.iacr.org/2016/486.pdf)
-The provided implementation has a latency of 5 clock cycles per S-Box evaluation.
-As a result, the overall latency for processing a 16-byte data block increases from 12/14/16 to 56/66/72 clock cycles in AES-128/192/256 mode, respectively.
-The provided implementation further forwards partial, intermediate results among DOM S-Box instances for remasking purposes.
-This allows to reduce circuit area related to generating, buffering and applying PRD without impacting SCA resistance.
-Alternatively, the two original versions of the masked Canright S-Box can be chosen as proposed by [Canright and Batina: "A very compact "perfectly masked" S-Box for AES (corrected)".](https://eprint.iacr.org/2009/011.pdf)
-These are fully combinational (one S-Box evaluation every cycle) and have lower area footprint, but they are significantly less resistant to SCA.
-They are mainly included for reference but their usage is discouraged due to potential vulnerabilities to the correlation-enhanced collision attack as described by [Moradi et al.: "Correlation-Enhanced Power Analysis Collision Attack".](https://eprint.iacr.org/2010/297.pdf)
-
-The masking PRNG is reseeded with fresh entropy via [EDN](../edn/README.md) automatically 1) whenever a new key is provided (see [`CTRL_AUX_SHADOWED.KEY_TOUCH_FORCES_RESEED`](data/aes.hjson#ctrl_aux_shadowed)) and 2) based on a block counter.
-The rate at which this block counter initiates automatic reseed operations can be configured via [`CTRL_SHADOWED.PRNG_RESEED_RATE`](data/aes.hjson#ctrl_shadowed).
-In addition software can manually initiate a reseed operation via [`TRIGGER.PRNG_RESEED`](data/aes.hjson#trigger).
-
-Note that the masking can be enabled/disabled via compile-time Verilog parameter.
-It may be acceptable to disable the masking when using the AES cipher core for random number generation e.g. inside [CSRNG.](../csrng/README.md)
-When disabling the masking, also an unmasked S-Box implementation needs to be selected using the corresponding compile-time Verilog parameter.
-When disabling masking, it is recommended to use the unmasked Canright or LUT S-Box implementation for ASIC or FPGA targets, respectively.
-Both are fully combinational and allow for one S-Box evaluation every clock cycle.
-
-It's worth noting that since input/output data are provided/retrieved via register interface in unmasked form, the AES unit should not be used to form an identity ladder where the output of one AES operation is used to form the key for the next AES operation in the ladder.
-In OpenTitan, the [Keccak Message Authentication Code (KMAC) unit](../kmac/README.md) is used for that purpose.
-
-### Fully-Parallel Data Path
-
-Any 1st-order masking scheme primarily protects against 1st-order SCA.
-Vulnerabilities against higher-order SCA might still be present.
-A common technique to aggravate higher-order attacks is to increase the noise in the system e.g. by leveraging parallel architectures.
-To this end, the AES cipher core uses a 128-bit parallel data path with a total of up to 20 S-Boxes (16 inside SubBytes, 4 inside KeyExpand) that are evaluated in parallel.
-
-Besides more noise for increased resistance against higher-order SCA, the fully-parallel architecture also enables for higher performance and flexibility.
-It allows users to seamlessly switch out the S-Box implementation in order to experiment with different masking schemes.
-To interface the data paths with the S-Boxes, a handshake protocol is used.
-
-### Note on Reset vs. Non-Reset Flip-Flops
-
-The choice of flip-flop type for registering sensitive assets such as keys can have implications on the vulnerability against e.g. combined reset glitch attacks and SCA.
-Following the [OpenTitan non-reset vs. reset flops rationale](https://github.com/lowRISC/opentitan/issues/2603), the following observations can be made:
-- If masking is enabled, key and state values are stored in two shares inside the AES unit.
-  Neither the Hamming weights of the individual shares nor the summed Hamming weight are proportional to the Hamming weight of the secret asset.
-- Input/output data and IV values are (currently) not stored in multiple shares but these are less critical as they are used only once.
-  Further, they are stored in banks of 32 bits leaving a larger hypothesis space compared to when glitching e.g. an 8-bit register into reset.
-  In addition, they could potentially also be extracted when being transferred over the TL-UL bus interface.
-
-For this reason, the AES unit uses reset flops only.
-However, all major key and data registers are cleared with pseudo-random data upon reset.
-
-### Clearing Registers with Pseudo-Random Data
-
-Upon reset or if initiated by software, all major key and data registers inside the AES module are cleared with pseudo-random data (PRD).
-This helps to reduce SCA leakage when both writing these registers for reconfiguration and when clearing the registers after use.
-
-In addition, the state registers inside the cipher core are cleared with PRD during the last round of every encryption/decryption.
-This prevents Hamming distance leakage between the states of the last two rounds as well as between output and input data.
-
-## Fault Injection
-
-Fault injection (FI) attacks can be distinguished based on the FI target.
-
-### Control Path
-
-In cryptographic devices, fault attacks on the control path usually aim to disturb the control flow in a way to facilitate SCA or other attacks.
-Example targets for AES include: switch to less secure mode of operation (ECB), keep processing the same input data, reduce the number of rounds/early termination, skip particular rounds, skip individual operations in a round.
-
-To protect against FI attacks on the control path, the AES unit implements the following countermeasures.
-
-- Shadowed Control Register:
-  The main control register is implemented as a shadow register.
-  This means software has to perform two subsequent write operations to perform an update.
-  Internally, a shadow copy is used that is constantly compared with the actual register.
-  For further details, refer to the [Register Tool documentation.](../../../util/reggen/README.md#shadow-registers)
-
-- Sparse encodings of FSM states:
-  All FSMs inside the AES unit use sparse state encodings.
-
-- Sparse encodings for mux selector signals:
-  All main muxes use sparsely encoded selector signals.
-
-- Sparse encodings for handshake and other important control signals.
-
-- Multi-rail control logic:
-  All FSMs inside the AES unit are implemented using multiple independent and redundant logic rails.
-  Every rail evaluates and drives exactly one bit of sparsely encoded handshake or other important control signals.
-  The outputs of the different rails are constantly compared to detect potential faults.
-  The number of logic rails can be scaled up by means of relatively easy RTL modifications.
-  By default, three independent logic rails are used.
-
-- Hardened round counter:
-  Similar to the cipher core FSM, the internal round counter is protected against FI through a multi-rail implementation.
-  The outputs of the different rails are constantly compared to detect potential faults in the round counter.
-
-If any of these countermeasures detects a fault, a fatal alert is triggered, the internal FSMs go into a terminal error state, the AES unit does not release further data and locks up until reset.
-Since the AES unit has no ability to reset itself, a system-supplied reset is required before the AES unit can become operational again.
-Such a condition is reported in [`STATUS.ALERT_FATAL_FAULT`](data/aes.hjson#status).
-Details on where the fault has been detected are not provided.
-
-### Data Path
-
-The aim of fault attacks on the data path is typically to extract information on the key by means of statistical analysis.
-The current version of the AES unit does not employ countermeasures against such attacks, but future versions most likely will.
-
-
-# Programmers Guide
-
-This section discusses how software can interface with the AES unit.
-
-
-## Clear upon Reset
-
-Upon reset, the AES unit will first reseed the internal PRNGs for register clearing and masking via EDN, and then clear all key, IV and data registers with pseudo-random data.
-Only after this sequence has finished, the unit becomes idle (indicated in [`STATUS.IDLE`](data/aes.hjson#status)).
-The AES unit is then ready for software initialization.
-Note that at this point, the key, IV and data registers' values can no longer be expected to match the reset values.
-
-
-## Initialization
-
-Before initialization, software must ensure that the AES unit is idle by checking [`STATUS.IDLE`](data/aes.hjson#status).
-If the AES unit is not idle, write operations to [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed), the Initial Key registers [`KEY_SHARE0_0`](data/aes.hjson#key_share0_0) - [`KEY_SHARE1_7`](data/aes.hjson#key_share1_7) and initialization vector (IV) registers [`IV_0`](data/aes.hjson#iv_0) - [`IV_3`](data/aes.hjson#iv_3) are ignored.
-
-To initialize the AES unit, software must first provide the configuration to the [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) register.
-Since writing this register may initiate the reseeding of the internal PRNGs, software must check that the AES unit is idle before providing the initial key.
-Then software must write the initial key to the Initial Key registers [`KEY_SHARE0_0`](data/aes.hjson#key_share0_0) - [`KEY_SHARE1_7`](data/aes.hjson#key_share1_7).
-The key is provided in two shares:
-The first share is written to [`KEY_SHARE0_0`](data/aes.hjson#key_share0_0) - [`KEY_SHARE0_7`](data/aes.hjson#key_share0_7) and the second share is written to [`KEY_SHARE1_0`](data/aes.hjson#key_share1_0) - [`KEY_SHARE1_7`](data/aes.hjson#key_share1_7).
-The actual initial key used for encryption corresponds to the value obtained by XORing [`KEY_SHARE0_0`](data/aes.hjson#key_share0_0) - [`KEY_SHARE0_7`](data/aes.hjson#key_share0_7) with [`KEY_SHARE1_0`](data/aes.hjson#key_share1_0) - [`KEY_SHARE1_7`](data/aes.hjson#key_share1_7).
-Note that all registers are little-endian.
-The key length is configured using the KEY_LEN field of [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed).
-Independent of the selected key length, software must always write all 8 32-bit registers of both shares.
-Each register must be written at least once.
-The order in which the key registers are written does not matter.
-Anything can be written to the unused key registers of both shares, however, random data is preferred.
-For AES-128 ,the actual initial key used for encryption is formed by XORing [`KEY_SHARE0_0`](data/aes.hjson#key_share0_0) - [`KEY_SHARE0_3`](data/aes.hjson#key_share0_3) with [`KEY_SHARE1_0`](data/aes.hjson#key_share1_0) - [`KEY_SHARE1_3`](data/aes.hjson#key_share1_3).
-For AES-192, the actual initial key used for encryption is formed by XORing [`KEY_SHARE0_0`](data/aes.hjson#key_share0_0) - [`KEY_SHARE0_5`](data/aes.hjson#key_share0_5) with [`KEY_SHARE1_0`](data/aes.hjson#key_share1_0) - [`KEY_SHARE1_5`](data/aes.hjson#key_share1_5).
-
-If running in CBC, CFB, OFB or CTR mode, software must also write the IV registers [`IV_0`](data/aes.hjson#iv_0) - [`IV_3`](data/aes.hjson#iv_3).
-Since providing the initial key initiate the reseeding of the internal PRNGs, software must check that the AES unit is idle before writing the IV registers.
-These registers are little-endian, but the increment of the IV in CTR mode is big-endian (see [Recommendation for Block Cipher Modes of Operation](https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf)).
-Each IV register must be written at least once.
-The order in which these registers are written does not matter.
-Note that the AES unit automatically updates the IV registers when running in CBC, CFB, OFB or CTR mode (after having consumed the current IV value).
-To start the encryption/decryption of a new message, software must wait for the AES unit to become idle and then provide new values to the IV registers.
-
-## Block Operation
-
-For block operation, software must initialize the AES unit as described in the previous section.
-In particular, the AES unit must be configured to run in normal/automatic mode.
-This is indicated by the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) reading as `0`.
-It ensures that the AES unit:
-1. Automatically starts encryption/decryption when new input data is available.
-1. Does not overwrite previous output data that has not yet been read by the processor.
-
-Then, software must:
-1. Ensure that the INPUT_READY bit in [`STATUS`](data/aes.hjson#status) is `1`.
-1. Write Input Data Block `0` to the Input Data registers [`DATA_IN_0`](data/aes.hjson#data_in_0) - [`DATA_IN_3`](data/aes.hjson#data_in_3).
-   Each register must be written at least once.
-   The order in which these registers are written does not matter.
-1. Wait for the INPUT_READY bit in [`STATUS`](data/aes.hjson#status) to become `1`, i.e. wait for the AES unit to load Input Data Block `0` into the internal state register and start operation.
-1. Write Input Data Block `1` to the Input Data registers.
-
-Then for every Data Block `I=0,..,N-3`, software must:
-1. Wait for the OUTPUT_VALID bit in [`STATUS`](data/aes.hjson#status) to become `1`, i.e., wait for the AES unit to finish encryption/decryption of Block `I`.
-   The AES unit then directly starts processing the previously input block `I+1`
-2. Read Output Data Block `I` from the Output Data registers [`DATA_OUT_0`](data/aes.hjson#data_out_0) - [`DATA_OUT_3`](data/aes.hjson#data_out_3).
-   Each register must be read at least once.
-   The order in which these registers are read does not matter.
-3. Write Input Data Block `I+2` into the Input Data register.
-   There is no need to explicitly check INPUT_READY as in the same cycle OUTPUT_VALID becomes `1`, the current input is loaded in (meaning INPUT_READY becomes `1` one cycle later).
-
-Once all blocks have been input, the final data blocks `I=N-2,N-1` must be read out:
-1. Wait for the OUTPUT_VALID bit in [`STATUS`](data/aes.hjson#status) to become `1`, i.e., wait for the AES unit to finish encryption/decryption of Block `I`.
-2. Read Output Data Block `I` from the Output Data register.
-
-Note that interrupts are not provided, the latency of the AES unit is such that they are of little utility.
-
-The code snippet below shows how to perform block operation.
-
-```c
-  // Enable autostart, disable overwriting of previous output data. Note the control register is
-  // shadowed and thus needs to be written twice.
-  uint32_t aes_ctrl_val =
-      (op & AES_CTRL_SHADOWED_OPERATION_MASK) << AES_CTRL_SHADOWED_OPERATION_OFFSET |
-      (mode & AES_CTRL_SHADOWED_MODE_MASK) << AES_CTRL_SHADOWED_MODE_OFFSET |
-      (key_len & AES_CTRL_SHADOWED_KEY_LEN_MASK) << AES_CTRL_SHADOWED_KEY_LEN_OFFSET |
-      0x0 << AES_CTRL_SHADOWED_MANUAL_OPERATION_OFFSET;
-  REG32(AES_CTRL_SHADOWED(0)) = aes_ctrl_val;
-  REG32(AES_CTRL_SHADOWED(0)) = aes_ctrl_val;
-
-  // Write key - Note: All registers are little-endian.
-  for (int j = 0; j < 8; j++) {
-    REG32(AES_KEY_SHARE0_0(0) + j * 4) = key_share0[j];
-    REG32(AES_KEY_SHARE1_0(0) + j * 4) = key_share1[j];
-  }
-
-  // Write IV.
-  for (int j = 0; j < 4; j++) {
-    REG32(AES_IV_0(0) + j * 4) = iv[j];
-  }
-
-  // Write Input Data Block 0.
-  for (int j = 0; j < 4; j++) {
-    REG32(AES_DATA_IN_0(0) + j * 4) = input_data[j];
-  }
-
-  // Wait for INPUT_READY bit
-  while (!((REG32(AES_STATUS(0)) >> AES_STATUS_INPUT_READY) & 0x1)) {
-  }
-
-  // Write Input Data Block 1
-  for (int j = 0; j < 4; j++) {
-    REG32(AES_DATA_IN_0(0) + j * 4) = input_data[j + 4];
-  }
-
-  // For Data Block I=0,...,N-1
-  for (int i = 0; i < N; i++) {
-
-    // Wait for OUTPUT_VALID bit
-    while (!((REG32(AES_STATUS(0)) >> AES_STATUS_OUTPUT_VALID) & 0x1)) {
-    }
-
-    // Read Output Data Block I
-    for (int j = 0; j < 4; j++) {
-      output_data[j + i * 4] = REG32(AES_DATA_OUT_0(0) + j * 4);
-    }
-
-    // Write Input Data Block I+2 - For I=0,...,N-3 only.
-    if (i < N - 2) {
-      for (int j = 0; j < 4; j++) {
-        REG32(AES_DATA_IN_0(0) + j * 4) = input_data[j + 4 * (i + 2)];
-      }
-    }
-  }
-
-```
-
-
-## Padding
-
-For the AES unit to automatically start encryption/decryption of the next data block, software is required to always update all four Input Data registers [`DATA_IN_0`](data/aes.hjson#data_in_0) - [`DATA_IN_3`](data/aes.hjson#data_in_3) and read all four Output Data registers [`DATA_OUT_0`](data/aes.hjson#data_out_0) - [`DATA_OUT_3`](data/aes.hjson#data_out_3).
-This is also true if the AES unit is operated in OFB or CTR mode, i.e., if the plaintext/ciphertext not necessarily needs to be a multiple of the block size (for more details refer to Appendix A of [Recommendation for Block Cipher Modes of Operation](https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf)).
-
-In the case that the plaintext/ciphertext is not a multiple of the block size and the AES unit is operated in OFB or CTR mode, software can employ any form of padding for the input data of the last message block as the padding bits do not have an effect on the actual message bits.
-It is recommended that software discards the padding bits after reading the output data.
-
-
-## De-Initialization
-
-After finishing operation, software must:
-1. Disable the AES unit to no longer automatically start encryption/decryption by setting the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](data/aes.hjson#ctrl_shadowed) to `1`.
-1. Clear all key registers, IV registers as well as the Input Data and the Output Data registers with pseudo-random data by setting the KEY_IV_DATA_IN_CLEAR and DATA_OUT_CLEAR bits in [`TRIGGER`](data/aes.hjson#trigger) to `1`.
-
-The code snippet below shows how to perform this task.
-
-```c
-  // Disable autostart. Note the control register is shadowed and thus needs to be written twice.
-  uint32_t aes_ctrl_val = 0x1 << AES_CTRL_SHADOWED_MANUAL_OPERATION;
-  REG32(AES_CTRL_SHADOWED(0)) = aes_ctrl_val;
-  REG32(AES_CTRL_SHADOWED(0)) = aes_ctrl_val;
-
-  // Clear all key, IV, Input Data and Output Data registers.
-  REG32(AES_TRIGGER(0)) =
-      (0x1 << AES_TRIGGER_KEY_IV_DATA_IN_CLEAR) |
-      (0x1 << AES_TRIGGER_DATA_OUT_CLEAR);
-```
-
-## Device Interface Functions (DIFs)
-
-* [DIF Listings](../../../sw/device/lib/dif/dif_aes.h)
-
-## Register Table
-
-The AES unit uses 8 and 2x4 separate write-only registers for the initial key, initialization vector, and input data, as well as 4 separate read-only registers for the output data.
-All registers are little-endian.
-Compared to first-in, first-out (FIFO) interfaces, having separate registers has a couple of advantages:
-
-- Supported out-of-the-box by the register tool (the FIFO would have to be implemented separately).
-- Usability: critical corner cases where software updates input data or the key partially only are easier to avoid using separate registers and the `hwqe`-signals provided by the Register Tool.
-- Easier interaction with DMA engines
-
-Also, using a FIFO interface for something that is not actually FIFO (internally, 16B of input/output data are consumed/produced at once) is less natural.
-
-For a detailed overview of the register tool, please refer to the [Register Tool documentation.](../../../util/reggen/README.md)
-
-* [Register Table](data/aes.hjson#registers)
diff --git a/hw/ip/aes/doc/programmers_guide.md b/hw/ip/aes/doc/programmers_guide.md
new file mode 100644
index 0000000000000..c473e7b50c9bd
--- /dev/null
+++ b/hw/ip/aes/doc/programmers_guide.md
@@ -0,0 +1,182 @@
+# Programmer's Guide
+
+This section discusses how software can interface with the AES unit.
+
+
+## Clear upon Reset
+
+Upon reset, the AES unit will first reseed the internal PRNGs for register clearing and masking via EDN, and then clear all key, IV and data registers with pseudo-random data.
+Only after this sequence has finished does the unit become idle (indicated in [`STATUS.IDLE`](../data/aes.hjson#status)).
+The AES unit is then ready for software initialization.
+Note that at this point, the key, IV and data registers' values can no longer be expected to match the reset values.
+
+
+## Initialization
+
+Before initialization, software must ensure that the AES unit is idle by checking [`STATUS.IDLE`](../data/aes.hjson#status).
+If the AES unit is not idle, write operations to [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed), the Initial Key registers [`KEY_SHARE0_0`](../data/aes.hjson#key_share0_0) - [`KEY_SHARE1_7`](../data/aes.hjson#key_share1_7) and initialization vector (IV) registers [`IV_0`](../data/aes.hjson#iv_0) - [`IV_3`](../data/aes.hjson#iv_3) are ignored.
+
+To initialize the AES unit, software must first provide the configuration to the [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) register.
+Since writing this register may initiate the reseeding of the internal PRNGs, software must check that the AES unit is idle before providing the initial key.
+Then software must write the initial key to the Initial Key registers [`KEY_SHARE0_0`](../data/aes.hjson#key_share0_0) - [`KEY_SHARE1_7`](../data/aes.hjson#key_share1_7).
+The key is provided in two shares:
+The first share is written to [`KEY_SHARE0_0`](../data/aes.hjson#key_share0_0) - [`KEY_SHARE0_7`](../data/aes.hjson#key_share0_7) and the second share is written to [`KEY_SHARE1_0`](../data/aes.hjson#key_share1_0) - [`KEY_SHARE1_7`](../data/aes.hjson#key_share1_7).
+The actual initial key used for encryption corresponds to the value obtained by XORing [`KEY_SHARE0_0`](../data/aes.hjson#key_share0_0) - [`KEY_SHARE0_7`](../data/aes.hjson#key_share0_7) with [`KEY_SHARE1_0`](../data/aes.hjson#key_share1_0) - [`KEY_SHARE1_7`](../data/aes.hjson#key_share1_7).
+Note that all registers are little-endian.
+The key length is configured using the KEY_LEN field of [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed).
+Independent of the selected key length, software must always write all 8 32-bit registers of both shares.
+Each register must be written at least once.
+The order in which the key registers are written does not matter.
+Anything can be written to the unused key registers of both shares, however, random data is preferred.
+For AES-128, the actual initial key used for encryption is formed by XORing [`KEY_SHARE0_0`](../data/aes.hjson#key_share0_0) - [`KEY_SHARE0_3`](../data/aes.hjson#key_share0_3) with [`KEY_SHARE1_0`](../data/aes.hjson#key_share1_0) - [`KEY_SHARE1_3`](../data/aes.hjson#key_share1_3).
+For AES-192, the actual initial key used for encryption is formed by XORing [`KEY_SHARE0_0`](../data/aes.hjson#key_share0_0) - [`KEY_SHARE0_5`](../data/aes.hjson#key_share0_5) with [`KEY_SHARE1_0`](../data/aes.hjson#key_share1_0) - [`KEY_SHARE1_5`](../data/aes.hjson#key_share1_5).
+
+If running in CBC, CFB, OFB or CTR mode, software must also write the IV registers [`IV_0`](../data/aes.hjson#iv_0) - [`IV_3`](../data/aes.hjson#iv_3).
+Since providing the initial key initiates the reseeding of the internal PRNGs, software must check that the AES unit is idle before writing the IV registers.
+These registers are little-endian, but the increment of the IV in CTR mode is big-endian (see [Recommendation for Block Cipher Modes of Operation](https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf)).
+Each IV register must be written at least once.
+The order in which these registers are written does not matter.
+Note that the AES unit automatically updates the IV registers when running in CBC, CFB, OFB or CTR mode (after having consumed the current IV value).
+To start the encryption/decryption of a new message, software must wait for the AES unit to become idle and then provide new values to the IV registers.
+
+## Block Operation
+
+For block operation, software must initialize the AES unit as described in the previous section.
+In particular, the AES unit must be configured to run in normal/automatic mode.
+This is indicated by the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) reading as `0`.
+It ensures that the AES unit:
+1. Automatically starts encryption/decryption when new input data is available.
+1. Does not overwrite previous output data that has not yet been read by the processor.
+
+Then, software must:
+1. Ensure that the INPUT_READY bit in [`STATUS`](../data/aes.hjson#status) is `1`.
+1. Write Input Data Block `0` to the Input Data registers [`DATA_IN_0`](../data/aes.hjson#data_in_0) - [`DATA_IN_3`](../data/aes.hjson#data_in_3).
+   Each register must be written at least once.
+   The order in which these registers are written does not matter.
+1. Wait for the INPUT_READY bit in [`STATUS`](../data/aes.hjson#status) to become `1`, i.e. wait for the AES unit to load Input Data Block `0` into the internal state register and start operation.
+1. Write Input Data Block `1` to the Input Data registers.
+
+Then for every Data Block `I=0,..,N-3`, software must:
+1. Wait for the OUTPUT_VALID bit in [`STATUS`](../data/aes.hjson#status) to become `1`, i.e., wait for the AES unit to finish encryption/decryption of Block `I`.
+   The AES unit then directly starts processing the previously input block `I+1`.
+2. Read Output Data Block `I` from the Output Data registers [`DATA_OUT_0`](../data/aes.hjson#data_out_0) - [`DATA_OUT_3`](../data/aes.hjson#data_out_3).
+   Each register must be read at least once.
+   The order in which these registers are read does not matter.
+3. Write Input Data Block `I+2` into the Input Data register.
+   There is no need to explicitly check INPUT_READY as in the same cycle OUTPUT_VALID becomes `1`, the current input is loaded in (meaning INPUT_READY becomes `1` one cycle later).
+
+Once all blocks have been input, the final data blocks `I=N-2,N-1` must be read out:
+1. Wait for the OUTPUT_VALID bit in [`STATUS`](../data/aes.hjson#status) to become `1`, i.e., wait for the AES unit to finish encryption/decryption of Block `I`.
+2. Read Output Data Block `I` from the Output Data register.
+
+Note that interrupts are not provided, as the latency of the AES unit is such that they are of little utility.
+
+The code snippet below shows how to perform block operation.
+
+```c
+  // Enable autostart, disable overwriting of previous output data. Note the control register is
+  // shadowed and thus needs to be written twice.
+  uint32_t aes_ctrl_val =
+      (op & AES_CTRL_SHADOWED_OPERATION_MASK) << AES_CTRL_SHADOWED_OPERATION_OFFSET |
+      (mode & AES_CTRL_SHADOWED_MODE_MASK) << AES_CTRL_SHADOWED_MODE_OFFSET |
+      (key_len & AES_CTRL_SHADOWED_KEY_LEN_MASK) << AES_CTRL_SHADOWED_KEY_LEN_OFFSET |
+      0x0 << AES_CTRL_SHADOWED_MANUAL_OPERATION_OFFSET;
+  REG32(AES_CTRL_SHADOWED(0)) = aes_ctrl_val;
+  REG32(AES_CTRL_SHADOWED(0)) = aes_ctrl_val;
+
+  // Write key - Note: All registers are little-endian.
+  for (int j = 0; j < 8; j++) {
+    REG32(AES_KEY_SHARE0_0(0) + j * 4) = key_share0[j];
+    REG32(AES_KEY_SHARE1_0(0) + j * 4) = key_share1[j];
+  }
+
+  // Write IV.
+  for (int j = 0; j < 4; j++) {
+    REG32(AES_IV_0(0) + j * 4) = iv[j];
+  }
+
+  // Write Input Data Block 0.
+  for (int j = 0; j < 4; j++) {
+    REG32(AES_DATA_IN_0(0) + j * 4) = input_data[j];
+  }
+
+  // Wait for INPUT_READY bit
+  while (!((REG32(AES_STATUS(0)) >> AES_STATUS_INPUT_READY) & 0x1)) {
+  }
+
+  // Write Input Data Block 1
+  for (int j = 0; j < 4; j++) {
+    REG32(AES_DATA_IN_0(0) + j * 4) = input_data[j + 4];
+  }
+
+  // For Data Block I=0,...,N-1
+  for (int i = 0; i < N; i++) {
+
+    // Wait for OUTPUT_VALID bit
+    while (!((REG32(AES_STATUS(0)) >> AES_STATUS_OUTPUT_VALID) & 0x1)) {
+    }
+
+    // Read Output Data Block I
+    for (int j = 0; j < 4; j++) {
+      output_data[j + i * 4] = REG32(AES_DATA_OUT_0(0) + j * 4);
+    }
+
+    // Write Input Data Block I+2 - For I=0,...,N-3 only.
+    if (i < N - 2) {
+      for (int j = 0; j < 4; j++) {
+        REG32(AES_DATA_IN_0(0) + j * 4) = input_data[j + 4 * (i + 2)];
+      }
+    }
+  }
+
+```
+
+
+## Padding
+
+For the AES unit to automatically start encryption/decryption of the next data block, software is required to always update all four Input Data registers [`DATA_IN_0`](../data/aes.hjson#data_in_0) - [`DATA_IN_3`](../data/aes.hjson#data_in_3) and read all four Output Data registers [`DATA_OUT_0`](../data/aes.hjson#data_out_0) - [`DATA_OUT_3`](../data/aes.hjson#data_out_3).
+This is also true if the AES unit is operated in OFB or CTR mode, i.e., if the plaintext/ciphertext does not necessarily need to be a multiple of the block size (for more details refer to Appendix A of [Recommendation for Block Cipher Modes of Operation](https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf)).
+
+In the case that the plaintext/ciphertext is not a multiple of the block size and the AES unit is operated in OFB or CTR mode, software can employ any form of padding for the input data of the last message block as the padding bits do not have an effect on the actual message bits.
+It is recommended that software discards the padding bits after reading the output data.
+
+
+## De-Initialization
+
+After finishing operation, software must:
+1. Disable the AES unit to no longer automatically start encryption/decryption by setting the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) to `1`.
+1. Clear all key registers, IV registers as well as the Input Data and the Output Data registers with pseudo-random data by setting the KEY_IV_DATA_IN_CLEAR and DATA_OUT_CLEAR bits in [`TRIGGER`](../data/aes.hjson#trigger) to `1`.
+
+The code snippet below shows how to perform this task.
+
+```c
+  // Disable autostart. Note the control register is shadowed and thus needs to be written twice.
+  uint32_t aes_ctrl_val = 0x1 << AES_CTRL_SHADOWED_MANUAL_OPERATION;
+  REG32(AES_CTRL_SHADOWED(0)) = aes_ctrl_val;
+  REG32(AES_CTRL_SHADOWED(0)) = aes_ctrl_val;
+
+  // Clear all key, IV, Input Data and Output Data registers.
+  REG32(AES_TRIGGER(0)) =
+      (0x1 << AES_TRIGGER_KEY_IV_DATA_IN_CLEAR) |
+      (0x1 << AES_TRIGGER_DATA_OUT_CLEAR);
+```
+
+## Device Interface Functions (DIFs)
+
+* [DIF Listings](../../../../sw/device/lib/dif/dif_aes.h)
+
+## Register Table
+
+The AES unit uses 8 and 2x4 separate write-only registers for the initial key, initialization vector, and input data, as well as 4 separate read-only registers for the output data.
+All registers are little-endian.
+Compared to first-in, first-out (FIFO) interfaces, having separate registers has a couple of advantages:
+
+- Supported out-of-the-box by the register tool (the FIFO would have to be implemented separately).
+- Usability: critical corner cases where software updates input data or the key partially only are easier to avoid using separate registers and the `hwqe`-signals provided by the Register Tool.
+- Easier interaction with DMA engines
+
+Also, using a FIFO interface for something that is not actually FIFO (internally, 16B of input/output data are consumed/produced at once) is less natural.
+
+For a detailed overview of the register tool, please refer to the [Register Tool documentation.](../../../../util/reggen/README.md)
+
+* [Register Table](../data/aes.hjson#registers)
diff --git a/hw/ip/aes/doc/theory_of_operation.md b/hw/ip/aes/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..9626d55ebea20
--- /dev/null
+++ b/hw/ip/aes/doc/theory_of_operation.md
@@ -0,0 +1,407 @@
+# Theory of Operation
+
+The AES unit supports both encryption and decryption for AES-128/192/256 in ECB, CBC, CFB, OFB and CTR modes using a single, shared data path.
+That is, it can either do encryption or decryption but not both at the same time.
+
+The AES unit features a key expanding mechanism to generate the required round keys on-the-fly from a single initial key provided through the register interface.
+This means the processor needs to provide just the initial encryption key to the AES unit via register interface.
+The AES unit then uses this key to generate all round keys as they are needed in parallel to the actual encryption/decryption.
+The benefits of this design compared to passing all round keys via register interface include:
+
+- Reduced storage requirements and smaller circuit area: Instead of storing 15 128-bit round keys, only 3 256-bit key registers are required for AES-256:
+  - one set of registers to which the processor writes the initial key, i.e., the start key for encryption,
+  - one set of registers to hold the current full key, and
+  - one set of registers to hold the full key of the last encryption round, i.e., the start key for decryption.
+- Faster re-configuration and key switching: The core just needs to perform 8 write operations instead of 60 write operations for AES-256.
+
+On-the-fly round-key generation comes however at the price of an initial delay whenever the key is changed by the processor before the AES unit can perform ECB/CBC **decryption** using this new key.
+During this phase, the key expanding mechanism iteratively computes the start key for the decryption.
+The duration of this delay phase corresponds to the latency required for encrypting one 16B block (i.e., 12/14/16 cycles for AES-128/192/256).
+Once the start key for decryption has been computed, it is stored in a dedicated internal register for later use.
+The AES unit can then switch between decryption and encryption without additional overhead.
+
+For encryption or if the mode is set to CFB, OFB or CTR, there is no such initial delay upon changing the key.
+If the next operation after a key switch is ECB or CBC **decryption**, the AES unit automatically initiates a key expansion using the key schedule first (to generate the start key for decryption, the actual data path remains idle during that phase).
+
+The AES unit uses a status register to indicate to the processor when ready to receive the next input data block via the register interface.
+While the AES unit is performing encryption/decryption of a data block, it is safe for the processor to provide the next input data block.
+The AES unit automatically starts the encryption/decryption of the next data block once the previous encryption/decryption is finished and new input data is available.
+The order in which the input registers are written does not matter.
+Every input register must be written at least once for the AES unit to automatically start encryption/decryption.
+This is the default behavior.
+It can be disabled by setting the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) to `1`.
+In this case, the AES unit only starts the encryption/decryption once the START bit in [`TRIGGER`](../data/aes.hjson#trigger) is set to `1` (automatically cleared to `0` once the next encryption/decryption is started).
+
+Similarly, the AES unit indicates via a status register when having new output data available to be read by the processor.
+Also, there is a back-pressure mechanism for the output data.
+If the AES unit wants to finish the encryption/decryption of a data block but the previous output data has not yet been read by the processor, the AES unit is stalled.
+It hangs and does not drop data.
+It only continues once the previous output data has been read and the corresponding registers can be safely overwritten.
+The order in which the output registers are read does not matter.
+Every output register must be read at least once for the AES unit to continue.
+This is the default behavior.
+It can be disabled by setting the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) to `1`.
+In this case, the AES unit never stalls and just overwrites previous output data, independent of whether it has been read or not.
+
+
+## Block Diagram
+
+This AES unit targets medium performance (\~1 cycle per round for the unmasked implementation).
+High-speed, single-cycle operation for high-bandwidth data streaming is not required.
+
+Therefore, the AES unit uses an iterative cipher core architecture with a 128-bit wide data path as shown in the figure below.
+Note that for the sake of simplicity, the figure shows the unmasked implementation.
+For details on the masked implementation of the cipher core refer to [Security Hardening below](#security-hardening).
+Using an iterative architecture allows for a smaller circuit area at the cost of throughput.
+Employing a 128-bit wide data path allows to achieve the latency requirements of 12/14/16 clock cycles per 16B data block in AES-128/192/256 mode in the unmasked implementation, respectively.
+
+![AES unit block diagram (unmasked implementation) with shared data paths for encryption and decryption (using the Equivalent Inverse Cipher).](../doc/aes_block_diagram.svg)
+
+Inside the cipher core, both the data paths for the actual cipher (left) and the round key generation (right) are shared between encryption and decryption.
+Consequently, the blocks shown in the diagram always implement the forward and backward (inverse) version of the corresponding operation.
+For example, SubBytes implements both SubBytes and InvSubBytes.
+
+Besides the actual AES cipher core, the AES unit features a set of control and status registers (CSRs) accessible by the processor via TL-UL bus interface, and a counter module (used in CTR mode only).
+This counter module implements the Standard Incrementing Function according to [Recommendation for Block Cipher Modes of Operation (Appendix B.1)](https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf) with a fixed parameter m = 128.
+Note that for AES, parameter b = 128 and the counter increment is big-endian.
+CFB mode is supported with a fixed parameter s = 128 (CFB-128).
+Support for data segment sizes other than 128 bits would require a substantial amount of additional muxing resources and is thus not provided.
+The initialization vector (IV) register and the register to hold the previous input data are used in CBC, CFB, OFB and CTR modes only.
+
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/aes.hjson#interfaces)
+
+The table below lists other signals of the AES unit.
+
+Signal             | Direction        | Type                   | Description
+-------------------|------------------|------------------------|---------------
+`idle_o`           | `output`         | `logic`                | Idle indication signal for clock manager.
+`lc_escalate_en_i` | `input`          | `lc_ctrl_pkg::lc_tx_t` | Life cycle escalation enable coming from [life cycle controller](../../lc_ctrl/README.md). This signal moves the main controller FSM within the AES unit into the terminal error state. The AES unit needs to be reset.
+`edn_o`            | `output`         | `edn_pkg::edn_req_t`   | Entropy request to [entropy distribution network (EDN)](../../edn/README.md) for reseeding internal pseudo-random number generators (PRNGs) used for register clearing and masking.
+`edn_i`            | `input`          | `edn_pkg::edn_rsp_t`   | [EDN](../../edn/README.md) acknowledgment and entropy input for reseeding internal PRNGs.
+`keymgr_key_i`     | `input`          | `keymgr_pkg::hw_key_req_t` | Key sideload request coming from [key manager](../../keymgr/README.md).
+
+Note that the `edn_o` and `edn_i` signals used to interface [EDN](../../edn/README.md) follow a REQ/ACK protocol.
+The entropy distributed by EDN is obtained from the [cryptographically secure random number generator (CSRNG)](../../csrng/README.md).
+
+## Design Details
+
+This section discusses different design details of the AES module.
+
+
+### Datapath Architecture and Operation
+
+The AES unit implements the Equivalent Inverse Cipher described in the [AES specification](https://csrc.nist.gov/csrc/media/publications/fips/197/final/documents/fips-197.pdf).
+This allows for more efficient cipher data path sharing between encryption/decryption as the operations are applied in the same order (less muxes, simpler control), but requires the round key during decryption to be transformed using an inverse MixColumns in all rounds except for the first and the last one.
+
+This architectural choice targets efficient cipher data path sharing and low area footprint.
+Depending on the application scenario, other architectures might offer a more suitable area/performance tradeoff.
+For example if only CFB, OFB or CTR modes are ever used, the inverse cipher is not used at all.
+Moreover, if the key is changed extremely rarely (as for example in the case of bulk decryption), it may pay off to store all round keys instead of generating them on the fly.
+Future versions of the AES unit might offer compile-time parameters to selectively instantiate the forward/inverse cipher part only to allow for dedicated encryption/decryption-only units.
+
+All submodules in the data path are purely combinational.
+The only sequential logic in the cipher and round key generation are the State, Full Key and Decryption Key registers.
+
+The following description explains how the AES unit operates, i.e., how the operation of the AES cipher is mapped to the datapath architecture of the AES unit.
+Phrases in italics apply to peculiarities of different block cipher modes.
+For a general introduction into these cipher modes, refer to [Recommendation for Block Cipher Modes of Operation](https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf).
+
+1. The configuration and initial key are provided to the AES unit via a set of control and status registers (CSRs) accessible by the processor via TL-UL bus interface.
+   The processor must first provide the configuration to the [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) register.
+   Then follows the initial key.
+   Each key register must be written at least once.
+   The order in which the registers are written does not matter.
+1. _The processor provides the initialization vector (IV) or initial counter value to the four IV registers via TL-UL bus interface in CBC, CFB and OFB modes, or CTR mode, respectively.
+   Each IV register must be written at least once.
+   The order in which the registers are written does not matter.
+   Note that while operating, the AES unit automatically updates the IV registers after having consumed the current IV value.
+   Whenever a new message is started, the processor must provide the corresponding IV value via TL-UL bus interface.
+   In ECB mode, no IV needs to be provided.
+   The content of the IV registers is ignored in ECB mode._
+1. The input data is provided to the AES unit via four CSRs.
+   Each input register must be written at least once.
+   The order in which the registers are written does not matter.
+1. If new input data is available, the AES unit automatically starts encryption/decryption by performing the following actions.
+    1. The AES unit loads initial state into the State register inside the cipher core.
+
+       _Depending on the cipher mode, the initial state is a combination of input data as well as IV._
+       _Note, if CBC decryption is performed, or if running in CFB, OFB or CTR mode, the input data is also registered (Data In Prev in the block diagram)._
+    2. The initial key is loaded into the Full Key register inside the cipher core.
+
+       _Note, if the ECB/CBC decryption is performed, the Full Key register is loaded with the value stored in the Decryption Key register._
+
+    _Note, for the AES unit to automatically start in CBC, CFB, OFB or CTR mode, also the IV must be ready.
+    The IV is ready if -- since the last IV update (either done by the processor or the AES unit itself) -- all IV registers have been written at least once or none of them.
+    The AES unit will not automatically start the next encryption/decryption with a partially updated IV._
+
+    By setting the MANUAL_OPERATION bit in [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) to `1`, the AES unit can be operated in manual mode.
+    In manual mode, the AES unit starts encryption/decryption whenever the START bit in [`TRIGGER`](../data/aes.hjson#trigger) is set to `1`, irrespective of the status of the IV and input data registers.
+
+1. Once the State and Full Key registers have been loaded, the AES cipher core starts the encryption/decryption by adding the first round key to the initial state (all blocks in both data paths are bypassed).
+   The result is stored back in the State register.
+1. Then, the AES cipher core performs 9/11/13 rounds of encryption/decryption when using a 128/192/256-bit key, respectively.
+   In every round, the cipher data path performs the four following transformations.
+   For more details, refer to the [AES specification](https://csrc.nist.gov/csrc/media/publications/fips/197/final/documents/fips-197.pdf).
+    1. SubBytes Transformation: A non-linear byte substitution that operates independently on each byte of the state using a substitution table (S-Box).
+    2. ShiftRows Transformation: The bytes of the last three rows of the state are cyclically shifted over different offsets.
+    3. MixColumns Transformation: Each of the four columns of the state is considered as a polynomial over GF(2^8) and individually multiplied with another fixed polynomial.
+    4. AddRoundKey Transformation: The round key is XORed with the output of the MixColumns operation and stored back into the State register.
+       The 128-bit round key itself is extracted from the current value in the Full Key register.
+
+    In parallel, the full key used for the next round is computed on the fly using the key expand module.
+
+    _If running in CTR mode, the counter module iteratively updates the IV in parallel to the cipher core performing encryption/decryption.
+    Internally, the counter module uses one 16-bit counter, meaning it requires 8 clock cycles to increment the 128-bit counter value stored in the IV register.
+    Since the counter value is used in the first round only, and since the encryption/decryption of a single block takes 12/14/16 cycles, the iterative counter implementation does not affect the throughput of the AES unit._
+1. Finally, the AES cipher core performs the final encryption/decryption round in which the MixColumns operation is skipped.
+   The output is forwarded to the output register in the CSRs but not stored back into the State register.
+   The internal State register is cleared with pseudo-random data.
+
+   _Depending on the cipher mode, the output of the final round is potentially XORed with either the value in the IV registers (CBC decryption) or the value stored in the previous input data register (CFB, OFB, CTR modes), before being forwarded to the output register in the CSRs.
+   If running in CBC mode, the IV registers are updated with the output data (encryption) or the value stored in the previous input data register (decryption).
+   If running in CFB or OFB mode, the IV registers are updated with the output data or the output of the final cipher round (before XORing with the previous input data), respectively._
+
+Having separate registers for input, output and internal state prevents the extraction of intermediate state via TL-UL bus interface and allows to overlap reconfiguration with operation.
+While the AES unit is performing encryption/decryption, the processor can safely write the next input data block into the CSRs or read the previous output data block from the CSRs.
+The State register is internal to the AES unit and not exposed via the TL-UL bus interface.
+If the AES unit wants to finish the encryption/decryption of an output data block but the previous one has not yet been read by the processor, the AES unit is stalled.
+It hangs and does not drop data.
+It only continues once the previous output data has been read and the corresponding registers can be safely overwritten.
+The order in which the output registers are read does not matter.
+Every output register must be read at least once for the AES unit to continue.
+In contrast, the initial key, and control register can only be updated if the AES unit is idle, which eases design verification (DV).
+Similarly, the initialization vector (IV) register can only be updated by the processor if the AES unit is idle.
+If the AES unit is busy and running in CBC, CFB, OFB or CTR mode, the AES unit itself updates the IV register.
+
+The cipher core architecture of the AES unit is derived from the architecture proposed by Satoh et al.: ["A compact Rijndael Hardware Architecture with S-Box Optimization"](https://link.springer.com/chapter/10.1007%2F3-540-45682-1_15).
+The expected circuit area in a 110nm CMOS technology is in the order of 12 - 22 kGE (unmasked implementation, AES-128 only).
+The expected circuit area of the entire AES unit with masking enabled is around 110 kGE.
+
+For a description of the various sub modules, see the following sections.
+
+
+### SubBytes / S-Box
+
+The SubBytes operation is a non-linear byte substitution that operates independently on each byte of the state using a substitution table (S-Box).
+It is both used for the cipher data path and the key expand data path.
+In total, the AES unit instantiates 20 S-Boxes in parallel (16 for SubBytes, 4 for KeyExpand), each having 8-bit input and output.
+In combination with the 128-bit wide data path, this allows to perform one AES round per iteration.
+
+The design of this S-Box and its inverse can have a big impact on circuit area, timing critical path, robustness and power leakage, and is itself its own research topic.
+
+The S-Boxes are decoupled from the rest of the AES unit with a handshake protocol, allowing them to be easily replaced by different implementations if required.
+The AES unit comes with the following S-Box implementations that can be selected by a compile-time Verilog parameter:
+- Domain-oriented masking (DOM) S-Box: default, see [Gross et al.: "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary Protection Order"](https://eprint.iacr.org/2016/486.pdf)
+- Masked Canright S-Box: provided for reference, usage discouraged, a version w/ and w/o mask re-use is provided, see [Canright and Batina: "A very compact "perfectly masked" S-Box for AES (corrected)"](https://eprint.iacr.org/2009/011.pdf)
+- Canright S-Box: only use when disabling masking, recommended when targeting ASIC implementation, see [Canright: "A very compact Rijndael S-Box"](https://hdl.handle.net/10945/25608)
+- LUT-based S-Box: only use when disabling masking, recommended when targeting FPGA implementation
+
+The DOM S-Box has a latency of 5 clock cycles.
+All other implementations are fully combinational (one S-Box evaluation every clock cycle).
+See also [Security Hardening below.](#1st-order-masking-of-the-cipher-core)
+
+### ShiftRows
+
+The ShiftRows operation simply performs a cyclic shift of Rows 1, 2 and 3 of the state matrix.
+Consequently, it can be implemented using 3\*4 32-bit 2-input muxes (encryption/decryption).
+
+
+### MixColumns
+
+Each of the four columns of the state is considered as a polynomial over GF(2^8) and individually multiplied with another fixed polynomial.
+The whole operation can be implemented using 36 2-input XORs and 16 4-input XORs (all 8-bit), 8 2-input muxes (8-bit), as well as 78 2-input and 24 3-input XOR gates.
+
+
+### KeyExpand
+
+The key expand module (KEM) integrated in the AES unit is responsible for generating the various round keys from the initial key for both encryption and decryption.
+The KEM generates the next 128/192/256-bit full key in parallel to the actual encryption/decryption based on the current full key or the initial key (for the first encryption round).
+The actual 128-bit round key is then extracted from this full key.
+
+Generating the keys on-the-fly allows for lower storage requirements and smaller circuit area but comes at the price of an initial delay before doing ECB/CBC **decryption** whenever the key is changed.
+During this phase, the KEM cycles through all full keys to obtain the start key for decryption (equals the key for the final round of encryption).
+The duration of this delay phase corresponds to the latency required for encrypting one 16B block.
+During this initial phase, the cipher data path is kept idle.
+
+The timing diagram below visualizes this process.
+
+```wavejson
+{
+  signal: [
+    {    name: 'clk',       wave: 'p........|.......'},
+    ['TL-UL IF',
+      {  name: 'write',     wave: '01...0...|.......'},
+      {  name: 'addr',      wave: 'x2345xxxx|xxxxxxx', data: 'K0 K1 K2 K3'},
+      {  name: 'wdata',     wave: 'x2345xxxx|xxxxxxx', data: 'K0 K1 K2 K3'},
+    ],
+    {},
+    ['AES Unit',
+      {  name: 'Config op', wave: 'x4...............', data: 'DECRYPT'},
+      {  name: 'AES op',    wave: '2........|.4.....', data: 'IDLE DECRYPT'},
+      {  name: 'KEM op',    wave: '2....3...|.4.....', data: 'IDLE ENCRYPT DECRYPT'},
+      {  name: 'round',     wave: 'xxxxx2.22|22.2222', data: '0 1 2 9 0 1 2 3 4'},
+      {  name: 'key_init',  wave: 'xxxx5....|.......', data: 'K0-3'},
+      {  name: 'key_full',  wave: 'xxxxx5222|4.22222', data: 'K0-3 f(K) f(K) f(K) K0-3\' f(K) f(K) f(K) f(K) f(K)'},
+      {  name: 'key_dec',   wave: 'xxxxxxxxx|4......', data: 'K0-3\''},
+    ]
+  ]
+}
+```
+
+The AES unit is configured to do decryption (`Config op` = DECRYPT).
+Once the new key has been provided via the control and status registers (top), this new key is loaded into the Full Key register (`key_full` = K0-3) and the KEM starts performing encryption (`KEM op`=ENCRYPT).
+The cipher data path remains idle (`AES op`=IDLE).
+In every round, the value in `key_full` is updated.
+After 10 encryption rounds, the value in `key_full` equals the start key for decryption.
+This value is stored into the Decryption Key register (`key_dec` = K0-3' at the very bottom).
+Now the AES unit can switch between encryption/decryption without overhead as both the start key for encryption (`key_init`) and decryption (`key_dec`) can be loaded into `key_full`.
+
+For details on the KeyExpand operation refer to the [AES specification, Section 5.2](https://csrc.nist.gov/csrc/media/publications/fips/197/final/documents/fips-197.pdf).
+
+Key expanding is the only operation in the AES unit for which the functionality depends on the selected key length.
+Having a KEM that supports 128-bit key expansion, support for the 256-bit mode can be added at low overhead.
+In contrast, the 192-bit mode requires much larger muxes.
+Support for this mode is thus optional and can be enabled/disabled via a design-time parameter.
+
+Once we have cost estimates in terms of gate count increase for 192-bit mode, we can decide on whether or not to use it in OpenTitan.
+Typically, systems requiring security above AES-128 go directly for AES-256.
+
+### System Key-Manager Interface
+
+By default, the AES unit is controlled entirely by the processor.
+The processor writes both input data as well as the initial key to dedicated registers via the system bus interconnect.
+
+Alternatively, the processor can configure the AES unit to use an initial key provided by the [key manager](../../keymgr/README.md) via key sideload interface without exposing the key to the processor or other hosts attached to the system bus interconnect.
+To this end, the processor has to set the SIDELOAD bit in [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) to `1`.
+Any write operations of the processor to the Initial Key registers [`KEY_SHARE0_0`](../data/aes.hjson#key_share0_0) - [`KEY_SHARE1_7`](../data/aes.hjson#key_share1_7) are then ignored.
+In normal/automatic mode, the AES unit only starts encryption/decryption if the sideload key is marked as valid.
+To update the sideload key, the processor has to 1) wait for the AES unit to become idle, 2) wait for the key manager to update the sideload key and assert the valid signal, and 3) write to the [`CTRL_SHADOWED`](../data/aes.hjson#ctrl_shadowed) register to start a new message.
+After using a sideload key, the processor has to trigger the clearing of all key registers inside the AES unit (see [De-Initialization](./programmers_guide.md#de-initialization) in the Programmer's Guide).
+
+
+# Security Hardening
+
+The AES unit employs different means at architectural, micro-architectural and physical levels for security hardening against side-channel analysis and fault injection.
+
+## Side-Channel Analysis
+
+To impede side-channel analysis (SCA), the AES unit implements the following countermeasures.
+
+### 1st-order Masking of the Cipher Core
+
+The AES unit employs 1st-order masking of the AES cipher core.
+More precisely, both the cipher and the key expand data path use two shares.
+As shown in the block diagram below, the width of all registers and data paths basically doubles.
+
+![Block diagram of the masked AES cipher core.](../doc/aes_block_diagram_cipher_core_masked.svg)
+
+The initial key is provided in two shares via the register interface.
+The input data is provided in unmasked form and masked outside of the cipher core to obtain the two shares of the initial state.
+The pseudo-random data (PRD) required for masking the input data is provided by the pseudo-random number generator (PRNG) of the cipher core.
+Similarly, the two shares of the output state are combined outside the cipher core to obtain the output data.
+
+The same PRNG also generates the fresh randomness required by the masked SubBytes (16 masked S-Boxes) and the masked KeyExpand (4 masked S-Boxes).
+The masking scheme selected for the S-Box can have a high impact on SCA resistance, circuit area, number of PRD bits consumed per cycle and per S-Box evaluation, and throughput.
+The selection of the masked S-Box implementation can be controlled via compile-time Verilog parameter.
+By default, the AES unit uses domain-oriented masking (DOM) for the S-Boxes as proposed by [Gross et al.: "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary Protection Order".](https://eprint.iacr.org/2016/486.pdf)
+The provided implementation has a latency of 5 clock cycles per S-Box evaluation.
+As a result, the overall latency for processing a 16-byte data block increases from 12/14/16 to 56/66/72 clock cycles in AES-128/192/256 mode, respectively.
+The provided implementation further forwards partial, intermediate results among DOM S-Box instances for remasking purposes.
+This allows reducing the circuit area related to generating, buffering and applying PRD without impacting SCA resistance.
+Alternatively, the two original versions of the masked Canright S-Box can be chosen as proposed by [Canright and Batina: "A very compact "perfectly masked" S-Box for AES (corrected)".](https://eprint.iacr.org/2009/011.pdf)
+These are fully combinational (one S-Box evaluation every cycle) and have lower area footprint, but they are significantly less resistant to SCA.
+They are mainly included for reference but their usage is discouraged due to potential vulnerabilities to the correlation-enhanced collision attack as described by [Moradi et al.: "Correlation-Enhanced Power Analysis Collision Attack".](https://eprint.iacr.org/2010/297.pdf)
+
+The masking PRNG is reseeded with fresh entropy via [EDN](../../edn/README.md) automatically 1) whenever a new key is provided (see [`CTRL_AUX_SHADOWED.KEY_TOUCH_FORCES_RESEED`](../data/aes.hjson#ctrl_aux_shadowed)) and 2) based on a block counter.
+The rate at which this block counter initiates automatic reseed operations can be configured via [`CTRL_SHADOWED.PRNG_RESEED_RATE`](../data/aes.hjson#ctrl_shadowed).
+In addition, software can manually initiate a reseed operation via [`TRIGGER.PRNG_RESEED`](../data/aes.hjson#trigger).
+
+Note that the masking can be enabled/disabled via compile-time Verilog parameter.
+It may be acceptable to disable the masking when using the AES cipher core for random number generation e.g. inside [CSRNG.](../../csrng/README.md)
+When disabling the masking, an unmasked S-Box implementation also needs to be selected using the corresponding compile-time Verilog parameter.
+When disabling masking, it is recommended to use the unmasked Canright or LUT S-Box implementation for ASIC or FPGA targets, respectively.
+Both are fully combinational and allow for one S-Box evaluation every clock cycle.
+
+It's worth noting that since input/output data are provided/retrieved via register interface in unmasked form, the AES unit should not be used to form an identity ladder where the output of one AES operation is used to form the key for the next AES operation in the ladder.
+In OpenTitan, the [Keccak Message Authentication Code (KMAC) unit](../../kmac/README.md) is used for that purpose.
+
+### Fully-Parallel Data Path
+
+Any 1st-order masking scheme primarily protects against 1st-order SCA.
+Vulnerabilities against higher-order SCA might still be present.
+A common technique to make higher-order attacks more difficult is to increase the noise in the system e.g. by leveraging parallel architectures.
+To this end, the AES cipher core uses a 128-bit parallel data path with a total of up to 20 S-Boxes (16 inside SubBytes, 4 inside KeyExpand) that are evaluated in parallel.
+
+Besides more noise for increased resistance against higher-order SCA, the fully-parallel architecture also enables higher performance and flexibility.
+It allows users to seamlessly switch out the S-Box implementation in order to experiment with different masking schemes.
+To interface the data paths with the S-Boxes, a handshake protocol is used.
+
+### Note on Reset vs. Non-Reset Flip-Flops
+
+The choice of flip-flop type for registering sensitive assets such as keys can have implications on the vulnerability against e.g. combined reset glitch attacks and SCA.
+Following the [OpenTitan non-reset vs. reset flops rationale](https://github.com/lowRISC/opentitan/issues/2603), the following observations can be made:
+- If masking is enabled, key and state values are stored in two shares inside the AES unit.
+  Neither the Hamming weights of the individual shares nor the summed Hamming weight are proportional to the Hamming weight of the secret asset.
+- Input/output data and IV values are (currently) not stored in multiple shares but these are less critical as they are used only once.
+  Further, they are stored in banks of 32 bits leaving a larger hypothesis space compared to when glitching e.g. an 8-bit register into reset.
+  In addition, they could potentially also be extracted when being transferred over the TL-UL bus interface.
+
+For this reason, the AES unit uses reset flops only.
+However, all major key and data registers are cleared with pseudo-random data upon reset.
+
+### Clearing Registers with Pseudo-Random Data
+
+Upon reset or if initiated by software, all major key and data registers inside the AES module are cleared with pseudo-random data (PRD).
+This helps to reduce SCA leakage when both writing these registers for reconfiguration and when clearing the registers after use.
+
+In addition, the state registers inside the cipher core are cleared with PRD during the last round of every encryption/decryption.
+This prevents Hamming distance leakage between the states of the last two rounds as well as between output and input data.
+
+## Fault Injection
+
+Fault injection (FI) attacks can be distinguished based on the FI target.
+
+### Control Path
+
+In cryptographic devices, fault attacks on the control path usually aim to disturb the control flow in a way to facilitate SCA or other attacks.
+Example targets for AES include: switch to less secure mode of operation (ECB), keep processing the same input data, reduce the number of rounds/early termination, skip particular rounds, skip individual operations in a round.
+
+To protect against FI attacks on the control path, the AES unit implements the following countermeasures.
+
+- Shadowed Control Register:
+  The main control register is implemented as a shadow register.
+  This means software has to perform two subsequent write operations to perform an update.
+  Internally, a shadow copy is used that is constantly compared with the actual register.
+  For further details, refer to the [Register Tool documentation.](../../../../util/reggen/README.md#shadow-registers)
+
+- Sparse encodings of FSM states:
+  All FSMs inside the AES unit use sparse state encodings.
+
+- Sparse encodings for mux selector signals:
+  All main muxes use sparsely encoded selector signals.
+
+- Sparse encodings for handshake and other important control signals.
+
+- Multi-rail control logic:
+  All FSMs inside the AES unit are implemented using multiple independent and redundant logic rails.
+  Every rail evaluates and drives exactly one bit of sparsely encoded handshake or other important control signals.
+  The outputs of the different rails are constantly compared to detect potential faults.
+  The number of logic rails can be scaled up by means of relatively easy RTL modifications.
+  By default, three independent logic rails are used.
+
+- Hardened round counter:
+  Similar to the cipher core FSM, the internal round counter is protected against FI through a multi-rail implementation.
+  The outputs of the different rails are constantly compared to detect potential faults in the round counter.
+
+If any of these countermeasures detects a fault, a fatal alert is triggered, the internal FSMs go into a terminal error state, the AES unit does not release further data and locks up until reset.
+Since the AES unit has no ability to reset itself, a system-supplied reset is required before the AES unit can become operational again.
+Such a condition is reported in [`STATUS.ALERT_FATAL_FAULT`](../data/aes.hjson#status).
+Details on where the fault has been detected are not provided.
+
+### Data Path
+
+The aim of fault attacks on the data path is typically to extract information on the key by means of statistical analysis.
+The current version of the AES unit does not employ countermeasures against such attacks, but future versions most likely will.
diff --git a/hw/ip/aon_timer/README.md b/hw/ip/aon_timer/README.md
index bf19ed5c32b09..a9000822a5a29 100644
--- a/hw/ip/aon_timer/README.md
+++ b/hw/ip/aon_timer/README.md
@@ -62,60 +62,3 @@ Without this feature, the watchdog timer might wake up the core prematurely by t
 
 The "pause during escalation" feature ensures that watchdog bites and barks do not interfere with system escalation behavior.
 If during escalation software configures the system to hang instead of reset, the watchdog bite cannot supersede that decision.
-
-# Theory of Operations
-
-## Block Diagram
-
-![AON Timer Block Diagram](./doc/aon_timer_block_diagram.svg)
-
-See the block diagram for high level function and connectivity.
-The timer interacts with the CPU core and the power manager and reset manager to drive wakeup / reset events and interrupts.
-There is also an extra input to tell the counter whether to run ("counter-run").
-This is used to stop the watchdog timer running when in debugging mode or when the alert handler has put the system in a "killed" state.
-
-## Hardware Interfaces
-
-* [Interface Tables](data/aon_timer.hjson#interfaces)
-
-## Design Details
-
-The always-on timer will run on a ~200KHz clock.
-The timers themselves are 32b wide, giving a maximum timeout window of roughly ~6 hours.
-For the wakeup timer, the pre-scaler extends the maximum timeout to ~1000 days.
-
-Register reads via the TLUL interface are synchronized to the slow clock using the "async" register generation feature.
-This means that writes can complete before the data has reached its underlying register in the slow clock domain.
-If software needs to guarantee completion of a register write, it can read back the register value (which will guarantee the completion of all previous writes to the peripheral).
-
-# Programmers Guide
-
-## Initialization
-
-1. Write the timer values [`WKUP_COUNT`](data/aon_timer.hjson#wkup_count) and [`WDOG_COUNT`](data/aon_timer.hjson#wdog_count) to zero.
-2. Program the desired wakeup pre-scaler value in [`WKUP_CTRL`](data/aon_timer.hjson#wkup_ctrl).
-3. Program the desired thresholds in [`WKUP_THOLD`](data/aon_timer.hjson#wkup_thold), [`WDOG_BARK_THOLD`](data/aon_timer.hjson#wdog_bark_thold) and [`WDOG_BITE_THOLD`](data/aon_timer.hjson#wdog_bite_thold).
-4. Set the enable bit to 1 in the [`WKUP_CTRL`](data/aon_timer.hjson#wkup_ctrl) / [`WDOG_CTRL`](data/aon_timer.hjson#wdog_ctrl) registers.
-5. If desired, lock the watchdog configuration by writing 1 to the `regwen` bit in [`WDOG_REGWEN`](data/aon_timer.hjson#wdog_regwen).
-
-## Watchdog pet
-
-Pet the watchdog by writing zero to the [`WDOG_COUNT`](data/aon_timer.hjson#wdog_count) register.
-
-## Interrupt Handling
-
-If either timer reaches the programmed threshold, interrupts are generated from the AON_TIMER module.
-Disable or reinitialize the wakeup timer if required by clearing the enable bit in [`WKUP_CTRL`](data/aon_timer.hjson#wkup_ctrl) or clearing the timer value in [`WKUP_COUNT`](data/aon_timer.hjson#wkup_count).
-Clear the interrupt by writing 1 into the Interrupt Status Register [`INTR_STATE`](data/aon_timer.hjson#intr_state).
-
-If the timer has caused a wakeup event ([`WKUP_CAUSE`](data/aon_timer.hjson#wkup_cause) is set) then clear the wakeup request by writing 0 to [`WKUP_CAUSE`](data/aon_timer.hjson#wkup_cause).
-
-If [`WKUP_COUNT`](data/aon_timer.hjson#wkup_count) remains above the threshold after clearing the interrupt or wakeup event and the timer remains enabled, the interrupt and wakeup event will trigger again at the next clock tick.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_aon_timer.h)
-
-## Register Table
-
-* [Register Table](data/aon_timer.hjson#registers)
diff --git a/hw/ip/aon_timer/doc/programmers_guide.md b/hw/ip/aon_timer/doc/programmers_guide.md
new file mode 100644
index 0000000000000..1582b3dd40b3a
--- /dev/null
+++ b/hw/ip/aon_timer/doc/programmers_guide.md
@@ -0,0 +1,31 @@
+# Programmer's Guide
+
+## Initialization
+
+1. Write the timer values [`WKUP_COUNT`](../data/aon_timer.hjson#wkup_count) and [`WDOG_COUNT`](../data/aon_timer.hjson#wdog_count) to zero.
+2. Program the desired wakeup pre-scaler value in [`WKUP_CTRL`](../data/aon_timer.hjson#wkup_ctrl).
+3. Program the desired thresholds in [`WKUP_THOLD`](../data/aon_timer.hjson#wkup_thold), [`WDOG_BARK_THOLD`](../data/aon_timer.hjson#wdog_bark_thold) and [`WDOG_BITE_THOLD`](../data/aon_timer.hjson#wdog_bite_thold).
+4. Set the enable bit to 1 in the [`WKUP_CTRL`](../data/aon_timer.hjson#wkup_ctrl) / [`WDOG_CTRL`](../data/aon_timer.hjson#wdog_ctrl) registers.
+5. If desired, lock the watchdog configuration by writing 1 to the `regwen` bit in [`WDOG_REGWEN`](../data/aon_timer.hjson#wdog_regwen).
+
+## Watchdog pet
+
+Pet the watchdog by writing zero to the [`WDOG_COUNT`](../data/aon_timer.hjson#wdog_count) register.
+
+## Interrupt Handling
+
+If either timer reaches the programmed threshold, interrupts are generated from the AON_TIMER module.
+Disable or reinitialize the wakeup timer if required by clearing the enable bit in [`WKUP_CTRL`](../data/aon_timer.hjson#wkup_ctrl) or clearing the timer value in [`WKUP_COUNT`](../data/aon_timer.hjson#wkup_count).
+Clear the interrupt by writing 1 into the Interrupt Status Register [`INTR_STATE`](../data/aon_timer.hjson#intr_state).
+
+If the timer has caused a wakeup event ([`WKUP_CAUSE`](../data/aon_timer.hjson#wkup_cause) is set) then clear the wakeup request by writing 0 to [`WKUP_CAUSE`](../data/aon_timer.hjson#wkup_cause).
+
+If [`WKUP_COUNT`](../data/aon_timer.hjson#wkup_count) remains above the threshold after clearing the interrupt or wakeup event and the timer remains enabled, the interrupt and wakeup event will trigger again at the next clock tick.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_aon_timer.h)
+
+## Register Table
+
+* [Register Table](../data/aon_timer.hjson#registers)
diff --git a/hw/ip/aon_timer/doc/theory_of_operation.md b/hw/ip/aon_timer/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..76ab468884e64
--- /dev/null
+++ b/hw/ip/aon_timer/doc/theory_of_operation.md
@@ -0,0 +1,24 @@
+# Theory of Operation
+
+## Block Diagram
+
+![AON Timer Block Diagram](../doc/aon_timer_block_diagram.svg)
+
+See the block diagram for high level function and connectivity.
+The timer interacts with the CPU core and the power manager and reset manager to drive wakeup / reset events and interrupts.
+There is also an extra input to tell the counter whether to run ("counter-run").
+This is used to stop the watchdog timer running when in debugging mode or when the alert handler has put the system in a "killed" state.
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/aon_timer.hjson#interfaces)
+
+## Design Details
+
+The always-on timer will run on a ~200KHz clock.
+The timers themselves are 32b wide, giving a maximum timeout window of roughly 6 hours.
+For the wakeup timer, the pre-scaler extends the maximum timeout to ~1000 days.
+
+Register accesses via the TLUL interface are synchronized to the slow clock using the "async" register generation feature.
+This means that writes can complete before the data has reached its underlying register in the slow clock domain.
+If software needs to guarantee completion of a register write, it can read back the register value (which will guarantee the completion of all previous writes to the peripheral).
diff --git a/hw/ip/clkmgr/README.md b/hw/ip/clkmgr/README.md
index 4e7c414d1fc39..a05b7602542fc 100644
--- a/hw/ip/clkmgr/README.md
+++ b/hw/ip/clkmgr/README.md
@@ -12,315 +12,3 @@ This document specifies the functionality of the OpenTitan clock manager.
 - Minimal software clock controls to reduce risks in clock manipulation.
 - External clock switch support
 - Clock frequency /time-out measurement
-
-# Theory of Operation
-
-Clock management in OpenTitan is divided into groups.
-Each group has specific attributes and controls whether software is allowed to influence individual clocks during the active power state.
-For low power states, please see [power manager](../pwrmgr/README.md).
-
-The grouping is derived from the chip partition and related security properties.
-For illustrative purposes, this document uses the following assumed chip partition
-
-![Example chip partition](./doc/example_chip_partition.svg)
-
-The actual partition may differ per design, however the general principles are assumed to be the same.
-Each group can be made up of more than 1 source clock.
-The clocks themselves may be asynchronous - the grouping is thus a logical grouping instead of a physical one.
-
-The grouping is summarized in the table below and described in more detail afterwards.
-The table shows the group name, the modules that belong to each group, and whether SW can directly (via register control) or indirectly (via wait-for-interrupt) influence the state of the clock in the form of clock gating.
-
-| Group           | Frequencies                    | Modules                                                        | Software       | Wait for Interrupt |
-| -------------   | ------------------------------ | -------------------------------------------------------------- | -------------- | ------------------ |
-| Power-up        | 100~200KHz, 24MHz              | Clock Manager, Power Manager, Reset Manager, Pinmux            | No             | No                 |
-| Transactional   | ~100MHz                        | Aes, Kmac, Hmac, Key Manager, Otbn                             | Yes (1)        | Yes (2)            |
-| Infrastructural | 24MHz, ~100MHz                 | Fabric, Fabric gaskets (iopmp), Memories                       | No             | Yes (3)            |
-| Security        | 24MHz, ~100MHz                 | Alert handler, Entropy, Life cycle, Plic, Sensors              | No             | No                 |
-| Peripheral      | 24MHz, 48MHz, 96MHz            | I2c, Spi, Uart, Usb, others                                    | Yes            | Yes                |
-| Timers          | 100-200KHz, 24MHz              | AON timers, Timers, Watchdog                                   | No             | No                 |
-
-* 1 - Transactional clock group's software control is only a software hint.
-* 2 - Transactional clock group's wait-for-interrupt control is only a hint.
-* 3 - May require additional complexity to handle multi-host (non-wait-for-interrupt) scenarios
-
-## Power-up Clock Group
-
-The group refers to modules responsible for power up, such as power, reset and clock managers.
-Large portions of these modules operate to release clocks and resets for the rest of the design, thus cannot operate on gated versions of the clocks themselves.
-They are the only group running clocks directly from the source.
-All follow groups are derived after root clock gating.
-See [block diagram](#block-diagram) for more details.
-
-## Transactional Clock Group
-
-This group refers to the collection of modules that are transactional by nature (example: `Hmac` / `Aes` / `Kmac`).
-This means these modules perform specific tasks (for example encrypt, decrypt or hashing).
-While performing these tasks, it is unsafe to manipulate or change the clocks.
-Once these tasks are complete, the clocks can be safely shut-off.
-
-To ensure such behavior on the clocks, The final clock enable is qualified with an `Idle` indication to signify that a transaction is ongoing and manipulation of the clock is not permitted.
-The `Idle` signal must be sourced from the transactional modules and sent to the clock manager.
-
-For this group software can only express its intent to shut-off, and does not have full control over the final state.
-This intent is indicated with a register in the clock manager register file, see [`CLK_HINTS`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_hints).
-
-Even when the hint is set, the `Idle` does not directly manipulate the clock.
-When an idle indication is received, the `clkmgr` counts for a period of 10 local clocks to ensure the idle was not a glitch.
-
-Wait-for-interrupt based control is already a software hint, it can thus be applied to this group with the same `Idle` requirement.
-
-For modules in this group, each module can be individually influenced, and thus each has its own dedicated clock gating logic.
-The added benefit of allowing software control in this group is to save power, as some transactional modules can be both power and area hungry.
-
-## Infrastructure Clock Group
-
-This group refers to the collection of modules that support infrastructure functions.
-
-If the clocks to these modules are turned off, there may not be a way to turn them back on and could thus result in system deadlock.
-This includes but is not limited to:
-* Turning off fabric / gasket clocks, and thus having no way to access the fabric and resume the clock.
-* Turning off memory clocks such that there is no way to execute code that would resume the clocks.
-
-For this group, there is no reason to allow software control over the clocks, as it could be used to create a system deadlock where after disabling infrastructure clocks there is no way to turn them back on.
-Wait-for-interrupt controls however can be used, as long as there is a way to break the processor out of wait-for-interrupt and handle other bus hosts, while also separating the functional portions from bus access.
-See Wait-for-interrupt clock gating for more details.
-
-## Security Clock Group
-
-The security clock group is made up of security modules that either have background functions (entropy, alert manager, sensors) or perform critical security functions where disabling clocks could have unexpected side effects (life cycle, otp, pinmux, plic).
-
-For this group, no software influence over the clock state is allowed during the active state.
-The clocks are always running as long as the source is on.
-
-This group is not functionally identical to the power-up group.
-The power-up group is run on clocks directly from the clock source, while the security group is derived after root clock gating.
-
-## Timer Clock Group
-
-The timer clock group is composed of modules that track time for various purposes.
-As influencing time can change the perspective of software and potentially reveal security vulnerabilities, the clock state for these modules cannot be directly or indirectly influenced by software.
-
-Functionally, this group is identical to the security group.
-
-## Peripheral Clock Group
-
-The peripheral clock group is composed of I/O peripherals modules.
-By their nature, I/O peripherals are both transactional and most of the time not security critical - so long as proper care is taken to sandbox peripherals from the system.
-
-These modules can be both directly and indirectly controlled by software.
-The controls can also be individual to each peripheral.
-
-## Wait-for-Interrupt (wfi) Gating
-
-Wait-for-interrupt clock gating refers to the mechanism of using a processor’s sleep indication to actively gate off module clocks.
-Of the groups enumerated, only transactional, infrastructural and peripheral groups can be influenced by `wfi`.
-
-As `wfi` is effectively a processor clock request, there are subtleties related to its use.
-The interaction with each clock group is briefly described below.
-
-### Transactional Clock Group
-
-While `wfi` gating can be applied to this group, the modules in this category are already expected to be turned off and on by software depending on usage.
-Specifically, these modules are already completely managed by software when not in use, thus may not see significant benefit from `wfi` gating.
-
-### Peripheral Clock Group
-
-Since peripherals, especially those in device mode, are often operated in an interrupt driven way, the peripheral's core operating clock frequently must stay alive even if the processor is asleep.
-This implies that in order for peripherals to completely support `wfi` clock gating, they must be split between functional clocks and bus clocks.
-
-The bus clocks represent the software interface and can be turned off based on `wfi gating`, while the functional clocks should be kept running to ensure outside activity can be captured and interrupts created.
-In this scenario, it is important to ensure the functional clocks are responsible for creating interrupts and not the bus clocks, as the latter may not be available during `wfi`.
-
-This division may only be beneficial for peripherals where register and local fabric size is large relative to the functional component.
-
-### Infrastructural Clock Group
-
-This clock group matches `wfi` functionality well.
-Most infrastructural components such as fabric, gaskets and memories, have no need to be clocked when the processor is idle.
-Some components such as flash controller however would also need to be split into bus and functional halves to support long, background activities while the processor is idle.
-
-However, there are additional complications.
-In systems where the processor is not the only bus host, `wfi` can only be used as the software request and not final clock state decision.
-Hardware driven requests, such as those coming from a `dma` or any peripheral driven bus host, would also need to be included as part of the equation.
-Further, since it is possible hardware may issue requests at the boundary of a clock state changes, additional fabric gaskets would be required to protect hosts when destinations are temporarily clocked off.
-The bus requests themselves thus become dynamic clock request signals to help enable components in its path.
-
-There is thus a moderate design and high verification cost to supporting `wfi` gating for the infrastructural group.
-
-## Block Diagram
-
-The following is a high level block diagram of the clock manager.
-
-![Clock Manager Block Diagram](./doc/clkmgr_block_diagram.svg)
-
-### Reset Domains
-
-Since the function of the clock manager is tied closely into the power-up behavior of the device, the reset domain selection must also be purposefully done.
-To ensure that default clocks are available for the [power manager to release resets and initialize memories](../pwrmgr/README.md#fast-clock-domain-fsm), the clock dividers inside the clock manager directly use `por` (power-on-reset) derived resets.
-This ensures that the root clocks are freely running after power-up and its status can be communicated to the `pwrmgr` regardless of any other activity in the device.
-
-The other functions inside the clock manager operate on the `life cycle reset` domain.
-This ensures that other clock manager functions still release early relative to most functions in the system, and that a user or escalation initiated reset still restores the clock manager to a default clean slate.
-
-The escalation reset restoration is especially important as the clock manager can generate fatal faults that lead to escalation.
-If there were not a mechanism that allows escalation to clear the original fault, the system would simply remain in a faulted state until a user initiated a `por` event.
-
-For a detailed breakdown between `por` and `life cycle` resets, please see the [reset manager](../rstmgr/README.md).
-
-The following diagram enhances the block diagram to illustrate the overall reset domains of the clock manager.
-![Clock Manager Block Diagram](./doc/clkmgr_rst_domain.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#interfaces)
-
-## Design Details
-
-### Root Clock Gating and Interface with Power Manager
-
-All clock groups except the power-up group run from gated source clocks.
-The source clocks are gated off during low power states as controlled by the power manager.
-When the power manager makes a clock enable request, the clock manager ensures all root clock gates are enabled before acknowledging.
-Likewise, when the power manager makes a clock disable request, the clock manager ensures all root clock gates off disabled before acknowledging.
-
-Note, the power manager's request to turn off clocks supersedes all other local controls in the clock manager.
-This means even if a particular clock is turned on by the clock manager (for example a transactional unit that is ongoing or a peripheral that is enabled), the power manager requests will still turn clocks on / off at the root.
-
-This makes it software's responsibility to ensure low power entry requests (which can only be initiated by software) do not conflict with any ongoing activities controlled by software.
-For example, software should ensure that Aes / Otbn activities have completed before initializing a low power entry process.
-
-### Clock Division
-
-Not all clocks are the same frequency as the source.
-In cases where there is a frequency mismatch, the clock manager supports clock dividers to step down the frequency.
-The divided frequency is not assumed to be synchronous with its source and are thus treated like another asynchronous branch.
-Further, the clock dividers are hardwired and have no software control, this is to further ensure there are no simple paths for faulty or malicious software to tamper.
-
-### Wait-for-Interrupt Support
-
-Given the marginal benefits and the increased complexity of `wfi` support, the first version of this design does not support `wfi` gating.
-All `wfi CG` modules in the block diagram are thus drawn with dashed lines to indicate it can be theoretically supported but currently not implemented.
-
-It may be added for future more complex systems where there is a need to tightly control infrastructural power consumption as a result from clocks.
-
-### External Clock Switch Support
-
-Clock manager supports the ability to request root clocks switch to an external clock.
-There are two occasions where this is required:
--  Life cycle transition from `Raw` / `Test_locked` to `Test_unlocked` [states](../lc_ctrl/README.md#clk_byp_req).
--  Software request for external clocks during normal functional mode.
-
-
-#### Life Cycle Requested External Clock
-
-The life cycle controller only requests the io clock input to be switched.
-When the life cycle controller requests external clock, a request signal `lc_clk_byp_req_i` is sent from `lc_ctrl` to `clkmgr`.
-`clkmgr` then forwards the request to `ast` through `io_clk_byp_req_o`, which performs the actual clock switch and is acknowledged through `io_clk_byp_ack_i`.
-When the clock switch is complete, the clock dividers are stepped down by a factor of 2 and the life cycle controller is acknowledged through `lc_clk_byp_ack_o`.
-
-
-#### Software Requested External Clocks
-
-Unlike the life cycle controller, a software request for external clocks switches all clock sources to an external source.
-Software request for external clocks is not always valid.
-Software is only able to request for external clocks when hardware debug functions are [allowed](../lc_ctrl/README.md#hw_debug_en).
-
-When software requests the external clock switch, it also provides an indication how fast the external clock is through [`EXTCLK_CTRL.HI_SPEED_SEL`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#extclk_ctrl).
-There are two supported clock speeds:
-* High speed - external clock is close to nominal speeds (e.g. external clock is 96MHz and nominal frequency is 96MHz-100MHz)
-* Low speed - external clock is half of nominal speeds (e.g. external clock is 48MHz and nominal frequency is 96MHz-100MHz)
-
-When software requests external clock, the register bit [`EXTCLK_CTRL.SEL`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#extclk_ctrl) is written.
-If hardware debug functions are allowed, the `clkmgr` sends a request signal `all_clk_byp_req_o` to `ast` and is acknowledged through `all_clk_byp_ack_i`.
-
-If software requests a low speed external clock, at the completion of the switch, internal dividers are also stepped down.
-When the divider is stepped down, a divide-by-4 clock becomes divide-by-2 clock , and a divide-by-2 becomes a divide-by-1 clock.
-
-If software requests a high speed external clock, the dividers are kept as is.
-
-
-Note, software external clock switch support is meant to be a debug / evaluation feature, and should not be used in conjunction with the clock frequency and timeout measurement features.
-This is because if the clock frequency suddenly changes, the thresholds used for timeout / measurement checks will no longer apply.
-There is currently no support in hardware to dynamically synchronize a threshold change to the expected frequency.
-
-#### Clock Frequency Summary
-
-The table below summarises the valid modes and the settings required.
-
-| Mode                                            | `lc_clk_byp_req_i`     | `extclk_ctrl.sel` | `extclk_ctrl.hi_speed_sel`  | life cycle state        |
-| -------------                                   | ---------------------  | ----------------- | ----------------------------| ----------------------- |
-| Life cycle in RAW, TEST* and RMA states         | `lc_ctrl_pkg::On`      | `kMultiBit4False` | Don't care                  | Controlled by `lc_ctrl` |
-| Internal Clocks                                 | `lc_ctrl_pkg::Off`     | `kMultiBit4False` | Don't care                  | All                     |
-| Software external high speed                    | `lc_ctrl_pkg::Off`     | `kMultiBit4True`  | `kMultiBit4True`            | TEST_UNLOCKED, RMA      |
-| Software external low speed                     | `lc_ctrl_pkg::Off`     | `kMultiBit4True`  | `kMultiBit4False`           | TEST_UNLOCKED, RMA      |
-
-The table below summarizes the frequencies in each mode.
-This table assumes that the internal clock source is 96MHz.
-This table also assumes that high speed external clock is 96MHz, while low speed external clock is 48MHz.
-
-| Mode                         | External Clock Frequency | div_1_clock   | div_2_clock     | div_4_clock  |
-| -------------                | ------------------------ | ------------- | --------------- | -------------|
-| Internal Clocks              | Not applicable           | 96MHz         | 48MHz           | 24MHz        |
-| Life cycle transition        | 48MHz                    | 48MHz         | 48MHz           | 24MHz        |
-| Software external high speed | 96MHz                    | 96MHz         | 48MHz           | 24MHz        |
-| Software external low speed  | 48MHz                    | 48MHz         | 48MHz           | 24MHz        |
-
-As can be seen from the table, the external clock switch scheme prioritizes the stability of the divided clocks, while allowing the undivided clocks to slow down.
-
-
-### Clock Frequency / Time-out Measurements
-
-Clock manager can continuously measure root clock frequencies to see if any of the root clocks have deviated from the expected frequency.
-This feature can be enabled through the various measurement control registers such as [`IO_MEASURE_CTRL`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#io_measure_ctrl).
-
-The root clocks, specifically the clocks supplied from `ast` and their divided variants, are constantly measured against the `always on clock` when this feature is enabled.
-Software sets both an expected maximum and minimum for each measured clock.
-
-Clock manager then counts the number of relevant root clock cycles in each always-on clock period.
-If the resulting count differs from the programmed thresholds, a recoverable error is registered.
-
-Since the counts are measured against a single cycle of always on clock, the minimal error that can be detected is dependent on the clock ratio between the measured clock and 1 cycle of of the always on clock.
-Assume a 24MHz clock and an always-on clock of 200KHz.
-The minimal error detection is then 200KHz / 24MHz, or approximately 0.83%.
-
-This means if the clock's actual value is between 23.8MHz and 24.2MHz, this deviation will not be detected.
-Conversely, if the clock's natural operation has an error range wider than this resolution, the min / max counts must be adjusted to account for this error.
-
-Additionally, clock manager uses a similar time-out mechanism to see if any of the root clocks have stopped toggling for an extended period of time.
-This is done by creating an artificial handshake between the root clock domain and the always on clock domain that must complete within a certain amount of time based on known clock ratios.
-Based on the nature of the handshake and the margin window, the minimal timeout detection window is approximately 2-4 always on clock cycles.
-If the root clock domain stops and resumes in significantly less time than this window, the time-out may not be detected.
-
-There are three types of errors:
-* Clock too fast error
-* Clock too slow error
-* Clock time-out error
-
-Clock too fast is registered when the clock cycle count is greater than the software programmed max threshold.
-Clock too slow is registered when the clock cycle count is less than the software programmed min threshold.
-Clock time-out is registered when the clock stops toggling and the timeout threshold is reached.
-
-As these are all software supplied values, the entire measurement control can be locked from further programming through [`MEASURE_CTRL_REGWEN`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#measure_ctrl_regwen).
-
-# Programmers Guide
-
-There are in general only two software controllable functions in the clock manager.
-
-
-## Transactional Clock Hints
-
-To enable a transactional clock, set the corresponding hint in [`CLK_HINTS`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_hints) to `1`.
-To disable a transactional clock, set the corresponding hint in [`CLK_HINTS`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_hints) to `0`.
-Note, a `0` does not indicate clock is actually disabled, software can thus check [`CLK_HINTS_STATUS`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_hints_status) for the actual state of the clock.
-
-## Peripheral Clock Controls
-To control peripheral clocks, directly change the bits in [`CLK_ENABLES`](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_enables).
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_clkmgr.h)
-
-## Register Table
-
-* [Register Table](../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#registers)
diff --git a/hw/ip/clkmgr/doc/programmers_guide.md b/hw/ip/clkmgr/doc/programmers_guide.md
new file mode 100644
index 0000000000000..e5787ec58be84
--- /dev/null
+++ b/hw/ip/clkmgr/doc/programmers_guide.md
@@ -0,0 +1,21 @@
+# Programmer's Guide
+
+There are in general only two software controllable functions in the clock manager.
+
+
+## Transactional Clock Hints
+
+To enable a transactional clock, set the corresponding hint in [`CLK_HINTS`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_hints) to `1`.
+To disable a transactional clock, set the corresponding hint in [`CLK_HINTS`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_hints) to `0`.
+Note, a `0` does not indicate the clock is actually disabled; software can thus check [`CLK_HINTS_STATUS`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_hints_status) for the actual state of the clock.
+
+## Peripheral Clock Controls
+To control peripheral clocks, directly change the bits in [`CLK_ENABLES`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_enables).
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_clkmgr.h)
+
+## Register Table
+
+* [Register Table](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#registers)
diff --git a/hw/ip/clkmgr/doc/theory_of_operation.md b/hw/ip/clkmgr/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..4a87d14d2a2fd
--- /dev/null
+++ b/hw/ip/clkmgr/doc/theory_of_operation.md
@@ -0,0 +1,289 @@
+# Theory of Operation
+
+Clock management in OpenTitan is divided into groups.
+Each group has specific attributes and controls whether software is allowed to influence individual clocks during the active power state.
+For low power states, please see [power manager](../../pwrmgr/README.md).
+
+The grouping is derived from the chip partition and related security properties.
+For illustrative purposes, this document uses the following assumed chip partition:
+
+![Example chip partition](../doc/example_chip_partition.svg)
+
+The actual partition may differ per design, however the general principles are assumed to be the same.
+Each group can be made up of more than 1 source clock.
+The clocks themselves may be asynchronous - the grouping is thus a logical grouping instead of a physical one.
+
+The grouping is summarized in the table below and described in more detail afterwards.
+The table shows the group name, the modules that belong to each group, and whether SW can directly (via register control) or indirectly (via wait-for-interrupt) influence the state of the clock in the form of clock gating.
+
+| Group           | Frequencies                    | Modules                                                        | Software       | Wait for Interrupt |
+| -------------   | ------------------------------ | -------------------------------------------------------------- | -------------- | ------------------ |
+| Power-up        | 100~200KHz, 24MHz              | Clock Manager, Power Manager, Reset Manager, Pinmux            | No             | No                 |
+| Transactional   | ~100MHz                        | Aes, Kmac, Hmac, Key Manager, Otbn                             | Yes (1)        | Yes (2)            |
+| Infrastructural | 24MHz, ~100MHz                 | Fabric, Fabric gaskets (iopmp), Memories                       | No             | Yes (3)            |
+| Security        | 24MHz, ~100MHz                 | Alert handler, Entropy, Life cycle, Plic, Sensors              | No             | No                 |
+| Peripheral      | 24MHz, 48MHz, 96MHz            | I2c, Spi, Uart, Usb, others                                    | Yes            | Yes                |
+| Timers          | 100-200KHz, 24MHz              | AON timers, Timers, Watchdog                                   | No             | No                 |
+
+* 1 - Transactional clock group's software control is only a software hint.
+* 2 - Transactional clock group's wait-for-interrupt control is only a hint.
+* 3 - May require additional complexity to handle multi-host (non-wait-for-interrupt) scenarios.
+
+## Power-up Clock Group
+
+The group refers to modules responsible for power up, such as power, reset and clock managers.
+Large portions of these modules operate to release clocks and resets for the rest of the design, thus cannot operate on gated versions of the clocks themselves.
+They are the only group running clocks directly from the source.
+All following groups are derived after root clock gating.
+See [block diagram](#block-diagram) for more details.
+
+## Transactional Clock Group
+
+This group refers to the collection of modules that are transactional by nature (example: `Hmac` / `Aes` / `Kmac`).
+This means these modules perform specific tasks (for example encrypt, decrypt or hashing).
+While performing these tasks, it is unsafe to manipulate or change the clocks.
+Once these tasks are complete, the clocks can be safely shut-off.
+
+To ensure such behavior on the clocks, the final clock enable is qualified with an `Idle` indication to signify that a transaction is ongoing and manipulation of the clock is not permitted.
+The `Idle` signal must be sourced from the transactional modules and sent to the clock manager.
+
+For this group software can only express its intent to shut-off, and does not have full control over the final state.
+This intent is indicated with a register in the clock manager register file, see [`CLK_HINTS`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#clk_hints).
+
+Even when the hint is set, the `Idle` does not directly manipulate the clock.
+When an idle indication is received, the `clkmgr` counts for a period of 10 local clocks to ensure the idle was not a glitch.
+
+Wait-for-interrupt based control is already a software hint; it can thus be applied to this group with the same `Idle` requirement.
+
+For modules in this group, each module can be individually influenced, and thus each has its own dedicated clock gating logic.
+The added benefit of allowing software control in this group is to save power, as some transactional modules can be both power and area hungry.
+
+## Infrastructure Clock Group
+
+This group refers to the collection of modules that support infrastructure functions.
+
+If the clocks to these modules are turned off, there may not be a way to turn them back on and could thus result in system deadlock.
+This includes but is not limited to:
+* Turning off fabric / gasket clocks, and thus having no way to access the fabric and resume the clock.
+* Turning off memory clocks such that there is no way to execute code that would resume the clocks.
+
+For this group, there is no reason to allow software control over the clocks, as it could be used to create a system deadlock where after disabling infrastructure clocks there is no way to turn them back on.
+Wait-for-interrupt controls however can be used, as long as there is a way to break the processor out of wait-for-interrupt and handle other bus hosts, while also separating the functional portions from bus access.
+See the Wait-for-Interrupt (wfi) Gating section below for more details.
+
+## Security Clock Group
+
+The security clock group is made up of security modules that either have background functions (entropy, alert manager, sensors) or perform critical security functions where disabling clocks could have unexpected side effects (life cycle, otp, pinmux, plic).
+
+For this group, no software influence over the clock state is allowed during the active state.
+The clocks are always running as long as the source is on.
+
+This group is not functionally identical to the power-up group.
+The power-up group is run on clocks directly from the clock source, while the security group is derived after root clock gating.
+
+## Timer Clock Group
+
+The timer clock group is composed of modules that track time for various purposes.
+As influencing time can change the perspective of software and potentially reveal security vulnerabilities, the clock state for these modules cannot be directly or indirectly influenced by software.
+
+Functionally, this group is identical to the security group.
+
+## Peripheral Clock Group
+
+The peripheral clock group is composed of I/O peripheral modules.
+By their nature, I/O peripherals are both transactional and most of the time not security critical - so long as proper care is taken to sandbox peripherals from the system.
+
+These modules can be both directly and indirectly controlled by software.
+The controls can also be individual to each peripheral.
+
+## Wait-for-Interrupt (wfi) Gating
+
+Wait-for-interrupt clock gating refers to the mechanism of using a processor’s sleep indication to actively gate off module clocks.
+Of the groups enumerated, only transactional, infrastructural and peripheral groups can be influenced by `wfi`.
+
+As `wfi` is effectively a processor clock request, there are subtleties related to its use.
+The interaction with each clock group is briefly described below.
+
+### Transactional Clock Group
+
+While `wfi` gating can be applied to this group, the modules in this category are already expected to be turned off and on by software depending on usage.
+Specifically, these modules are already completely managed by software when not in use, thus may not see significant benefit from `wfi` gating.
+
+### Peripheral Clock Group
+
+Since peripherals, especially those in device mode, are often operated in an interrupt driven way, the peripheral's core operating clock frequently must stay alive even if the processor is asleep.
+This implies that in order for peripherals to completely support `wfi` clock gating, they must be split between functional clocks and bus clocks.
+
+The bus clocks represent the software interface and can be turned off based on `wfi` gating, while the functional clocks should be kept running to ensure outside activity can be captured and interrupts created.
+In this scenario, it is important to ensure the functional clocks are responsible for creating interrupts and not the bus clocks, as the latter may not be available during `wfi`.
+
+This division may only be beneficial for peripherals where register and local fabric size is large relative to the functional component.
+
+### Infrastructural Clock Group
+
+This clock group matches `wfi` functionality well.
+Most infrastructural components such as fabric, gaskets and memories, have no need to be clocked when the processor is idle.
+Some components such as flash controller however would also need to be split into bus and functional halves to support long, background activities while the processor is idle.
+
+However, there are additional complications.
+In systems where the processor is not the only bus host, `wfi` can only be used as a software request and not as the final clock state decision.
+Hardware driven requests, such as those coming from a `dma` or any peripheral driven bus host, would also need to be included as part of the equation.
+Further, since it is possible hardware may issue requests at the boundary of a clock state change, additional fabric gaskets would be required to protect hosts when destinations are temporarily clocked off.
+The bus requests themselves thus become dynamic clock request signals to help enable components in their path.
+
+There is thus a moderate design and high verification cost to supporting `wfi` gating for the infrastructural group.
+
+## Block Diagram
+
+The following is a high level block diagram of the clock manager.
+
+![Clock Manager Block Diagram](../doc/clkmgr_block_diagram.svg)
+
+### Reset Domains
+
+Since the function of the clock manager is tied closely into the power-up behavior of the device, the reset domain selection must also be purposefully done.
+To ensure that default clocks are available for the [power manager to release resets and initialize memories](../../pwrmgr/README.md#fast-clock-domain-fsm), the clock dividers inside the clock manager directly use `por` (power-on-reset) derived resets.
+This ensures that the root clocks are freely running after power-up and their status can be communicated to the `pwrmgr` regardless of any other activity in the device.
+
+The other functions inside the clock manager operate on the `life cycle reset` domain.
+This ensures that other clock manager functions still release early relative to most functions in the system, and that a user or escalation initiated reset still restores the clock manager to a default clean slate.
+
+The escalation reset restoration is especially important as the clock manager can generate fatal faults that lead to escalation.
+If there were not a mechanism that allows escalation to clear the original fault, the system would simply remain in a faulted state until a user initiated a `por` event.
+
+For a detailed breakdown between `por` and `life cycle` resets, please see the [reset manager](../../rstmgr/README.md).
+
+The following diagram enhances the block diagram to illustrate the overall reset domains of the clock manager.
+![Clock Manager Reset Domains](../doc/clkmgr_rst_domain.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#interfaces)
+
+## Design Details
+
+### Root Clock Gating and Interface with Power Manager
+
+All clock groups except the power-up group run from gated source clocks.
+The source clocks are gated off during low power states as controlled by the power manager.
+When the power manager makes a clock enable request, the clock manager ensures all root clock gates are enabled before acknowledging.
+Likewise, when the power manager makes a clock disable request, the clock manager ensures all root clock gates are disabled before acknowledging.
+
+Note, the power manager's request to turn off clocks supersedes all other local controls in the clock manager.
+This means even if a particular clock is turned on by the clock manager (for example a transactional unit that is ongoing or a peripheral that is enabled), the power manager requests will still turn clocks on / off at the root.
+
+This makes it software's responsibility to ensure low power entry requests (which can only be initiated by software) do not conflict with any ongoing activities controlled by software.
+For example, software should ensure that Aes / Otbn activities have completed before initiating a low power entry process.
+
+### Clock Division
+
+Not all clocks are the same frequency as the source.
+In cases where there is a frequency mismatch, the clock manager supports clock dividers to step down the frequency.
+The divided clocks are not assumed to be synchronous with their source and are thus treated like another asynchronous branch.
+Further, the clock dividers are hardwired and have no software control; this further ensures there are no simple paths for faulty or malicious software to tamper with them.
+
+### Wait-for-Interrupt Support
+
+Given the marginal benefits and the increased complexity of `wfi` support, the first version of this design does not support `wfi` gating.
+All `wfi CG` modules in the block diagram are thus drawn with dashed lines to indicate that they can theoretically be supported but are currently not implemented.
+
+It may be added for future more complex systems where there is a need to tightly control infrastructural power consumption resulting from clocks.
+
+### External Clock Switch Support
+
+Clock manager supports the ability to request that root clocks switch to an external clock.
+There are two occasions where this is required:
+-  Life cycle transition from `Raw` / `Test_locked` to `Test_unlocked` [states](../../lc_ctrl/README.md#clk_byp_req).
+-  Software request for external clocks during normal functional mode.
+
+
+#### Life Cycle Requested External Clock
+
+The life cycle controller only requests the io clock input to be switched.
+When the life cycle controller requests external clock, a request signal `lc_clk_byp_req_i` is sent from `lc_ctrl` to `clkmgr`.
+`clkmgr` then forwards the request to `ast` through `io_clk_byp_req_o`, which performs the actual clock switch and is acknowledged through `io_clk_byp_ack_i`.
+When the clock switch is complete, the clock dividers are stepped down by a factor of 2 and the life cycle controller is acknowledged through `lc_clk_byp_ack_o`.
+
+
+#### Software Requested External Clocks
+
+Unlike the life cycle controller, a software request for external clocks switches all clock sources to an external source.
+Software request for external clocks is not always valid.
+Software is only able to request external clocks when hardware debug functions are [allowed](../../lc_ctrl/README.md#hw_debug_en).
+
+When software requests the external clock switch, it also provides an indication of how fast the external clock is through [`EXTCLK_CTRL.HI_SPEED_SEL`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#extclk_ctrl).
+There are two supported clock speeds:
+* High speed - external clock is close to nominal speeds (e.g. external clock is 96MHz and nominal frequency is 96MHz-100MHz)
+* Low speed - external clock is half of nominal speeds (e.g. external clock is 48MHz and nominal frequency is 96MHz-100MHz)
+
+When software requests external clock, the register bit [`EXTCLK_CTRL.SEL`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#extclk_ctrl) is written.
+If hardware debug functions are allowed, the `clkmgr` sends a request signal `all_clk_byp_req_o` to `ast` and is acknowledged through `all_clk_byp_ack_i`.
+
+If software requests a low speed external clock, at the completion of the switch, internal dividers are also stepped down.
+When the divider is stepped down, a divide-by-4 clock becomes a divide-by-2 clock, and a divide-by-2 clock becomes a divide-by-1 clock.
+
+If software requests a high speed external clock, the dividers are kept as is.
+
+
+Note, software external clock switch support is meant to be a debug / evaluation feature, and should not be used in conjunction with the clock frequency and timeout measurement features.
+This is because if the clock frequency suddenly changes, the thresholds used for timeout / measurement checks will no longer apply.
+There is currently no support in hardware to dynamically synchronize a threshold change to the expected frequency.
+
+#### Clock Frequency Summary
+
+The table below summarizes the valid modes and the settings required.
+
+| Mode                                            | `lc_clk_byp_req_i`     | `extclk_ctrl.sel` | `extclk_ctrl.hi_speed_sel`  | life cycle state        |
+| -------------                                   | ---------------------  | ----------------- | ----------------------------| ----------------------- |
+| Life cycle in RAW, TEST* and RMA states         | `lc_ctrl_pkg::On`      | `kMultiBit4False` | Don't care                  | Controlled by `lc_ctrl` |
+| Internal Clocks                                 | `lc_ctrl_pkg::Off`     | `kMultiBit4False` | Don't care                  | All                     |
+| Software external high speed                    | `lc_ctrl_pkg::Off`     | `kMultiBit4True`  | `kMultiBit4True`            | TEST_UNLOCKED, RMA      |
+| Software external low speed                     | `lc_ctrl_pkg::Off`     | `kMultiBit4True`  | `kMultiBit4False`           | TEST_UNLOCKED, RMA      |
+
+The table below summarizes the frequencies in each mode.
+This table assumes that the internal clock source is 96MHz.
+This table also assumes that high speed external clock is 96MHz, while low speed external clock is 48MHz.
+
+| Mode                         | External Clock Frequency | div_1_clock   | div_2_clock     | div_4_clock  |
+| -------------                | ------------------------ | ------------- | --------------- | -------------|
+| Internal Clocks              | Not applicable           | 96MHz         | 48MHz           | 24MHz        |
+| Life cycle transition        | 48MHz                    | 48MHz         | 48MHz           | 24MHz        |
+| Software external high speed | 96MHz                    | 96MHz         | 48MHz           | 24MHz        |
+| Software external low speed  | 48MHz                    | 48MHz         | 48MHz           | 24MHz        |
+
+As can be seen from the table, the external clock switch scheme prioritizes the stability of the divided clocks, while allowing the undivided clocks to slow down.
+
+
+### Clock Frequency / Time-out Measurements
+
+Clock manager can continuously measure root clock frequencies to see if any of the root clocks have deviated from the expected frequency.
+This feature can be enabled through the various measurement control registers such as [`IO_MEASURE_CTRL`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#io_measure_ctrl).
+
+The root clocks, specifically the clocks supplied from `ast` and their divided variants, are constantly measured against the `always on clock` when this feature is enabled.
+Software sets both an expected maximum and minimum for each measured clock.
+
+Clock manager then counts the number of relevant root clock cycles in each always-on clock period.
+If the resulting count falls outside the programmed thresholds (above the maximum or below the minimum), a recoverable error is registered.
+
+Since the counts are measured against a single cycle of always on clock, the minimal error that can be detected is dependent on the clock ratio between the measured clock and 1 cycle of the always on clock.
+Assume a 24MHz clock and an always-on clock of 200KHz.
+The minimal error detection is then 200KHz / 24MHz, or approximately 0.83%.
+
+This means if the clock's actual value is between 23.8MHz and 24.2MHz, this deviation will not be detected.
+Conversely, if the clock's natural operation has an error range wider than this resolution, the min / max counts must be adjusted to account for this error.
+
+Additionally, clock manager uses a similar time-out mechanism to see if any of the root clocks have stopped toggling for an extended period of time.
+This is done by creating an artificial handshake between the root clock domain and the always on clock domain that must complete within a certain amount of time based on known clock ratios.
+Based on the nature of the handshake and the margin window, the minimal timeout detection window is approximately 2-4 always on clock cycles.
+If the root clock domain stops and resumes in significantly less time than this window, the time-out may not be detected.
+
+There are three types of errors:
+* Clock too fast error
+* Clock too slow error
+* Clock time-out error
+
+Clock too fast is registered when the clock cycle count is greater than the software programmed max threshold.
+Clock too slow is registered when the clock cycle count is less than the software programmed min threshold.
+Clock time-out is registered when the clock stops toggling and the timeout threshold is reached.
+
+As these are all software supplied values, the entire measurement control can be locked from further programming through [`MEASURE_CTRL_REGWEN`](../../../top_earlgrey/ip/clkmgr/data/autogen/clkmgr.hjson#measure_ctrl_regwen).
diff --git a/hw/ip/csrng/README.md b/hw/ip/csrng/README.md
index 1439f86d15e29..4d1a06338e0e5 100644
--- a/hw/ip/csrng/README.md
+++ b/hw/ip/csrng/README.md
@@ -107,603 +107,3 @@ It is expected that the requesting block (EDN) will do an additional hardware ch
 
 ## Compatibility
 This block is compatible with NIST's SP 800-90A and BSI's AIS31 recommendations for Common Criteria.
-
-# Theory of Operations
-
-The CSRNG block has been constructed to follow the NIST recommendation for a DRBG mechanism based on block ciphers.
-Specifically, it is a CTR_DRBG that uses an approved block cipher algorithm in counter mode.
-As such, the block diagram below makes reference to hardware blocks that either directly or closely follow NIST descriptions for the equivalent functions.
-
-There are two major hardware interfaces: the application interface and the entropy request interface.
-The application interface, which is described in more detail later, is provided for an application to manage an `instance` in CSRNG.
-Once setup, the application interface user can request for entropy bits to be generated, as well as other functions.
-The application interface supports up to 15 hardware interfaces, and one software interface.
-
-A walk through of how CSRNG generates entropy bits begins with the application interface.
-An `instantiate` command is issued from one of the application interfaces.
-This request moves into the `cmd_stage` block.
-Here the request is arbitrated between all of the `cmd_stage` blocks.
-The winner will get its command moved into the command dispatch logic.
-A common state machine will process all application interface commands in order of arbitration.
-At this point, some seed entropy may be required depending on the command and any flags.
-If needed, a request to the entropy source hardware interface will be made.
-This step can take milliseconds if seed entropy is not immediately available.
-Once all of the prerequisites have been collected, a CTR_DRBG command can be launched.
-This command will go into the `ctr_drbg_cmd` block.
-This `ctr_drbg_cmd` block uses two NIST-defined functions, the update and the `block_encrypt` functions.
-If the command is a generate, the `ctr_drbg_cmd` block will process the first half of the algorithm, and then pass it on to the `ctr_drbg_gen` block.
-Additionally, the `ctr_drbg_gen` block also uses the `update` block and the `block_encrypt` block.
-To keep resources to a minimum, both of these blocks have arbiters to allow sharing between the `ctr_drbg_cmd` and `ctr_drbg_gen` blocks.
-The command field called `ccmd` (for current command) is sent along the pipeline to not only identify the command, but is also reused as a routing tag for the arbiters to use when returning the block response.
-
-Once the command has traversed through all of the CTR_DRBG blocks, the result will eventually land into the `state_db` block.
-This block will hold the instance state for each application interface.
-The specific state information held in the instance is documented below.
-If the command was a `generate` command, the genbits data word will be returned to the requesting `cmd_stage` block.
-Finally, an `ack` response and status will be returned to the application interface once the command has been completely processed.
-
-
-## Block Diagram
-
-![CSRNG Block Diagram](./doc/csrng_blk_diag.svg)
-
-## Hardware Interfaces
-
- * [Interface Tables](data/csrng.hjson#interfaces)
-
-The table below lists other CSRNG signals.
-
-Signal                       | Direction        | Type                        | Description
------------------------------|------------------|-----------------------------|---------------
-`otp_en_csrng_sw_app_read_i` | `input `         | `otp_en_t `                 | An efuse that will enable firmware to access the NIST CTR_DRBG internal state and genbits through registers.
-`lc_hw_debug_en_i`           | `input`          | `lc_tx_t `                  | A life-cycle that will select which diversification value is used for xoring with the seed from ENTROPY_SRC.
-`entropy_src_hw_if_o`        | `output`         | `entropy_src_hw_if_req_t`   | Seed request made to the ENTROPY_SRC module.
-`entropy_src_hw_if_i`        | `input`          | `entropy_src_hw_if_rsp_t`   | Seed response from the ENTROPY_SRC module.
-`cs_aes_halt_i`              | `input`          | `cs_aes_halt_req_t`         | Request to CSRNG from ENTROPY_SRC to halt requests to the AES block for power leveling purposes.
-`cs_aes_halt_o`              | `output`         | `cs_aes_halt_rsp_t`         | Response from CSRNG to ENTROPY_SRC that all requests to AES block are halted.
-`csrng_cmd_i`                | `input`          | `csrng_req_t`               | Application interface request to CSRNG from an EDN block.
-`csrng_cmd_o`                | `output`         | `csrng_rsp_t`               | Application interface response from CSRNG to an EDN block.
-
-
-## Design Details
-
-#### Non-blocking Commands
-Regarding command processing, all commands process immediately except for the generate command.
-The command generate length count (`glen`) is kept in the `cmd_stage` block.
-When the `state_db` block issues an ack to the `cmd_stage` block, the `cmd_stage` block increments an internal counter.
-This process repeats until the `glen` field value has been matched.
-Because each request is pipelined, requests from other `cmd_stage` blocks can be processed before the original generate command is completely done.
-This provides some interleaving of commands since a generate command can be programmed to take a very long time.
-
-When sending an unsupported or illegal command, `CS_MAIN_SM_ALERT` will be triggered, but there will be no status response or indication of which app the error occurred in.
-
-#### Working State Values
-The CSRNG working state data base (`state_db`) contains the current working state for a given DRBG instance.
-It holds the following values:
-
-<table>
-<caption>Values stored by <tt>state_db</tt></caption>
-<thead>
-  <tr>
-    <th>Bits</th>
-    <th>Name</th>
-    <th>Description</th>
-  </tr>
-</thead>
-<tbody>
-  <tr>
-    <td>31:0</td>
-    <td>Reseed Counter</td>
-    <td> Value required and defined by NIST's SP 800-90A to be held in the state instance.
-    It keeps track of the number of pseudorandom bits requested since the last instantiation or reseeding.
-    </td>
-  </tr>
-  <tr>
-    <td>159:32</td>
-    <td>V</td>
-    <td> Value required and defined by NIST's SP 800-90A to be held in the state instance, and is of size <tt>BlkLen</tt>.
-    This value changes every time a <tt>BlkLen</tt> bits of output are generated.
-    </td>
-  </tr>
-  <tr>
-    <td>415:160</td>
-    <td>Key</td>
-    <td> Value required and defined by NIST's SP 800-90A to be held in the state instance, and is of size <tt>KeyLen</tt>.
-    The key is changed after a predetermined number of blocks of output have been produced.
-    </td>
-  </tr>
-  <tr>
-    <td>416</td>
-    <td>Status</td>
-    <td> Set when instantiated.
-    </td>
-  </tr>
-  <tr>
-    <td>417</td>
-    <td>Compliance</td>
-    <td> Set when FIPS/CC compliant entropy was used to seed this instance.
-    </td>
-  </tr>
-</table>
-
-#### AES Cipher
-The `block_encrypt` block is where the `aes_cipher_core` block is located.
-This is the same block used in the AES design.
-Parameters are selected such that this is the unmasked version.
-
-#### Software Support
-The software application interface uses a set of TL-UL registers to send commands and receive generated bits.
-Since the registers are 32-bit words wide, some sequencing will need to be done by firmware to make this interface work properly.
-
-### Application Interface
-
-This section describes the application interface, which is required for performing any operations using a CSRNG instance (i.e. instantiation, reseeding, RNG generation, or uninstantiation).
-Each CSRNG instance corresponds to a unique application interface port, which implements the application interface described here.
-Any hardware peripherals which require complete control of an instance may connect directly to a dedicated interface port.
-Meanwhile peripherals without any special requirements (i.e. personalization strings or non-FIPS-approved, fully-deterministic number sequences) may share access to an instance via the entropy distribution network (EDN) IP.
-The EDNs manage the instantiation and reseeding of CSRNG instances for general use-cases, providing either on-demand or timed-delivery entropy streams to hardware peripherals.
-Firmware applications can obtain access to random bit sequences directly through application interface port 0, which is directly mapped to a set of TL-UL registers.
-
-The total number of application interface ports (for TL-UL, directly attached peripherals or EDN instances) is determined by the `NHwApp` parameter.
-
-The command bus operates like a FIFO, in which a command is pushed into the interface.
-An optional stream of additional data may follow, such as seed material for an `instantiate` application command.
-For the `generate` application command, the obfuscated entropy will be returned on the `genbits` bus.
-This bus also operates like a FIFO, and the receiving module can provide back pressure to the `genbits` bus.
-There is one instance of a firmware application interface, and it uses the TL-UL registers.
-For more details on how the application interface works, see the Theory of Operations section above.
-
-In general, users of the application interface are either firmware or some hardware module entity.
-For hardware, a module can either directly control the application interface, or it can connect to an EDN module.
-Attaching to an EDN module allows for a simpler interface connection to a more layout-friendly distributed-chip network.
-
-#### General Command Format
-
-The general format for the application interface is a 32-bit command header, optionally followed by additional data, such as a personalization string, typically twelve 32-bit words in length.
-Depending on the command, these strings are typically required to be 384-bits in length, to match the size of the seed-length when operating with 256-bit security-strength.
-The exact function of the additional data field depends in the command.
-However, in general, the additional data can be any length as specified by the command length field.
-The command header is defined below.
-
-#### Command Header
-The application interface requires that a 32-bit command header be provided to instruct the CSRNG how to manage the internal working states.
-Below is a description of the fields of this header:
-
-<table>
-<caption>Application Interface Command Header</caption>
-<thead>
-  <tr>
-    <th>Bits</th>
-    <th>Name</th>
-    <th>Description</th>
-  </tr>
-</thead>
-<tbody>
-  <tr>
-    <td>3:0</td>
-    <td>acmd</td>
-    <td> Application Command: Selects one of five operations to perform.
-         The commands supported are <tt>instantiate</tt>, <tt>reseed</tt>, <tt>generate</tt>, <tt>update</tt>, and <tt>uninstantiate</tt>.
-         Each application interface port used by peripheral hardware commands a unique instance number in CSRNG.
-    </td>
-  </tr>
-  <tr>
-    <td>7:4</td>
-    <td>clen</td>
-    <td> Command Length: Number of 32-bit words that can optionally be appended to the command.
-         A value of zero will only transfer the command header.
-         A value of <tt>4'hc</tt> will transfer the header plus an additional twelve 32-bit words of data.
-    </td>
-  </tr>
-  <tr>
-    <td>11:8</td>
-    <td>flag0</td>
-    <td> Command Flag0: flag0 is associated with current command.
-         Setting this field to kMultiBitBool4True will enable flag0 to be enabled.
-         Note that <tt>flag0</tt> is used for the <tt>instantiate</tt> and  <tt>reseed</tt> commands only, for all other commands its value is ignored.
-    </td>
-  </tr>
-  <tr>
-    <td>24:12</td>
-    <td>glen</td>
-    <td> Generate Length: Only defined for the generate command, this field is the total number of cryptographic entropy blocks requested.
-         Each unit represents 128 bits of entropy returned.
-         The NIST reference name is <tt>max_number_of_bit_per_request</tt>, and this field size supports the maximum size of 2<sup>19</sup> bits.
-         For the maximum size, this field should be set to 4096, resulting in a <tt>max_number_of_bit_per_request</tt> value of 4096 x 128 bits.
-         For a smaller example, a value of 8 would return a total of 1024 bits.
-    </td>
-  </tr>
-  <tr>
-    <td>31:25</td>
-    <td>resv</td>
-    <td> Unused and reserved.
-    </td>
-  </tr>
-</table>
-
-#### Command Description
-The command field of the application command header is described in detail in the table below.
-The actions performed by each command, as well as which flags are supported, are described in this table.
-
-<table>
-<caption>Application Interface Command Description</caption>
-<thead>
-  <tr>
-    <th>Command Name</th>
-    <th>Encoded Value</th>
-    <th>Description</th>
-  </tr>
-</thead>
-<tbody>
-  <tr>
-    <td>Instantiate</td>
-    <td>0x1</td>
-    <td> Initializes an instance in CSRNG.
-         When seeding, the following table describes how the seed is determined based on <tt>flag0</tt> and the <tt>clen</tt> field.
-         Note that the last table entry (<tt>flag0</tt> is set and <tt>clen</tt> is set to non-zero) is intended for known answer testing (KAT).
-        WARNING: Though <tt>flag0</tt> may be useful for generating fully-deterministic bit sequences, the use of this flag will render the instance non-FIPS compliant until it is re-instantiated.
-         When the <tt>Instantiate</tt> command is completed, the active bit in the CSRNG working state will be set.
-        <table>
-          <thead>
-            <tr><th>flag0</th><th>clen</th><th>Description</th></tr>
-          </thead>
-          <tbody>
-            <tr><td>0</td><td>0</td><td>Only entropy source seed is used.</td></tr>
-            <tr><td>0</td><td>1-12</td><td>Entropy source seed is xor'ed with provided additional data.</td></tr>
-            <tr><td>1</td><td>0</td><td>Seed of zero is used (no entropy source seed used).</td></tr>
-            <tr><td>1</td><td>1-12</td><td>Only provided additional data will be used as seed.</td></tr>
-          </tbody>
-        </table>
-    </td>
-  </tr>
-  <tr>
-    <td>Reseed</td>
-    <td>0x2</td>
-    <td> Reseeds an existing instance in CSRNG.
-         The <tt>flag0</tt> and <tt>clen</tt> table in the <tt>Instance</tt> command description above also applies to the <tt>Reseed</tt> command.
-         Note that the last table entry (<tt>flag0</tt> is set and <tt>clen</tt> is set to non-zero) is intended for known answer testing (KAT).
-         The <tt>Reseed</tt> command only takes in one group (a maximum of twelve 32 bit words) of generic additional data.
-         If both a seed and additional data must be provided to the <tt>Reseed</tt> command, the seed and additional data must be xor'ed first.
-         This scenario will then pass the NIST vector test requiring both a provided seed and additional data.
-    </td>
-  </tr>
-  <tr>
-    <td>Generate</td>
-    <td>0x3</td>
-    <td> Starts a request to CSRNG to generate cryptographic entropy bits.
-         The <tt>glen</tt> field defines how many 128-bit words are to be returned to the application interface.
-         The <tt>glen</tt> field needs to be a minimum value of one.
-         The NIST reference to the <tt>prediction_resistance_flag</tt> is not directly supported as a flag.
-         It is the responsibility of the calling application to reseed as needed before the <tt>Generate</tt> command to properly support prediction resistance.
-         Note that additional data is also supported when the <tt>clen</tt> field is set to non-zero.
-    </td>
-  </tr>
-  <tr>
-    <td>Update</td>
-    <td>0x4</td>
-    <td> Updates an existing instance in CSRNG.
-         This command does the same function as the <tt>Reseed</tt> command, except that:
-         <ol>
-         <li>only the additional data provided will be used in the update function (i.e. no physical entropy is gathered), and
-         <li>the <tt>Update</tt> command does not reset the reseed counter.
-         </ol>
-         When the <tt>Update</tt> command is completed, the results will be reflected in the CSRNG working state.
-    </td>
-  </tr>
-  <tr>
-    <td>Uninstantiate</td>
-    <td>0x5</td>
-    <td> Resets an instance in CSRNG.
-         Values in the instance are zeroed out.
-         When the <tt>Uninstantiate</tt> command is completed, the <tt>Status</tt> bit in the CSRNG working state will be cleared.
-         Uninstantiating an instance effectively resets it, clearing any errors that it may have encountered due to bad command syntax or entropy source failures.
-         Only a value of zero should be used for <tt>clen</tt>, since any additional data will be ignored.
-    </td>
-  </tr>
-  <tr>
-    <td>Reserved</td>
-    <td>0x0,0x6-0xf</td>
-    <td> Unused and reserved.
-    </td>
-  </tr>
-</table>
-
-#### Command Response
-
-Once a command has been completed, successfully or unsuccessfully, the CSRNG responds with a single cycle pulse on the `csrng_rsp_ack` signal associated with the same application interface port.
-If the command is successful, the `csrng_rsp_sts` signal will indicate the value 0 (`CSRNG_OK`) in the same cycle.
-Otherwise the application will receive the value 1 (`CSRNG_ERROR`) on the `csrng_rsp_sts` signal.
-A number of exception cases to be considered are enumerated in NIST SP 800-90A, and may include events such as:
-* Failure of the entropy source
-* Attempts to use an instance which has not been properly instantiated, or
-* Attempts to generate data when an instance has exceeded its maximum seed life.
-In such cases, a 32-bit exception message will be propagated to firmware via the `hw_exc_sts` register, and a `cs_hw_inst_exc` interrupt will be raised.
-
-#### Generated Bits (`genbits`) Interface
-
-In addition to the command response signals there is a bus for returning the generated bits.
-This 129-bit bus consists of 128-bits, `genbits_bus`, for the random bit sequence itself, along with a single bit flag, `genbits_fips`, indicating whether the bits were considered fully in accordance with FIPS standards.
-
-There are two cases when the sequence will not be FIPS compliant:
-- Early in the boot sequence, the `ENTROPY_SRC` generates a seed from the first 384 bits pulled from the noise source.
-This initial seed is tested to ensure some minimum quality for obfuscation use- cases, but this boot seed is not expected to be full-entropy nor do these health checks meet the 1024-bit requirement for start-up health checks required by NIST 800-90B.
-- If `flag0` is asserted during instantiation, the resulting DRBG instance will have a fully-deterministic seed, determined only by user input data.
-Such a seed will be created only using factory-entropy and will lack the physical-entropy required by NIST SP 800-90A, and thus this DRBG instance will not be FIPS compliant.
-
-#### Handshaking signals
-
-The application command signal `csrng_req_bus` is accompanied by a `csrng_valid_signal`, which is asserted by the requester when the command is valid.
-CSRNG may stall incoming commands by de-asserting the `csrng_req_ready` signal.
-A command is considered received whenever both `csrng_req_valid` and `csrng_req_ready` are asserted in the same clock cycle.
-
-Likewise a requester must only consider data on the `genbits` bus to be valid when the `genbits_valid` signal is asserted, and should assert `genbits_ready` whenever it is ready to accept the `genbits` data.
-The `genbits` data is considered successfully transmitted whenever `genbits_valid` and `genbits_ready` are asserted in the same clock cycle.
-
-A requester must always be ready to receive `csrng_req_sts` signals.
-(There is no "ready" signal for command response messages sent to hardware.)
-
-#### Waveforms
-
-##### Application Interface: Instantiate Request
-
-```wavejson
-{signal: [
-   {name: 'clk'             , wave: 'p...............|.....'},
-   {name: 'csrng_req_valid' , wave: '01............0.|.....'},
-   {name: 'csrng_req_ready' , wave: '1.............0.|..1..'},
-   {name: 'csrng_req_bus'   , wave: 'x5333333333333x.|.....',data: ['ins','sd1','sd2','sd3','sd4','sd5','sd6','sd7','sd8','sd9','sd10','sd11','sd12']},
-   {name: 'csrng_rsp_ack'   , wave: '0...............|.10..'},
-   {name: 'csrng_rsp_sts'   , wave: 'x...............|.5x..', data: ['ok']},
- {},
-]}
-```
-
-##### Application Interface:  Reseed Request
-
-```wavejson
-{signal: [
-   {name: 'clk'             , wave: 'p...............|.....'},
-   {name: 'csrng_req_valid' , wave: '01............0.|.....'},
-   {name: 'csrng_req_ready' , wave: '1.............0.|..1..'},
-   {name: 'csrng_req_bus'   , wave: 'x5333333333333x.|.....',data: ['res','ad1','ad2','ad3','ad4','ad5','ad6','ad7','ad8','ad9','ad10','ad11','ad12']},
-   {name: 'csrng_rsp_ack'   , wave: '0...............|.10..'},
-   {name: 'csrng_rsp_sts'   , wave: 'x...............|.5x..', data: ['ok']},
- {},
-]}
-```
-
-##### Application Interface:  Generate Request
-
-```wavejson
-{signal: [
-   {name: 'clk'              , wave: 'p...|...|....|....|...'},
-   {name: 'csrng_req_valid'  , wave: '010.|...|....|....|...'},
-   {name: 'csrng_req_ready'  , wave: '1...|...|....|....|...'},
-   {name: 'csrng_req_bus'    , wave: 'x5x.|...|....|....|...',data: ['gen']},
-   {name: 'csrng_rsp_ack'    , wave: '0...|...|....|....|.10'},
-   {name: 'csrng_rsp_sts'    , wave: 'x...|...|....|....|.5x', data: ['ok']},
-   {name: 'genbits_valid'    , wave: '0...|.10|.1.0|.10.|...'},
-   {name: 'csrng_rsp_fips'   , wave: '0...|.10|.1.0|.10.|...'},
-   {name: 'genbits_bus'      , wave: '0...|.40|.4.0|.40.|...', data: ['bits0','bits1','bits2']},
-   {name: 'genbits_ready'    , wave: '1...|...|0.1.|........'},
-]}
-```
-
-##### Application Interface:  Update Request
-
-```wavejson
-{signal: [
-   {name: 'clk'             , wave: 'p...............|.....'},
-   {name: 'csrng_req_valid' , wave: '01............0.|.....'},
-   {name: 'csrng_req_ready' , wave: '1.............0.|..1..'},
-   {name: 'csrng_req_bus'   , wave: 'x5333333333333x.|.....',data: ['upd','ad1','ad2','ad3','ad4','ad5','ad6','ad7','ad8','ad9','ad10','ad11','ad12']},
-   {name: 'csrng_rsp_ack'   , wave: '0...............|.10..'},
-   {name: 'csrng_rsp_sts'   , wave: 'x...............|.5x..', data: ['ok']},
- {},
-]}
-```
-
-##### Application Interface:  Uninstantiate Request
-
-```wavejson
-{signal: [
-   {name: 'clk'             , wave: 'p...............|.....'},
-   {name: 'csrng_req_valid' , wave: '010.............|.....'},
-   {name: 'csrng_req_ready' , wave: '1.0.............|..1..'},
-   {name: 'csrng_req_bus'   , wave: 'x5x.............|.....',data: ['uni']},
-   {name: 'csrng_rsp_ack'   , wave: '0...............|.10..'},
-   {name: 'csrng_rsp_sts'   , wave: 'x...............|.5x..', data: ['ok']},
- {},
-]}
-```
-
-
-##### Entropy Source Hardware Interface
-The following waveform shows an example of how the entropy source hardware interface works.
-
-
-```wavejson
-{signal: [
-   {name: 'clk'           , wave: 'p...|.........|.......'},
-   {name: 'es_req'        , wave: '0..1|..01.0..1|.....0.'},
-   {name: 'es_ack'        , wave: '0...|.10.10...|....10.'},
-   {name: 'es_bus[383:0]' , wave: '0...|.30.30...|....30.', data: ['es0','es1','es2']},
-   {name: 'es_fips'       , wave: '0...|....10...|....10.'},
-]}
-]}
-```
-
-### Interrupts
-
-The `cs_cmd_req_done` interrupt will assert when a CSRNG command has been completed.
-
-The `cs_entropy_req` interrupt will assert when CSRNG requests entropy from ENTROPY_SRC.
-
-The `cs_hw_inst_exc` interrupt will assert when any of the hardware-controlled CSRNG instances encounters an exception while executing a command, either due to errors on the command sequencing, or an exception within the `ENTROPY_SRC` IP.
-
-The `cs_fatal_err` interrupt will assert when any of the CSRNG FIFOs has a malfunction.
-The conditions that cause this to happen are either when there is a push to a full FIFO or a pull from an empty FIFO.
-
-# Programmers Guide
-
-This section discusses how software can interface with CSRNG.
-
-## Module enable and disable
-
-CSRNG may only be enabled if `ENTROPY_SRC` is enabled.
-CSRNG may only be disabled if all EDNs are disabled.
-Once disabled, CSRNG may only be re-enabled after `ENTROPY_SRC` has been disabled and re-enabled.
-
-## Endianness and Known-Answer Tests
-
-All CSRNG registers are little-endian.
-
-When providing additional data for an <tt>instantiate</tt>, <tt>reseed</tt> or <tt>update</tt> command the data words have to be written to [`CMD_REQ`](data/csrng.hjson#cmd_req) in the correct order.
-Consider a byte string B<sub>1</sub>, B<sub>2</sub>, ..., B<sub>n</sub> as defined in Appendix A of [NIST's SP 800-90A](https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf), i.e., where B<sub>1</sub> is the most significant byte and B<sub>n</sub> the least significant byte.
-Providing this sequence as additional data to CSRNG requires software to write the following 32-bit words to [`CMD_REQ`](data/csrng.hjson#cmd_req) in the following order:
-
-<table>
-<caption>Byte order when writing to [`CMD_REQ`](data/csrng.hjson#cmd_req)</caption>
-<thead>
-  <tr>
-    <th>Word Index</th>
-    <th>Byte Indices of Additional Data</th>
-  </tr>
-</thead>
-<tbody>
-  <tr>
-    <td>1</td>
-    <td>0xB<sub>n-3</sub>B<sub>n-2</sub>B<sub>n-1</sub>B<sub>n</sub></td>
-  </tr>
-  <tr>
-    <td>...</td>
-    <td>...</td>
-  </tr>
-  <tr>
-    <td>n/4-1</td>
-    <td>0xB<sub>5</sub>B<sub>6</sub>B<sub>8</sub>B<sub>8</sub></td>
-  </tr>
-  <tr>
-    <td>n/4</td>
-    <td>0xB<sub>1</sub>B<sub>2</sub>B<sub>3</sub>B<sub>4</sub></td>
-  </tr>
-</table>
-
-When reading the internal state from [`INT_STATE_VAL`](data/csrng.hjson#int_state_val), CSRNG returns the bytes of V and Key in the following order:
-<table>
-<caption>Byte order when reading from [`INT_STATE_VAL`](data/csrng.hjson#int_state_val)</caption>
-<thead>
-  <tr>
-    <th>Word Index</th>
-    <th>Byte Indices of V and Key</th>
-  </tr>
-</thead>
-<tbody>
-  <tr>
-    <td>1</td>
-    <td>0xV<sub>13</sub>V<sub>14</sub>V<sub>15</sub>V<sub>16</sub></td>
-  </tr>
-  <tr>
-    <td>...</td>
-    <td>...</td>
-  </tr>
-  <tr>
-    <td>4</td>
-    <td>0xV<sub>01</sub>V<sub>02</sub>V<sub>03</sub>V<sub>04</sub></td>
-  </tr>
-  <tr>
-    <td>5</td>
-    <td>0xKey<sub>29</sub>Key<sub>30</sub>Key<sub>31</sub>Key<sub>32</sub></td>
-  </tr>
-  <tr>
-    <td>...</td>
-    <td>...</td>
-  </tr>
-  <tr>
-    <td>11</td>
-    <td>0xKey<sub>05</sub>Key<sub>06</sub>Key<sub>07</sub>Key<sub>08</sub></td>
-  </tr>
-  <tr>
-    <td>12</td>
-    <td>0xKey<sub>01</sub>Key<sub>02</sub>Key<sub>03</sub>Key<sub>04</sub></td>
-  </tr>
-</table>
-
-Finally, when reading a byte string of say 64 bytes (16 words) B<sub>1</sub>, B<sub>2</sub>, ..., B<sub>64</sub> from [`GENBITS`](data/csrng.hjson#genbits) as defined in Appendix A of [NIST's SP 800-90A](https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf), the bytes are returned in the following order.
-Note that always 4 words return 1 128-bit GENBITS block.
-Within each block, the least significant bytes are returned first and the most significant bytes are returned last.
-In particular, the most significant byte B<sub>1</sub> of the string is read in Word 4 and the least significant byte B<sub>64</sub> of the string is read in Word 13.
-
-<table>
-<caption>Byte order when reading from [`GENBITS`](data/csrng.hjson#genbits)</caption>
-<thead>
-  <tr>
-    <th>Word Index</th>
-    <th>Byte Indices of Generated Bits</th>
-  </tr>
-</thead>
-<tbody>
-  <tr>
-    <td>1</td>
-    <td>0xB<sub>13</sub>B<sub>14</sub>B<sub>15</sub>B<sub>16</sub></td>
-  </tr>
-  <tr>
-    <td>2</td>
-    <td>0xB<sub>09</sub>B<sub>10</sub>B<sub>11</sub>B<sub>12</sub></td>
-  </tr>
-  <tr>
-    <td>3</td>
-    <td>0xB<sub>05</sub>B<sub>06</sub>B<sub>07</sub>B<sub>08</sub></td>
-  </tr>
-  <tr>
-    <td>4</td>
-    <td>0xB<sub>01</sub>B<sub>02</sub>B<sub>03</sub>B<sub>04</sub></td>
-  </tr>
-
-  <tr>
-    <td>5</td>
-    <td>0xB<sub>29</sub>B<sub>30</sub>B<sub>31</sub>B<sub>32</sub></td>
-  </tr>
-  <tr>
-    <td>6</td>
-    <td>0xB<sub>25</sub>B<sub>26</sub>B<sub>27</sub>B<sub>28</sub></td>
-  </tr>
-  <tr>
-    <td>7</td>
-    <td>0xB<sub>21</sub>B<sub>22</sub>B<sub>23</sub>B<sub>24</sub></td>
-  </tr>
-  <tr>
-    <td>8</td>
-    <td>0xB<sub>17</sub>B<sub>18</sub>B<sub>19</sub>B<sub>20</sub></td>
-  </tr>
-
-  <tr>
-    <td>...</td>
-    <td>...</td>
-  </tr>
-
-  <tr>
-    <td>13</td>
-    <td>0xB<sub>61</sub>B<sub>62</sub>B<sub>63</sub>B<sub>64</sub></td>
-  </tr>
-  <tr>
-    <td>14</td>
-    <td>0xB<sub>57</sub>B<sub>58</sub>B<sub>59</sub>B<sub>60</sub></td>
-  </tr>
-  <tr>
-    <td>15</td>
-    <td>0xB<sub>53</sub>B<sub>54</sub>B<sub>55</sub>B<sub>56</sub></td>
-  </tr>
-  <tr>
-    <td>16</td>
-    <td>0xB<sub>49</sub>B<sub>50</sub>B<sub>51</sub>B<sub>52</sub></td>
-  </tr>
-</table>
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_csrng.h)
-
-## Register Table
-
-* [Register Table](data/csrng.hjson#registers)
diff --git a/hw/ip/csrng/doc/programmers_guide.md b/hw/ip/csrng/doc/programmers_guide.md
new file mode 100644
index 0000000000000..a3c1125dd1411
--- /dev/null
+++ b/hw/ip/csrng/doc/programmers_guide.md
@@ -0,0 +1,163 @@
+# Programmer's Guide
+
+This section discusses how software can interface with CSRNG.
+
+## Module enable and disable
+
+CSRNG may only be enabled if `ENTROPY_SRC` is enabled.
+CSRNG may only be disabled if all EDNs are disabled.
+Once disabled, CSRNG may only be re-enabled after `ENTROPY_SRC` has been disabled and re-enabled.
+
+## Endianness and Known-Answer Tests
+
+All CSRNG registers are little-endian.
+
+When providing additional data for an <tt>instantiate</tt>, <tt>reseed</tt> or <tt>update</tt> command the data words have to be written to [`CMD_REQ`](../data/csrng.hjson#cmd_req) in the correct order.
+Consider a byte string B<sub>1</sub>, B<sub>2</sub>, ..., B<sub>n</sub> as defined in Appendix A of [NIST's SP 800-90A](https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf), i.e., where B<sub>1</sub> is the most significant byte and B<sub>n</sub> the least significant byte.
+Providing this sequence as additional data to CSRNG requires software to write the following 32-bit words to [`CMD_REQ`](../data/csrng.hjson#cmd_req) in the following order:
+
+<table>
+<caption>Byte order when writing to [`CMD_REQ`](../data/csrng.hjson#cmd_req)</caption>
+<thead>
+  <tr>
+    <th>Word Index</th>
+    <th>Byte Indices of Additional Data</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td>1</td>
+    <td>0xB<sub>n-3</sub>B<sub>n-2</sub>B<sub>n-1</sub>B<sub>n</sub></td>
+  </tr>
+  <tr>
+    <td>...</td>
+    <td>...</td>
+  </tr>
+  <tr>
+    <td>n/4-1</td>
+    <td>0xB<sub>5</sub>B<sub>6</sub>B<sub>7</sub>B<sub>8</sub></td>
+  </tr>
+  <tr>
+    <td>n/4</td>
+    <td>0xB<sub>1</sub>B<sub>2</sub>B<sub>3</sub>B<sub>4</sub></td>
+  </tr>
+</table>
+
+When reading the internal state from [`INT_STATE_VAL`](../data/csrng.hjson#int_state_val), CSRNG returns the bytes of V and Key in the following order:
+<table>
+<caption>Byte order when reading from [`INT_STATE_VAL`](../data/csrng.hjson#int_state_val)</caption>
+<thead>
+  <tr>
+    <th>Word Index</th>
+    <th>Byte Indices of V and Key</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td>1</td>
+    <td>0xV<sub>13</sub>V<sub>14</sub>V<sub>15</sub>V<sub>16</sub></td>
+  </tr>
+  <tr>
+    <td>...</td>
+    <td>...</td>
+  </tr>
+  <tr>
+    <td>4</td>
+    <td>0xV<sub>01</sub>V<sub>02</sub>V<sub>03</sub>V<sub>04</sub></td>
+  </tr>
+  <tr>
+    <td>5</td>
+    <td>0xKey<sub>29</sub>Key<sub>30</sub>Key<sub>31</sub>Key<sub>32</sub></td>
+  </tr>
+  <tr>
+    <td>...</td>
+    <td>...</td>
+  </tr>
+  <tr>
+    <td>11</td>
+    <td>0xKey<sub>05</sub>Key<sub>06</sub>Key<sub>07</sub>Key<sub>08</sub></td>
+  </tr>
+  <tr>
+    <td>12</td>
+    <td>0xKey<sub>01</sub>Key<sub>02</sub>Key<sub>03</sub>Key<sub>04</sub></td>
+  </tr>
+</table>
+
+Finally, when reading a byte string of say 64 bytes (16 words) B<sub>1</sub>, B<sub>2</sub>, ..., B<sub>64</sub> from [`GENBITS`](../data/csrng.hjson#genbits) as defined in Appendix A of [NIST's SP 800-90A](https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf), the bytes are returned in the following order.
+Note that each group of 4 words returns one 128-bit GENBITS block.
+Within each block, the least significant bytes are returned first and the most significant bytes are returned last.
+In particular, the most significant byte B<sub>1</sub> of the string is read in Word 4 and the least significant byte B<sub>64</sub> of the string is read in Word 13.
+
+<table>
+<caption>Byte order when reading from [`GENBITS`](../data/csrng.hjson#genbits)</caption>
+<thead>
+  <tr>
+    <th>Word Index</th>
+    <th>Byte Indices of Generated Bits</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td>1</td>
+    <td>0xB<sub>13</sub>B<sub>14</sub>B<sub>15</sub>B<sub>16</sub></td>
+  </tr>
+  <tr>
+    <td>2</td>
+    <td>0xB<sub>09</sub>B<sub>10</sub>B<sub>11</sub>B<sub>12</sub></td>
+  </tr>
+  <tr>
+    <td>3</td>
+    <td>0xB<sub>05</sub>B<sub>06</sub>B<sub>07</sub>B<sub>08</sub></td>
+  </tr>
+  <tr>
+    <td>4</td>
+    <td>0xB<sub>01</sub>B<sub>02</sub>B<sub>03</sub>B<sub>04</sub></td>
+  </tr>
+
+  <tr>
+    <td>5</td>
+    <td>0xB<sub>29</sub>B<sub>30</sub>B<sub>31</sub>B<sub>32</sub></td>
+  </tr>
+  <tr>
+    <td>6</td>
+    <td>0xB<sub>25</sub>B<sub>26</sub>B<sub>27</sub>B<sub>28</sub></td>
+  </tr>
+  <tr>
+    <td>7</td>
+    <td>0xB<sub>21</sub>B<sub>22</sub>B<sub>23</sub>B<sub>24</sub></td>
+  </tr>
+  <tr>
+    <td>8</td>
+    <td>0xB<sub>17</sub>B<sub>18</sub>B<sub>19</sub>B<sub>20</sub></td>
+  </tr>
+
+  <tr>
+    <td>...</td>
+    <td>...</td>
+  </tr>
+
+  <tr>
+    <td>13</td>
+    <td>0xB<sub>61</sub>B<sub>62</sub>B<sub>63</sub>B<sub>64</sub></td>
+  </tr>
+  <tr>
+    <td>14</td>
+    <td>0xB<sub>57</sub>B<sub>58</sub>B<sub>59</sub>B<sub>60</sub></td>
+  </tr>
+  <tr>
+    <td>15</td>
+    <td>0xB<sub>53</sub>B<sub>54</sub>B<sub>55</sub>B<sub>56</sub></td>
+  </tr>
+  <tr>
+    <td>16</td>
+    <td>0xB<sub>49</sub>B<sub>50</sub>B<sub>51</sub>B<sub>52</sub></td>
+  </tr>
+</table>
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_csrng.h)
+
+## Register Table
+
+* [Register Table](../data/csrng.hjson#registers)
diff --git a/hw/ip/csrng/doc/theory_of_operation.md b/hw/ip/csrng/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..cae3656a6a0cf
--- /dev/null
+++ b/hw/ip/csrng/doc/theory_of_operation.md
@@ -0,0 +1,435 @@
+# Theory of Operation
+
+The CSRNG block has been constructed to follow the NIST recommendation for a DRBG mechanism based on block ciphers.
+Specifically, it is a CTR_DRBG that uses an approved block cipher algorithm in counter mode.
+As such, the block diagram below makes reference to hardware blocks that either directly or closely follow NIST descriptions for the equivalent functions.
+
+There are two major hardware interfaces: the application interface and the entropy request interface.
+The application interface, which is described in more detail later, is provided for an application to manage an `instance` in CSRNG.
+Once set up, the application interface user can request entropy bits to be generated, as well as other functions.
+The application interface supports up to 15 hardware interfaces, and one software interface.
+
+A walk through of how CSRNG generates entropy bits begins with the application interface.
+An `instantiate` command is issued from one of the application interfaces.
+This request moves into the `cmd_stage` block.
+Here the request is arbitrated between all of the `cmd_stage` blocks.
+The winner will get its command moved into the command dispatch logic.
+A common state machine will process all application interface commands in order of arbitration.
+At this point, some seed entropy may be required depending on the command and any flags.
+If needed, a request to the entropy source hardware interface will be made.
+This step can take milliseconds if seed entropy is not immediately available.
+Once all of the prerequisites have been collected, a CTR_DRBG command can be launched.
+This command will go into the `ctr_drbg_cmd` block.
+This `ctr_drbg_cmd` block uses two NIST-defined functions, the update and the `block_encrypt` functions.
+If the command is a generate, the `ctr_drbg_cmd` block will process the first half of the algorithm, and then pass it on to the `ctr_drbg_gen` block.
+Additionally, the `ctr_drbg_gen` block also uses the `update` block and the `block_encrypt` block.
+To keep resources to a minimum, both of these blocks have arbiters to allow sharing between the `ctr_drbg_cmd` and `ctr_drbg_gen` blocks.
+The command field called `ccmd` (for current command) is sent along the pipeline to not only identify the command, but is also reused as a routing tag for the arbiters to use when returning the block response.
+
+Once the command has traversed through all of the CTR_DRBG blocks, the result will eventually land into the `state_db` block.
+This block will hold the instance state for each application interface.
+The specific state information held in the instance is documented below.
+If the command was a `generate` command, the genbits data word will be returned to the requesting `cmd_stage` block.
+Finally, an `ack` response and status will be returned to the application interface once the command has been completely processed.
+
+
+## Block Diagram
+
+![CSRNG Block Diagram](../doc/csrng_blk_diag.svg)
+
+## Hardware Interfaces
+
+ * [Interface Tables](../data/csrng.hjson#interfaces)
+
+The table below lists other CSRNG signals.
+
+Signal                       | Direction        | Type                        | Description
+-----------------------------|------------------|-----------------------------|---------------
+`otp_en_csrng_sw_app_read_i` | `input `         | `otp_en_t `                 | An efuse that will enable firmware to access the NIST CTR_DRBG internal state and genbits through registers.
+`lc_hw_debug_en_i`           | `input`          | `lc_tx_t `                  | A life-cycle that will select which diversification value is used for xoring with the seed from ENTROPY_SRC.
+`entropy_src_hw_if_o`        | `output`         | `entropy_src_hw_if_req_t`   | Seed request made to the ENTROPY_SRC module.
+`entropy_src_hw_if_i`        | `input`          | `entropy_src_hw_if_rsp_t`   | Seed response from the ENTROPY_SRC module.
+`cs_aes_halt_i`              | `input`          | `cs_aes_halt_req_t`         | Request to CSRNG from ENTROPY_SRC to halt requests to the AES block for power leveling purposes.
+`cs_aes_halt_o`              | `output`         | `cs_aes_halt_rsp_t`         | Response from CSRNG to ENTROPY_SRC that all requests to AES block are halted.
+`csrng_cmd_i`                | `input`          | `csrng_req_t`               | Application interface request to CSRNG from an EDN block.
+`csrng_cmd_o`                | `output`         | `csrng_rsp_t`               | Application interface response from CSRNG to an EDN block.
+
+
+## Design Details
+
+#### Non-blocking Commands
+Regarding command processing, all commands process immediately except for the generate command.
+The command generate length count (`glen`) is kept in the `cmd_stage` block.
+When the `state_db` block issues an ack to the `cmd_stage` block, the `cmd_stage` block increments an internal counter.
+This process repeats until the `glen` field value has been matched.
+Because each request is pipelined, requests from other `cmd_stage` blocks can be processed before the original generate command is completely done.
+This provides some interleaving of commands since a generate command can be programmed to take a very long time.
+
+When sending an unsupported or illegal command, `CS_MAIN_SM_ALERT` will be triggered, but there will be no status response or indication of which app the error occurred in.
+
+#### Working State Values
+The CSRNG working state data base (`state_db`) contains the current working state for a given DRBG instance.
+It holds the following values:
+
+<table>
+<caption>Values stored by <tt>state_db</tt></caption>
+<thead>
+  <tr>
+    <th>Bits</th>
+    <th>Name</th>
+    <th>Description</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td>31:0</td>
+    <td>Reseed Counter</td>
+    <td> Value required and defined by NIST's SP 800-90A to be held in the state instance.
+    It keeps track of the number of pseudorandom bits requested since the last instantiation or reseeding.
+    </td>
+  </tr>
+  <tr>
+    <td>159:32</td>
+    <td>V</td>
+    <td> Value required and defined by NIST's SP 800-90A to be held in the state instance, and is of size <tt>BlkLen</tt>.
+    This value changes every time <tt>BlkLen</tt> bits of output are generated.
+    </td>
+  </tr>
+  <tr>
+    <td>415:160</td>
+    <td>Key</td>
+    <td> Value required and defined by NIST's SP 800-90A to be held in the state instance, and is of size <tt>KeyLen</tt>.
+    The key is changed after a predetermined number of blocks of output have been produced.
+    </td>
+  </tr>
+  <tr>
+    <td>416</td>
+    <td>Status</td>
+    <td> Set when instantiated.
+    </td>
+  </tr>
+  <tr>
+    <td>417</td>
+    <td>Compliance</td>
+    <td> Set when FIPS/CC compliant entropy was used to seed this instance.
+    </td>
+  </tr>
+</table>
+
+#### AES Cipher
+The `block_encrypt` block is where the `aes_cipher_core` block is located.
+This is the same block used in the AES design.
+Parameters are selected such that this is the unmasked version.
+
+#### Software Support
+The software application interface uses a set of TL-UL registers to send commands and receive generated bits.
+Since the registers are 32-bit words wide, some sequencing will need to be done by firmware to make this interface work properly.
+
+### Application Interface
+
+This section describes the application interface, which is required for performing any operations using a CSRNG instance (i.e. instantiation, reseeding, RNG generation, or uninstantiation).
+Each CSRNG instance corresponds to a unique application interface port, which implements the application interface described here.
+Any hardware peripherals which require complete control of an instance may connect directly to a dedicated interface port.
+Meanwhile peripherals without any special requirements (i.e. personalization strings or non-FIPS-approved, fully-deterministic number sequences) may share access to an instance via the entropy distribution network (EDN) IP.
+The EDNs manage the instantiation and reseeding of CSRNG instances for general use-cases, providing either on-demand or timed-delivery entropy streams to hardware peripherals.
+Firmware applications can obtain access to random bit sequences directly through application interface port 0, which is directly mapped to a set of TL-UL registers.
+
+The total number of application interface ports (for TL-UL, directly attached peripherals or EDN instances) is determined by the `NHwApp` parameter.
+
+The command bus operates like a FIFO, in which a command is pushed into the interface.
+An optional stream of additional data may follow, such as seed material for an `instantiate` application command.
+For the `generate` application command, the obfuscated entropy will be returned on the `genbits` bus.
+This bus also operates like a FIFO, and the receiving module can provide back pressure to the `genbits` bus.
+There is one instance of a firmware application interface, and it uses the TL-UL registers.
+For more details on how the application interface works, see the Theory of Operations section above.
+
+In general, users of the application interface are either firmware or some hardware module entity.
+For hardware, a module can either directly control the application interface, or it can connect to an EDN module.
+Attaching to an EDN module allows for a simpler interface connection to a more layout-friendly distributed-chip network.
+
+#### General Command Format
+
+The general format for the application interface is a 32-bit command header, optionally followed by additional data, such as a personalization string, typically twelve 32-bit words in length.
+Depending on the command, these strings are typically required to be 384-bits in length, to match the size of the seed-length when operating with 256-bit security-strength.
+The exact function of the additional data field depends on the command.
+However, in general, the additional data can be any length as specified by the command length field.
+The command header is defined below.
+
+#### Command Header
+The application interface requires that a 32-bit command header be provided to instruct the CSRNG how to manage the internal working states.
+Below is a description of the fields of this header:
+
+<table>
+<caption>Application Interface Command Header</caption>
+<thead>
+  <tr>
+    <th>Bits</th>
+    <th>Name</th>
+    <th>Description</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td>3:0</td>
+    <td>acmd</td>
+    <td> Application Command: Selects one of five operations to perform.
+         The commands supported are <tt>instantiate</tt>, <tt>reseed</tt>, <tt>generate</tt>, <tt>update</tt>, and <tt>uninstantiate</tt>.
+         Each application interface port used by peripheral hardware commands a unique instance number in CSRNG.
+    </td>
+  </tr>
+  <tr>
+    <td>7:4</td>
+    <td>clen</td>
+    <td> Command Length: Number of 32-bit words that can optionally be appended to the command.
+         A value of zero will only transfer the command header.
+         A value of <tt>4'hc</tt> will transfer the header plus an additional twelve 32-bit words of data.
+    </td>
+  </tr>
+  <tr>
+    <td>11:8</td>
+    <td>flag0</td>
+    <td> Command Flag0: flag0 is associated with current command.
+         Setting this field to kMultiBitBool4True will enable flag0.
+         Note that <tt>flag0</tt> is used for the <tt>instantiate</tt> and  <tt>reseed</tt> commands only, for all other commands its value is ignored.
+    </td>
+  </tr>
+  <tr>
+    <td>24:12</td>
+    <td>glen</td>
+    <td> Generate Length: Only defined for the generate command, this field is the total number of cryptographic entropy blocks requested.
+         Each unit represents 128 bits of entropy returned.
+         The NIST reference name is <tt>max_number_of_bit_per_request</tt>, and this field size supports the maximum size of 2<sup>19</sup> bits.
+         For the maximum size, this field should be set to 4096, resulting in a <tt>max_number_of_bit_per_request</tt> value of 4096 x 128 bits.
+         For a smaller example, a value of 8 would return a total of 1024 bits.
+    </td>
+  </tr>
+  <tr>
+    <td>31:25</td>
+    <td>resv</td>
+    <td> Unused and reserved.
+    </td>
+  </tr>
+</table>
+
+#### Command Description
+The command field of the application command header is described in detail in the table below.
+The actions performed by each command, as well as which flags are supported, are described in this table.
+
+<table>
+<caption>Application Interface Command Description</caption>
+<thead>
+  <tr>
+    <th>Command Name</th>
+    <th>Encoded Value</th>
+    <th>Description</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td>Instantiate</td>
+    <td>0x1</td>
+    <td> Initializes an instance in CSRNG.
+         When seeding, the following table describes how the seed is determined based on <tt>flag0</tt> and the <tt>clen</tt> field.
+         Note that the last table entry (<tt>flag0</tt> is set and <tt>clen</tt> is set to non-zero) is intended for known answer testing (KAT).
+        WARNING: Though <tt>flag0</tt> may be useful for generating fully-deterministic bit sequences, the use of this flag will render the instance non-FIPS compliant until it is re-instantiated.
+         When the <tt>Instantiate</tt> command is completed, the active bit in the CSRNG working state will be set.
+        <table>
+          <thead>
+            <tr><th>flag0</th><th>clen</th><th>Description</th></tr>
+          </thead>
+          <tbody>
+            <tr><td>0</td><td>0</td><td>Only entropy source seed is used.</td></tr>
+            <tr><td>0</td><td>1-12</td><td>Entropy source seed is xor'ed with provided additional data.</td></tr>
+            <tr><td>1</td><td>0</td><td>Seed of zero is used (no entropy source seed used).</td></tr>
+            <tr><td>1</td><td>1-12</td><td>Only provided additional data will be used as seed.</td></tr>
+          </tbody>
+        </table>
+    </td>
+  </tr>
+  <tr>
+    <td>Reseed</td>
+    <td>0x2</td>
+    <td> Reseeds an existing instance in CSRNG.
+         The <tt>flag0</tt> and <tt>clen</tt> table in the <tt>Instantiate</tt> command description above also applies to the <tt>Reseed</tt> command.
+         Note that the last table entry (<tt>flag0</tt> is set and <tt>clen</tt> is set to non-zero) is intended for known answer testing (KAT).
+         The <tt>Reseed</tt> command only takes in one group (a maximum of twelve 32 bit words) of generic additional data.
+         If both a seed and additional data must be provided to the <tt>Reseed</tt> command, the seed and additional data must be xor'ed first.
+         This scenario will then pass the NIST vector test requiring both a provided seed and additional data.
+    </td>
+  </tr>
+  <tr>
+    <td>Generate</td>
+    <td>0x3</td>
+    <td> Starts a request to CSRNG to generate cryptographic entropy bits.
+         The <tt>glen</tt> field defines how many 128-bit words are to be returned to the application interface.
+         The <tt>glen</tt> field needs to be a minimum value of one.
+         The NIST reference to the <tt>prediction_resistance_flag</tt> is not directly supported as a flag.
+         It is the responsibility of the calling application to reseed as needed before the <tt>Generate</tt> command to properly support prediction resistance.
+         Note that additional data is also supported when the <tt>clen</tt> field is set to non-zero.
+    </td>
+  </tr>
+  <tr>
+    <td>Update</td>
+    <td>0x4</td>
+    <td> Updates an existing instance in CSRNG.
+         This command does the same function as the <tt>Reseed</tt> command, except that:
+         <ol>
+         <li>only the additional data provided will be used in the update function (i.e. no physical entropy is gathered), and
+         <li>the <tt>Update</tt> command does not reset the reseed counter.
+         </ol>
+         When the <tt>Update</tt> command is completed, the results will be reflected in the CSRNG working state.
+    </td>
+  </tr>
+  <tr>
+    <td>Uninstantiate</td>
+    <td>0x5</td>
+    <td> Resets an instance in CSRNG.
+         Values in the instance are zeroed out.
+         When the <tt>Uninstantiate</tt> command is completed, the <tt>Status</tt> bit in the CSRNG working state will be cleared.
+         Uninstantiating an instance effectively resets it, clearing any errors that it may have encountered due to bad command syntax or entropy source failures.
+         Only a value of zero should be used for <tt>clen</tt>, since any additional data will be ignored.
+    </td>
+  </tr>
+  <tr>
+    <td>Reserved</td>
+    <td>0x0,0x6-0xf</td>
+    <td> Unused and reserved.
+    </td>
+  </tr>
+</table>
+
+#### Command Response
+
+Once a command has been completed, successfully or unsuccessfully, the CSRNG responds with a single cycle pulse on the `csrng_rsp_ack` signal associated with the same application interface port.
+If the command is successful, the `csrng_rsp_sts` signal will indicate the value 0 (`CSRNG_OK`) in the same cycle.
+Otherwise the application will receive the value 1 (`CSRNG_ERROR`) on the `csrng_rsp_sts` signal.
+A number of exception cases to be considered are enumerated in NIST SP 800-90A, and may include events such as:
+* Failure of the entropy source
+* Attempts to use an instance which has not been properly instantiated, or
+* Attempts to generate data when an instance has exceeded its maximum seed life.
+In such cases, a 32-bit exception message will be propagated to firmware via the `hw_exc_sts` register, and a `cs_hw_inst_exc` interrupt will be raised.
+
+#### Generated Bits (`genbits`) Interface
+
+In addition to the command response signals there is a bus for returning the generated bits.
+This 129-bit bus consists of 128-bits, `genbits_bus`, for the random bit sequence itself, along with a single bit flag, `genbits_fips`, indicating whether the bits were considered fully in accordance with FIPS standards.
+
+There are two cases when the sequence will not be FIPS compliant:
+- Early in the boot sequence, the `ENTROPY_SRC` generates a seed from the first 384 bits pulled from the noise source.
+This initial seed is tested to ensure some minimum quality for obfuscation use-cases, but this boot seed is not expected to be full-entropy nor do these health checks meet the 1024-bit requirement for start-up health checks required by NIST 800-90B.
+- If `flag0` is asserted during instantiation, the resulting DRBG instance will have a fully-deterministic seed, determined only by user input data.
+Such a seed will be created only using factory-entropy and will lack the physical-entropy required by NIST SP 800-90A, and thus this DRBG instance will not be FIPS compliant.
+
+#### Handshaking signals
+
+The application command signal `csrng_req_bus` is accompanied by a `csrng_req_valid` signal, which is asserted by the requester when the command is valid.
+CSRNG may stall incoming commands by de-asserting the `csrng_req_ready` signal.
+A command is considered received whenever both `csrng_req_valid` and `csrng_req_ready` are asserted in the same clock cycle.
+
+Likewise a requester must only consider data on the `genbits` bus to be valid when the `genbits_valid` signal is asserted, and should assert `genbits_ready` whenever it is ready to accept the `genbits` data.
+The `genbits` data is considered successfully transmitted whenever `genbits_valid` and `genbits_ready` are asserted in the same clock cycle.
+
+A requester must always be ready to receive `csrng_rsp_sts` signals.
+(There is no "ready" signal for command response messages sent to hardware.)
+
+#### Waveforms
+
+##### Application Interface: Instantiate Request
+
+```wavejson
+{signal: [
+   {name: 'clk'             , wave: 'p...............|.....'},
+   {name: 'csrng_req_valid' , wave: '01............0.|.....'},
+   {name: 'csrng_req_ready' , wave: '1.............0.|..1..'},
+   {name: 'csrng_req_bus'   , wave: 'x5333333333333x.|.....',data: ['ins','sd1','sd2','sd3','sd4','sd5','sd6','sd7','sd8','sd9','sd10','sd11','sd12']},
+   {name: 'csrng_rsp_ack'   , wave: '0...............|.10..'},
+   {name: 'csrng_rsp_sts'   , wave: 'x...............|.5x..', data: ['ok']},
+ {},
+]}
+```
+
+##### Application Interface:  Reseed Request
+
+```wavejson
+{signal: [
+   {name: 'clk'             , wave: 'p...............|.....'},
+   {name: 'csrng_req_valid' , wave: '01............0.|.....'},
+   {name: 'csrng_req_ready' , wave: '1.............0.|..1..'},
+   {name: 'csrng_req_bus'   , wave: 'x5333333333333x.|.....',data: ['res','ad1','ad2','ad3','ad4','ad5','ad6','ad7','ad8','ad9','ad10','ad11','ad12']},
+   {name: 'csrng_rsp_ack'   , wave: '0...............|.10..'},
+   {name: 'csrng_rsp_sts'   , wave: 'x...............|.5x..', data: ['ok']},
+ {},
+]}
+```
+
+##### Application Interface:  Generate Request
+
+```wavejson
+{signal: [
+   {name: 'clk'              , wave: 'p...|...|....|....|...'},
+   {name: 'csrng_req_valid'  , wave: '010.|...|....|....|...'},
+   {name: 'csrng_req_ready'  , wave: '1...|...|....|....|...'},
+   {name: 'csrng_req_bus'    , wave: 'x5x.|...|....|....|...',data: ['gen']},
+   {name: 'csrng_rsp_ack'    , wave: '0...|...|....|....|.10'},
+   {name: 'csrng_rsp_sts'    , wave: 'x...|...|....|....|.5x', data: ['ok']},
+   {name: 'genbits_valid'    , wave: '0...|.10|.1.0|.10.|...'},
+   {name: 'csrng_rsp_fips'   , wave: '0...|.10|.1.0|.10.|...'},
+   {name: 'genbits_bus'      , wave: '0...|.40|.4.0|.40.|...', data: ['bits0','bits1','bits2']},
+   {name: 'genbits_ready'    , wave: '1...|...|0.1.|........'},
+]}
+```
+
+##### Application Interface:  Update Request
+
+```wavejson
+{signal: [
+   {name: 'clk'             , wave: 'p...............|.....'},
+   {name: 'csrng_req_valid' , wave: '01............0.|.....'},
+   {name: 'csrng_req_ready' , wave: '1.............0.|..1..'},
+   {name: 'csrng_req_bus'   , wave: 'x5333333333333x.|.....',data: ['upd','ad1','ad2','ad3','ad4','ad5','ad6','ad7','ad8','ad9','ad10','ad11','ad12']},
+   {name: 'csrng_rsp_ack'   , wave: '0...............|.10..'},
+   {name: 'csrng_rsp_sts'   , wave: 'x...............|.5x..', data: ['ok']},
+ {},
+]}
+```
+
+##### Application Interface:  Uninstantiate Request
+
+```wavejson
+{signal: [
+   {name: 'clk'             , wave: 'p...............|.....'},
+   {name: 'csrng_req_valid' , wave: '010.............|.....'},
+   {name: 'csrng_req_ready' , wave: '1.0.............|..1..'},
+   {name: 'csrng_req_bus'   , wave: 'x5x.............|.....',data: ['uni']},
+   {name: 'csrng_rsp_ack'   , wave: '0...............|.10..'},
+   {name: 'csrng_rsp_sts'   , wave: 'x...............|.5x..', data: ['ok']},
+ {},
+]}
+```
+
+
+##### Entropy Source Hardware Interface
+The following waveform shows an example of how the entropy source hardware interface works.
+
+
+```wavejson
+{signal: [
+   {name: 'clk'           , wave: 'p...|.........|.......'},
+   {name: 'es_req'        , wave: '0..1|..01.0..1|.....0.'},
+   {name: 'es_ack'        , wave: '0...|.10.10...|....10.'},
+   {name: 'es_bus[383:0]' , wave: '0...|.30.30...|....30.', data: ['es0','es1','es2']},
+   {name: 'es_fips'       , wave: '0...|....10...|....10.'},
+ {},
+]}
+```
+
+### Interrupts
+
+The `cs_cmd_req_done` interrupt will assert when a CSRNG command has been completed.
+
+The `cs_entropy_req` interrupt will assert when CSRNG requests entropy from ENTROPY_SRC.
+
+The `cs_hw_inst_exc` interrupt will assert when any of the hardware-controlled CSRNG instances encounters an exception while executing a command, either due to errors on the command sequencing, or an exception within the `ENTROPY_SRC` IP.
+
+The `cs_fatal_err` interrupt will assert when any of the CSRNG FIFOs has a malfunction.
+The conditions that cause this to happen are either when there is a push to a full FIFO or a pull from an empty FIFO.
diff --git a/hw/ip/edn/README.md b/hw/ip/edn/README.md
index 184e6691cb723..3fb6de3d83afa 100644
--- a/hw/ip/edn/README.md
+++ b/hw/ip/edn/README.md
@@ -86,192 +86,3 @@ The `entropy_src` only supports one connection to a CSRNG, but the CSRNG has mul
 The diagram below shows an example topology where two EDN modules are used to distribute `genbits` from the CSRNG to peripheral modules.
 
 ![EDN Example Topology Diagram](./doc/edn_top_diag.svg)
-
-# Theory of Operations
-
-The EDN is for distributing random number streams to hardware blocks, via peripheral ports on on the EDN.
-Each block connected to a peripheral port is referred to as an endpoint.
-
-To enable the EDN block, set the `EDN_ENABLE` field in the [`CTRL`](data/edn.hjson#ctrl) register..
-
-## Interaction with CSRNG Application Interface Ports
-
-The CSRNG application interface implements the "function envelopes" recommended by [NIST SP 800-90A](https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf) for random number generation, and these function envelopes establish certain requirements for the order of operations.
-For instance, the application interface port must receive an explicit `instantiate` command before receiving any `generate` commands.
-The sequences of commands generated by a particular EDN are either controlled by the EDN state machine or by commands forwarded from firmware through the [`SW_CMD_REQ`](data/edn.hjson#sw_cmd_req) register.
-
-Whenever commands are directly forwarded from firmware to the CSRNG through the [`SW_CMD_REQ`](data/edn.hjson#sw_cmd_req) register, firmware must poll and clear the `CMD_ACK` bit of the [`SW_CMD_STS`](data/edn.hjson#sw_cmd_sts) register before sending any further commands.
-
-Note that CSRNG commands are to be written into the [`SW_CMD_REQ`](data/edn.hjson#sw_cmd_req), [`RESEED_CMD`](data/edn.hjson#reseed_cmd), and [`GENERATE_CMD`](data/edn.hjson#generate_cmd) registers.
-CSRNG command format details can be found in [CSRNG](../csrng/README.md).
-
-There are two broad modes for state machine control: auto request mode and boot-time request mode.
-
-### Boot-time Request Mode
-
-Random values are needed by peripherals almost immediately after reset, so to simplify interactions with the boot ROM, boot-time request mode is the default mode.
-
-In boot-time request mode, the command sequence is fully hardware-controlled and no command customization is possible.
-In this mode, the EDN automatically issues a special reduced-latency `instantiate` command followed by the default `generate` commands.
-This means, for instance, that no personalization strings or additional data may be passed to the CSRNG application interface port in this mode.
-On exiting, the EDN issues an `uninstantiate` command to destroy the associated CSRNG instance.
-
-Once firmware initialization is complete, it is important to exit this mode if the endpoints ever need FIPS-approved random values.
-This is done by either *clearing* the `EDN_ENABLE` field or *clearing* the `BOOT_REQ_MODE` field in [`CTRL`](data/edn.hjson#ctrl) to halt the boot-time request state machine.
-Firmware must then wait for successful the shutdown of the state machine by polling the `REQ_MODE_SM_STS` field of the [`SUM_STS`](data/edn.hjson#sum_sts) register.
-
-It should be noted that when in boot-time request mode, no status will be updated that is used for the software port operation.
-If some hang condition were to occur when in this mode, the main state machine debug register should be read to determine if a hang condition is present.
-There is a limit to how much entropy can be requested in the boot-time request mode BOOT_GEN_CMD command (GLEN = 4K).
-It is the responsibility of software to switch to the software mode of operation before the command has completed.
-If the BOOT_GEN_CMD command ends while an endpoint is requesting, EDN will never ack and the endpoint bus will hang.
-
-#### Note on Security Considerations when Using Boot-time Request Mode
-
-Boot-time request mode is not intended for normal operation, as it tolerates the potential use of preliminary seeds for the attached CSRNG instance.
-These preliminary seeds are described as "pre-FIPS" since they are released from the `entropy_src` before the complete start-up health-checks recommended by FIPS have been completed.
-Thus pre-FIPS seeds have weaker guarantees on the amount of physical entropy included in their creation.
-As detailed in the [`entropy_src` documentation](../entropy_src/README.md), only the first CSRNG seed created after reset is pre-FIPS.
-All following seeds from the `entropy_src` are passed through the full FIPS-approved health checks.
-Therefore at most one EDN can receive a pre-FIPS seed after reset.
-Since boot-time request mode EDN streams may be FIPS non-compliant, firmware must at some point disable boot-time request mode and reinitialize the EDN for either firmware-driven operation or auto request mode.
-
-#### Multiple EDNs in Boot-time Request Mode
-
-If many endpoints require boot-time entropy multiple boot-time EDNs may be required, as the EDN has a fixed maximum number of peripheral ports.
-Since physical entropy generation takes time, there exists a mechanism to prioritize the EDNs, to match the boot priority of each group of attached endpoints.
-To establish an order to the instantiation of each EDN, enable them one at a time.
-To ensure that the most recently enabled EDN will get next priority for physical entropy, poll the `BOOT_INST_ACK` field in the [`SUM_STS`](data/edn.hjson#sum_sts) register before enabling the following EDN.
-
-If using boot-time request mode, the CSRNG seed material used for the first-activated EDN is the special pre-FIPS seed, which is specifically tested quickly to improve latency.
-The first random values distributed from this EDN will therefore be available roughly 2ms after reset.
-The `entropy_src` only creates one pre-FIPS seed, so any other EDNs must wait for their seeds to pass the full FIPS-recommended health checks.
-This means that each subsequent EDN must wait an additional 5ms before it can start distributing data.
-For instance, if there are three boot-time request mode EDN's in the system, the first will start distributing data 2ms after reset, the second will start distributing data 7ms after reset, and the third will start distributing data 12ms after reset.
-
-### Auto Request Mode
-
-Before entering auto request mode, it is the responsibility of firmware to first generate an `instantiate` command for the EDN-associated instance via the [`SW_CMD_REQ`](data/edn.hjson#sw_cmd_req) register.
-The required `generate` and `reseed` commands must also be custom generated by firmware and loaded into the respective command replay FIFOs via the [`GENERATE_CMD`](data/edn.hjson#generate_cmd) and [`RESEED_CMD`](data/edn.hjson#reseed_cmd) registers.
-These `generate` commands will be issued as necessary to meet the bandwidth requirements of the endpoints.
-The `reseed` commands will be issued once every `MAX_NUM_REQS_BETWEEN_RESEEDS` generate requests.
-For details on the options for application interface commands please see the [CSRNG IP Documentation](../csrng/README.md).
-Once the CSRNG instance has been instantiated, and the `generate` and `reseed` commands have been loaded, auto request mode can be entered by programming the [`CTRL`](data/edn.hjson#ctrl) register with `EDN_ENABLE` and `AUTO_REQ_MODE` fields are enabled.
-Note that if BOOT_REQ_MODE is asserted the state machine will enter boot-time request mode, even if AUTO_REQ_MODE is asserted.
-
-To issue any new commands other than those stored in the generate or reseed FIFOs, it is important to disable auto request mode, by deasserting the `AUTO_REQ_MODE` field in the [`CTRL`](data/edn.hjson#ctrl) register.
-Firmware must then wait until the current command is completed by polling the [`MAIN_SM_STATE`](data/edn.hjson#main_sm_state) register.
-Once the state machine returns to the `Idle` or `SWPortMode` states, new firmware-driven commands can be passed to the CSRNG via the [`SW_CMD_REQ`](data/edn.hjson#sw_cmd_req) register.
-
-It should be noted that when in auto request mode, no status will be updated that is used for the software port operation once the `instantiate` command has completed.
-If some hang condition were to occur when in this mode, the main state machine debug register should be read to determine if a hang condition is present.
-
-### Note on State Machine Shutdown Delays
-
-When leaving boot-time request mode or auto request mode, the EDN state machine waits for completion of the last command, before sending a shutdown acknowledgement to firmware.
-The longest possible commands are the `instantiate` or `reseed` requests, which typically take about 5ms, due to the time required to gather the necessary physical entropy.
-By contrast, the largest possible `generate` command allowed by [NIST SP 800-90A](https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf) is for 2<sup>19</sup> bits (or 4096 AES codewords).
-Assuming an AES encryption delay of 16 clocks, and a 100 MHz clock frequency, the longest allowable `generate` command would take only 0.7 ms to complete.
-
-### Note on Sharing of CSRNG Instance State Variables
-
-Once an application interface port has received an `instantiate` command, that application interface port then has access to a unique CSRNG instance, which is shared by all endpoints on the same EDN.
-Therefore from a security perspective, an attack to that particular CSRNG instance is an attack on all the endpoints that share the same EDN.
-Meanwhile, seeds and other state variables specific to a particular CSRNG instance are not shared between endpoints on *separate* EDN instances, or with any hardware devices with direct connections to dedicated CSRNG application interface ports.
-
-## Interactions with Peripheral Devices
-
-Peripheral ports distribute data to the endpoints using four signals: `req`, `ack`, `bus`, and `fips`.
-
-Fresh (i.e. previously unseen) random values are distributed to the endpoints via the 32 bit `bus` signal, in response to a `req` signal.
-Whenever new values are placed on the `bus`, the `ack` is asserted until the values are consumed by the endpoint, as indicated by simultaneous assertion of the `req` and `ack` signals in the same cycle.
-Otherwise `ack` is deasserted until enough fresh bits are received from CSRNG.
-The bus data will persist on the bus until a new `req` is asserted.
-This persistence will allow an asynchronous endpoint to capture the correct data sometime after the `ack` de-asserts.
-
-The `fips` signal is used to identify whether the values received on the `bus` have been prepared with complete adherence to the recommendations in NIST SP 800-90.
-If the `fips` signal is deasserted, it means the associated CSRNG instance has been instantiated with a pre-FIPS seed.
-
-## Block Diagram
-
-![EDN Block Diagram](./doc/edn_blk_diag.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](data/edn.hjson#interfaces)
-
-## Design Details
-
-### EDN Initialization
-
-After power-up, the EDN block is disabled.
-A single TL-UL configuration write to the  [`CTRL`](data/edn.hjson#ctrl) register will start random-number streams processing in boot-time request mode.
-CSRNG application commands will be sent immediately.
-Once these commands have completed, a status bit will be set.
-At this point, firmware can later come and reconfigure the EDN block for a different mode of operation.
-
-The recommended write sequence for the entire entropy system is one configuration write to ENTROPY_SRC, then CSRNG, and finally to EDN (also see [Module enable and disable](#enable-disable)).
-
-### Interrupts
-
-The EDN module has two interrupts: `edn_cmd_req_done` and `edn_fatal_err`.
-
-The `edn_cmd_req_done` interrupt should be used when a CSRNG command is issued and firmware is waiting for completion.
-
-The `edn_fatal_err` interrupt will fire when a fatal error has been detected.
-The conditions that cause this to happen are FIFO error, a state machine error state transition, or a prim_count error.
-
-#### Waveforms
-
-See the [CSRNG IP](../csrng/README.md) waveform section for the CSRNG application interface commands.
-
-##### Peripheral Hardware Interface - Req/Ack
-The following waveform shows an example of how the peripheral hardware interface works.
-This example shows the case where the boot-time mode in the ENTROPY_SRC block is enabled.
-This example also shows the case where the next request will change the prior data by popping the data FIFO.
-
-```wavejson
-{signal: [
-   {name: 'clk'           , wave: 'p...|...........|......'},
-   {name: 'edn_enable'    , wave: '01..|...........|......'},
-   {name: 'edn_req'       , wave: '0..1|..0..1.0...|1.0...'},
-   {name: 'edn_ack'       , wave: '0...|.10...10...|.10...'},
-   {name: 'edn_bus[31:0]' , wave: '0...|3....3.....|3.....', data: ['es0','es1','es2']},
-   {name: 'edn_fips'      , wave: '0...|...........|......'},
- {},
-]}
-```
-
-# Programmers Guide
-
-## Initialization
-
-The following code snippet demonstrates initializing the EDN block.
-
-```cpp
-
-void edn_init(unsigned int enable) {
-
-  // set the control register enable bit
-  *CTRL_REG = enable; // should be 0x1 by default
-
-  // the EDN interrupts can optionally be enabled
-}
-```
-
-## Module enable and disable {#enable-disable}
-
-EDN may only be enabled if CSRNG is enabled.
-Once disabled, EDN may only be re-enabled after CSRNG has been disabled and re-enabled.
-
-## Error conditions
-
-Need to alert the system of a FIFO overflow condition.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_edn.h)
-
-## Register Table
-
-* [Register Table](data/edn.hjson#registers)
diff --git a/hw/ip/edn/doc/programmers_guide.md b/hw/ip/edn/doc/programmers_guide.md
new file mode 100644
index 0000000000000..7befcfc0301ad
--- /dev/null
+++ b/hw/ip/edn/doc/programmers_guide.md
@@ -0,0 +1,33 @@
+# Programmer's Guide
+
+## Initialization
+
+The following code snippet demonstrates initializing the EDN block.
+
+```cpp
+
+void edn_init(unsigned int enable) {
+
+  // set the control register enable bit
+  *CTRL_REG = enable; // should be 0x1 by default
+
+  // the EDN interrupts can optionally be enabled
+}
+```
+
+## Module enable and disable {#enable-disable}
+
+EDN may only be enabled if CSRNG is enabled.
+Once disabled, EDN may only be re-enabled after CSRNG has been disabled and re-enabled.
+
+## Error conditions
+
+Need to alert the system of a FIFO overflow condition.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_edn.h)
+
+## Register Table
+
+* [Register Table](../data/edn.hjson#registers)
diff --git a/hw/ip/edn/doc/theory_of_operation.md b/hw/ip/edn/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..d304b0081640f
--- /dev/null
+++ b/hw/ip/edn/doc/theory_of_operation.md
@@ -0,0 +1,154 @@
+# Theory of Operation
+
+The EDN is for distributing random number streams to hardware blocks, via peripheral ports on the EDN.
+Each block connected to a peripheral port is referred to as an endpoint.
+
+To enable the EDN block, set the `EDN_ENABLE` field in the [`CTRL`](../data/edn.hjson#ctrl) register.
+
+## Interaction with CSRNG Application Interface Ports
+
+The CSRNG application interface implements the "function envelopes" recommended by [NIST SP 800-90A](https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf) for random number generation, and these function envelopes establish certain requirements for the order of operations.
+For instance, the application interface port must receive an explicit `instantiate` command before receiving any `generate` commands.
+The sequences of commands generated by a particular EDN are either controlled by the EDN state machine or by commands forwarded from firmware through the [`SW_CMD_REQ`](../data/edn.hjson#sw_cmd_req) register.
+
+Whenever commands are directly forwarded from firmware to the CSRNG through the [`SW_CMD_REQ`](../data/edn.hjson#sw_cmd_req) register, firmware must poll and clear the `CMD_ACK` bit of the [`SW_CMD_STS`](../data/edn.hjson#sw_cmd_sts) register before sending any further commands.
+
+Note that CSRNG commands are to be written into the [`SW_CMD_REQ`](../data/edn.hjson#sw_cmd_req), [`RESEED_CMD`](../data/edn.hjson#reseed_cmd), and [`GENERATE_CMD`](../data/edn.hjson#generate_cmd) registers.
+CSRNG command format details can be found in [CSRNG](../../csrng/README.md).
+
+There are two broad modes for state machine control: auto request mode and boot-time request mode.
+
+### Boot-time Request Mode
+
+Random values are needed by peripherals almost immediately after reset, so to simplify interactions with the boot ROM, boot-time request mode is the default mode.
+
+In boot-time request mode, the command sequence is fully hardware-controlled and no command customization is possible.
+In this mode, the EDN automatically issues a special reduced-latency `instantiate` command followed by the default `generate` commands.
+This means, for instance, that no personalization strings or additional data may be passed to the CSRNG application interface port in this mode.
+On exiting, the EDN issues an `uninstantiate` command to destroy the associated CSRNG instance.
+
+Once firmware initialization is complete, it is important to exit this mode if the endpoints ever need FIPS-approved random values.
+This is done by either *clearing* the `EDN_ENABLE` field or *clearing* the `BOOT_REQ_MODE` field in [`CTRL`](../data/edn.hjson#ctrl) to halt the boot-time request state machine.
+Firmware must then wait for the successful shutdown of the state machine by polling the `REQ_MODE_SM_STS` field of the [`SUM_STS`](../data/edn.hjson#sum_sts) register.
+
+It should be noted that when in boot-time request mode, no status will be updated that is used for the software port operation.
+If some hang condition were to occur when in this mode, the main state machine debug register should be read to determine if a hang condition is present.
+There is a limit to how much entropy can be requested in the boot-time request mode BOOT_GEN_CMD command (GLEN = 4K).
+It is the responsibility of software to switch to the software mode of operation before the command has completed.
+If the BOOT_GEN_CMD command ends while an endpoint is requesting, EDN will never ack and the endpoint bus will hang.
+
+#### Note on Security Considerations when Using Boot-time Request Mode
+
+Boot-time request mode is not intended for normal operation, as it tolerates the potential use of preliminary seeds for the attached CSRNG instance.
+These preliminary seeds are described as "pre-FIPS" since they are released from the `entropy_src` before the complete start-up health-checks recommended by FIPS have been completed.
+Thus pre-FIPS seeds have weaker guarantees on the amount of physical entropy included in their creation.
+As detailed in the [`entropy_src` documentation](../../entropy_src/README.md), only the first CSRNG seed created after reset is pre-FIPS.
+All following seeds from the `entropy_src` are passed through the full FIPS-approved health checks.
+Therefore at most one EDN can receive a pre-FIPS seed after reset.
+Since boot-time request mode EDN streams may be FIPS non-compliant, firmware must at some point disable boot-time request mode and reinitialize the EDN for either firmware-driven operation or auto request mode.
+
+#### Multiple EDNs in Boot-time Request Mode
+
+If many endpoints require boot-time entropy, multiple boot-time EDNs may be required, as the EDN has a fixed maximum number of peripheral ports.
+Since physical entropy generation takes time, there exists a mechanism to prioritize the EDNs, to match the boot priority of each group of attached endpoints.
+To establish an order to the instantiation of each EDN, enable them one at a time.
+To ensure that the most recently enabled EDN will get next priority for physical entropy, poll the `BOOT_INST_ACK` field in the [`SUM_STS`](../data/edn.hjson#sum_sts) register before enabling the following EDN.
+
+If using boot-time request mode, the CSRNG seed material used for the first-activated EDN is the special pre-FIPS seed, which is specifically tested quickly to improve latency.
+The first random values distributed from this EDN will therefore be available roughly 2ms after reset.
+The `entropy_src` only creates one pre-FIPS seed, so any other EDNs must wait for their seeds to pass the full FIPS-recommended health checks.
+This means that each subsequent EDN must wait an additional 5ms before it can start distributing data.
+For instance, if there are three boot-time request mode EDNs in the system, the first will start distributing data 2ms after reset, the second will start distributing data 7ms after reset, and the third will start distributing data 12ms after reset.
+
+### Auto Request Mode
+
+Before entering auto request mode, it is the responsibility of firmware to first generate an `instantiate` command for the EDN-associated instance via the [`SW_CMD_REQ`](../data/edn.hjson#sw_cmd_req) register.
+The required `generate` and `reseed` commands must also be custom generated by firmware and loaded into the respective command replay FIFOs via the [`GENERATE_CMD`](../data/edn.hjson#generate_cmd) and [`RESEED_CMD`](../data/edn.hjson#reseed_cmd) registers.
+These `generate` commands will be issued as necessary to meet the bandwidth requirements of the endpoints.
+The `reseed` commands will be issued once every `MAX_NUM_REQS_BETWEEN_RESEEDS` generate requests.
+For details on the options for application interface commands please see the [CSRNG IP Documentation](../../csrng/README.md).
+Once the CSRNG instance has been instantiated, and the `generate` and `reseed` commands have been loaded, auto request mode can be entered by programming the [`CTRL`](../data/edn.hjson#ctrl) register so that the `EDN_ENABLE` and `AUTO_REQ_MODE` fields are enabled.
+Note that if BOOT_REQ_MODE is asserted the state machine will enter boot-time request mode, even if AUTO_REQ_MODE is asserted.
+
+To issue any new commands other than those stored in the generate or reseed FIFOs, it is important to disable auto request mode, by deasserting the `AUTO_REQ_MODE` field in the [`CTRL`](../data/edn.hjson#ctrl) register.
+Firmware must then wait until the current command is completed by polling the [`MAIN_SM_STATE`](../data/edn.hjson#main_sm_state) register.
+Once the state machine returns to the `Idle` or `SWPortMode` states, new firmware-driven commands can be passed to the CSRNG via the [`SW_CMD_REQ`](../data/edn.hjson#sw_cmd_req) register.
+
+It should be noted that when in auto request mode, no status will be updated that is used for the software port operation once the `instantiate` command has completed.
+If some hang condition were to occur when in this mode, the main state machine debug register should be read to determine if a hang condition is present.
+
+### Note on State Machine Shutdown Delays
+
+When leaving boot-time request mode or auto request mode, the EDN state machine waits for completion of the last command, before sending a shutdown acknowledgement to firmware.
+The longest possible commands are the `instantiate` or `reseed` requests, which typically take about 5ms, due to the time required to gather the necessary physical entropy.
+By contrast, the largest possible `generate` command allowed by [NIST SP 800-90A](https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf) is for 2<sup>19</sup> bits (or 4096 AES codewords).
+Assuming an AES encryption delay of 16 clocks, and a 100 MHz clock frequency, the longest allowable `generate` command would take only 0.7 ms to complete.
+
+### Note on Sharing of CSRNG Instance State Variables
+
+Once an application interface port has received an `instantiate` command, that application interface port then has access to a unique CSRNG instance, which is shared by all endpoints on the same EDN.
+Therefore, from a security perspective, an attack on that particular CSRNG instance is an attack on all the endpoints that share the same EDN.
+Meanwhile, seeds and other state variables specific to a particular CSRNG instance are not shared between endpoints on *separate* EDN instances, or with any hardware devices with direct connections to dedicated CSRNG application interface ports.
+
+## Interactions with Peripheral Devices
+
+Peripheral ports distribute data to the endpoints using four signals: `req`, `ack`, `bus`, and `fips`.
+
+Fresh (i.e. previously unseen) random values are distributed to the endpoints via the 32 bit `bus` signal, in response to a `req` signal.
+Whenever new values are placed on the `bus`, the `ack` is asserted until the values are consumed by the endpoint, as indicated by simultaneous assertion of the `req` and `ack` signals in the same cycle.
+Otherwise `ack` is deasserted until enough fresh bits are received from CSRNG.
+The bus data will persist on the bus until a new `req` is asserted.
+This persistence will allow an asynchronous endpoint to capture the correct data sometime after the `ack` de-asserts.
+
+The `fips` signal is used to identify whether the values received on the `bus` have been prepared with complete adherence to the recommendations in NIST SP 800-90.
+If the `fips` signal is deasserted, it means the associated CSRNG instance has been instantiated with a pre-FIPS seed.
+
+## Block Diagram
+
+![EDN Block Diagram](../doc/edn_blk_diag.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/edn.hjson#interfaces)
+
+## Design Details
+
+### EDN Initialization
+
+After power-up, the EDN block is disabled.
+A single TL-UL configuration write to the  [`CTRL`](../data/edn.hjson#ctrl) register will start random-number streams processing in boot-time request mode.
+CSRNG application commands will be sent immediately.
+Once these commands have completed, a status bit will be set.
+At this point, firmware can later come and reconfigure the EDN block for a different mode of operation.
+
+The recommended write sequence for the entire entropy system is one configuration write to ENTROPY_SRC, then CSRNG, and finally to EDN (also see [Module enable and disable](programmers_guide.md#enable-disable)).
+
+### Interrupts
+
+The EDN module has two interrupts: `edn_cmd_req_done` and `edn_fatal_err`.
+
+The `edn_cmd_req_done` interrupt should be used when a CSRNG command is issued and firmware is waiting for completion.
+
+The `edn_fatal_err` interrupt will fire when a fatal error has been detected.
+The conditions that cause this to happen are FIFO error, a state machine error state transition, or a prim_count error.
+
+#### Waveforms
+
+See the [CSRNG IP](../../csrng/README.md) waveform section for the CSRNG application interface commands.
+
+##### Peripheral Hardware Interface - Req/Ack
+The following waveform shows an example of how the peripheral hardware interface works.
+This example shows the case where the boot-time mode in the ENTROPY_SRC block is enabled.
+This example also shows the case where the next request will change the prior data by popping the data FIFO.
+
+```wavejson
+{signal: [
+   {name: 'clk'           , wave: 'p...|...........|......'},
+   {name: 'edn_enable'    , wave: '01..|...........|......'},
+   {name: 'edn_req'       , wave: '0..1|..0..1.0...|1.0...'},
+   {name: 'edn_ack'       , wave: '0...|.10...10...|.10...'},
+   {name: 'edn_bus[31:0]' , wave: '0...|3....3.....|3.....', data: ['es0','es1','es2']},
+   {name: 'edn_fips'      , wave: '0...|...........|......'},
+ {},
+]}
+```
diff --git a/hw/ip/entropy_src/README.md b/hw/ip/entropy_src/README.md
index ea76c92361f37..c7a8cfb770363 100644
--- a/hw/ip/entropy_src/README.md
+++ b/hw/ip/entropy_src/README.md
@@ -89,355 +89,3 @@ This feature can also be disabled for security purposes, either by locking the f
 ## Compatibility
 This IP block does not have any direct hardware compatibility requirements.
 However, the general design of this block follows the overall NIST recommendations, as described by SP 800-90B.
-
-# Theory of Operations
-
-As already described, this IP block will collect bits of entropy for firmware or hardware consumption.
-This revision supports only an external interface for a PTRNG noise source implementation.
-
-The first step is initialization and enabling.
-The PTRNG noise source mode is selected when the `ENABLE` field will be set.
-After the block is enabled and initialized, entropy bits will be collected up indefinitely until disabled.
-
-
-After a reset, the ENTROPY_SRC block will start up in boot-time mode by default.
-This feature is designed to provide an initial seed's worth of entropy with lower latency than the normal FIPS/CC compliant health check process.
-Health testing will still be performed on boot-time mode entropy, but the window of checking is, by default, 384 bits instead of 2048 bits.
-When entropy is delivered to the downstream hardware block, a signal will indicate what type of entropy it is - FIPS compliant or not.
-Boot-time mode can be completely disabled in the [`CONF`](data/entropy_src.hjson#conf) register.
-
-Once the initial boot-time mode phase has completed, the ENTROPY_SRC block will switch to FIPS compliant mode.
-In this mode, once the raw entropy has been health checked, it will be passed into a conditioner block.
-This block will compress the bits such that the entropy bits/physical bits, or min-entropy value, should be improved over the raw data source min-entropy value.
-The compression operation, by default, will compress every 2048 tested bits into 384 full-entropy bits.
-
-The hardware conditioning can also be bypassed and replaced in normal operation with a firmware-defined conditioning algorithm.
-This firmware conditioning algorithm can be disabled on boot for security purposes.
-
-The firmware override function has the capability to completely override the hardware health tests and the conditioner paths.
-In the case of health tests, firmware can turn off one or all of the health tests and perform the tests in firmware.
-A data path is provided in the hardware such that the inbound entropy can be trapped in the pre-conditioner FIFO.
-Once a pre-determined threshold of entropy has been reached in this FIFO, the firmware can then read the entropy bits out of the FIFO.
-The exact mechanism for this functionality starts with setting the `FW_OV_MODE` field in the [`FW_OV_CONTROL`](data/entropy_src.hjson#fw_ov_control) register.
-This will enable firmware to monitor post-health test entropy bits by reading from the [`FW_OV_RD_DATA`](data/entropy_src.hjson#fw_ov_rd_data) register.
-Firmware can use the [`OBSERVE_FIFO_THRESH`](data/entropy_src.hjson#observe_fifo_thresh) and  [`OBSERVE_FIFO_DEPTH`](data/entropy_src.hjson#observe_fifo_depth) to determine the state of the OBSERVE FIFO.
-At this point, firmware can do additional health checks on the entropy.
-Optionally, firmware can do the conditioning function, assuming the hardware is configured to bypass the conditioner block.
-Once firmware has processed the entropy,  it can then write the results back into the [`FW_OV_WR_DATA`](data/entropy_src.hjson#fw_ov_wr_data) register (pre-conditioner FIFO).
-The `FW_OV_ENTROPY_INSERT` in the [`FW_OV_CONTROL`](data/entropy_src.hjson#fw_ov_control) register will enable inserting entropy bits back into the entropy flow.
-The firmware override control fields will be set such that the new entropy will resume normal flow operation.
-
-An additional feature of the firmware override function is to insert entropy bits into the flow and still use the conditioning function in the hardware.
-Setting the `FW_OV_INSERT_START` field in the [`FW_OV_SHA3_START`](data/entropy_src.hjson#fw_ov_sha3_start) register will prepare the hardware for this flow.
-Once this field is set true, the [`FW_OV_WR_DATA`](data/entropy_src.hjson#fw_ov_wr_data) register can be written with entropy bits.
-The [`FW_OV_WR_FIFO_FULL`](data/entropy_src.hjson#fw_ov_wr_fifo_full) register should be monitored after each write to ensure data is not dropped.
-Once all of the data has been written, the `FW_OV_INSERT_START` field should be set to false.
-The normal SHA3 processing will continue and finally push the conditioned entropy through the module.
-
-Health checks are performed on the input raw data from the PTRNG noise source when in that mode.
-There are four health tests that will be performed: repetitive count, adaptive proportion, bucket, and Markov tests.
-Each test has a pair of threshold values that determine that pass/fail of the test, one threshold for boot-time mode, and one for normal mode.
-By default, all tests are enabled, but can be turn off in the [`CONF`](data/entropy_src.hjson#conf) register.
-Because of the variability of the PTRNG noise source, there are several registers that log statistics associated with the health tests.
-For example, the adaptive proportion test has a high watermark register that logs the highest measured number of ones.
-The [`ADAPTP_HI_WATERMARKS`](data/entropy_src.hjson#adaptp_hi_watermarks) register has an entry for both normal and boot-time modes.
-This register allows for determining how close the threshold value should be set to the fail over value.
-Specific to the adaptive proportion test, there is also the [`ADAPTP_LO_WATERMARKS`](data/entropy_src.hjson#adaptp_lo_watermarks) register, which will hold the lowest number of ones measured.
-To help understand how well the thresholds work through time, a running count of test fails is kept in the [`ADAPTP_HI_TOTAL_FAILS`](data/entropy_src.hjson#adaptp_hi_total_fails) register.
-The above example for the adaptive proportion test also applies to the other health tests, with the exception of the low watermark registers.
-See the timing diagrams below for more details on how the health tests work.
-It should be noted that for all error counter registers, they are sized for 16 bits, which prevents any case where counters might wrap.
-
-
-Vendor-specific tests are supported through an external health test interface (xht).
-This is the same interface that is used for the internal health tests.
-Below is a description of this interface:
-- entropy_bit: 4-bit wide bus of entropy to be tested.
-- entropy_bit_valid: indication of when the entropy is valid.
-- clear: signal to clear counters, and is register driven.
-- active: signal to indicate when the test should run, and is register driven.
-- thresh_hi: field to indicate what high threshold the test should use, and is register driven.
-- thresh_lo: field to indicate what low threshold the test should use, and is register driven.
-- window_wrap_pulse: field to indicate the end of the current window.
-- threshold_scope: field to indicate whether the thresholds are intended to be applied to all entropy lines collectively or on a line-by-line basis, to be read from a register.
-- test_cnt: generic test count result, to be read from a register.
-- test_fail_hi_pulse: indication that a high threshold comparison failed, to be read from a register.
-- test_fail_lo_pulse: indication that a low threshold comparison failed, to be read from a register.
-
-
-The [`ALERT_THRESHOLD`](data/entropy_src.hjson#alert_threshold) register determines how many fails can occur before an alert is issued.
-By default, the current threshold is set to two, such that the occurrence of two failing test cycles back-to-back would provide a very low &alpha; value.
-The [`ALERT_FAIL_COUNTS`](data/entropy_src.hjson#alert_fail_counts) register holds the total number of fails, plus all of the individual contributing failing tests.
-Setting the [`ALERT_THRESHOLD`](data/entropy_src.hjson#alert_threshold) register to zero will disable alert generation.
-
-Firmware has a path to read entropy from the ENTROPY_SRC block.
-The [`ENTROPY_CONTROL`](data/entropy_src.hjson#entropy_control) register allows firmware to set the internal multiplexers to steer entropy data to the [`ENTROPY_DATA`](data/entropy_src.hjson#entropy_data) register.
-The control bit `ES_TYPE` sets whether the entropy will come from the conditioning block or be sourced through the bypass path.
-A status bit will be set that can either be polled or generate an interrupt when the entropy bits are available to be read from the [`ENTROPY_DATA`](data/entropy_src.hjson#entropy_data) register.
-The firmware needs to read the [`ENTROPY_DATA`](data/entropy_src.hjson#entropy_data) register twelve times in order to cleanly evacuate the 384-bit seed from the hardware path (12*32bits=384bits total).
-The firmware will directly read out of the main entropy FIFO, and when the control bit `ES_ROUTE` is set, no entropy is being passed to the block hardware interface.
-
-If the `esfinal` FIFO fills up, additional entropy that has been health checked will be dropped before entering the conditioner.
-This drop point will save on conditioner power, and still preserve `esfinal` FIFO entropy that has already been collected.
-
-The above process will be repeated for as long as entropy bits are to be collected and processed.
-
-At any time, the `ENABLE` field can be cleared to halt the entropy generation (and health check testing).
-See the Programmers Guide section for more details on the ENTROPY_SRC block disable sequence.
-
-## Block Diagram
-
-![ENTROPY_SRC Block Diagram](./doc/entsrc_blk_diag.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](data/entropy_src.hjson#interfaces)
-
-## Design Details
-
-### Initialization
-
-After power-up, the ENTROPY_SRC block is disabled.
-
-For simplicity of initialization, only a single register write is needed to start functional operation of the ENTROPY_SRC block.
-This assumes that proper defaults are chosen for thresholds, sampling rate, and other registers.
-
-For security reasons, a configuration and control register locking function is performed by the [`REGEN`](data/entropy_src.hjson#regen) register.
-Clearing the bit in this register will prevent future modification of the [`CONF`](data/entropy_src.hjson#conf) register or other writeable registers by firmware.
-
-### Entropy Processing
-
-When enabled, the ENTROPY_SRC block will generate entropy bits continuously.
-The `es_entropy_valid` bit in the `ENTROPY_SRC_INTR_STATE` register will indicate to the firmware when entropy bits can read from the [`ENTROPY_DATA`](data/entropy_src.hjson#entropy_data) register.
-The firmware will do 32-bit register reads of the [`ENTROPY_DATA`](data/entropy_src.hjson#entropy_data) register to retrieve the entropy bits.
-Each read will automatically pop an entry from the entropy unpacker block.
-A full twelve 32-bit words need to be read at a time.
-
-The hardware entropy interface will move entropy bits out of the ENTROPY FIFO when it is not empty, and the downstream hardware is ready.
-If firmware is not currently reading entropy bits, all processed entropy bits will flow to the hardware entropy interface.
-
-### Security
-
-All module assets and countermeasures performed by hardware are listed in the hjson countermeasures section.
-Labels for each instance of asset and countermeasure are located throughout the RTL source code.
-
-For all of the health test threshold registers, these registers could be protected with shadow registers.
-A design choice was made here to not use shadow registers and save on silicon cost.
-The threshold registers are protected by software.
-It is expected that software will read the threshold registers on a periodic basis, and compare these values to what was originally programmed into the threshold registers.
-
-Bus integrity checking is performed for the final seed delivery to CSRNG.
-This is done to make sure repeated values are not occurring.
-Only 64 bits (out of 384 bits) are checked, since this is statistically significant, and more checking would cost more silicon.
-
-
-
-### Interrupts
-
-The ENTROPY_SRC module has several interrupts: `es_entropy_valid`, `es_health_test_failed`, and `es_fifo_err`.
-
-The `es_entropy_valid` interrupt should be asserted when an entropy source has been implemented that is relatively slow.
-
-The `es_health_test_failed` interrupt will trigger when the internal health test fails and exceeds the alert threshold.
-
-The `es_fifo_err` interrupt will fire when an internal FIFO has a malfunction.
-The conditions that cause this to happen are either when there is a push to a full FIFO or a pull from an empty FIFO.
-
-
-## Main State Machine Diagram
-The following diagram shows how the main state machine state is constructed.
-The larger circles show the how the overall state machine transitions.
-The sub-state machines with smaller circles show more detail about how the large circles operate.
-
-![ENTROPY_SRC State Diagram](./doc/es_main_sm.svg)
-
-
-### Entropy Source Hardware Interface
-The following waveform shows an example of how the entropy source hardware interface works, which is much like a FIFO.
-
-
-```wavejson
-{signal: [
-   {name: 'clk'           , wave: 'p...|.........|.......'},
-   {name: 'es_req'        , wave: '0..1|..01.0..1|.....0.'},
-   {name: 'es_ack'        , wave: '0...|.10.10...|....10.'},
-   {name: 'es_bus[383:0]' , wave: '0...|.30.30...|....30.', data: ['es0','es1','es2']},
-   {name: 'es_fips'       , wave: '0...|....10...|....10.'},
-]}
-```
-
-
-### PTRNG Hardware Interface
-The following waveform shows an example of what the PTRNG timing looks like.
-
-
-```wavejson
-{signal: [
-   {name: 'clk'             , wave: 'p.|......|......|......'},
-   {name: 'rng_enable'      , wave: '01|......|......|......'},
-   {name: 'rng_valid'       , wave: '0.|..10..|..10..|..10..'},
-   {name: 'rng_b'           , wave: 'x.|..3...|..4...|..5.....', data: ['es0','es1','es2']},
-]}
-```
-
-### Repetition Count Test
-The following waveform shows how a sampling of a data pattern will be tested by the Repetition Count test.
-Operating on each bit stream, this test will count when a signal is at a stuck level.
-This NIST test is intended to signal a catastrophic failure with the PTRNG noise source.
-
-
-```wavejson
-{signal: [
-   {name: 'rng_valid'      , wave: 'p...............'},
-  ['rng bits',
-   {name: 'rng_bus[3]'     , wave: '1.0.10..1...0101'},
-   {name: 'rng_bus[2]'     , wave: '01.0.10..1...010'},
-   {name: 'rng_bus[1]'     , wave: '101.0.10..1...01'},
-   {name: 'rng_bus[0]'     , wave: '10.10..1...0101.'},
-   ],
-   {name: 'thresh_i (hex)'      , wave: '3...............',data: ['3']},
-   {name: 'rep_cntr_q[3] (hex)' , wave: '4444444444444444',data: ['0','0','1','0','1','0','0','1','2','0','1','2','3','0','0','0']},
-   {name: 'rep_cntr_q[2] (hex)' , wave: '4444444444444444',data: ['0','1','0','1','0','1','0','0','1','2','0','1','2','3','0','0']},
-   {name: 'rep_cntr_q[1] (hex)' , wave: '4444444444444444',data: ['0','0','0','0','1','0','1','0','0','1','2','0','1','2','3','0']},
-   {name: 'rep_cntr_q[0] (hex)' , wave: '4444444444444444',data: ['0','0','0','1','0','0','1','2','0','1','2','3','0','0','0','0']},
-   {name: 'test_cnt_q (hex)'    , wave: '4444444444444444',data: ['0','0','0','0','0','0','0','0','0','0','0','1','2','3','4','0']},
-   {name: 'window_cnt_q (hex)'  , wave: '5555555555555555',data: ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']},
-], head:{
-   text:'Repetition Count Test',
-   tick:0,
-  },}
-```
-
-### Adaptive Proportion Test
-This NIST-defined test is intended to detect statistical bias in the raw entropy data.
-The test counts the number of 1's in a given sample, and applies thresholds to reject samples which deviate too far from the ideal mean of 50%.
-
-Depending on the value of the [`CONF.THRESHOLD_SCOPE`](data/entropy_src.hjson#conf) field, the thresholds can either be applied collectively to the all RNG inputs, or the thresholds can be applied on a line-by-line basis.
-Setting [`CONF.THRESHOLD_SCOPE`](data/entropy_src.hjson#conf) to `kMuBi4True` will apply the thresholds to the aggregated RNG stream.
-This can be useful for lowering the likelihood of coincidental test failures (higher &alpha;).
-Meanwhile, setting [`CONF.THRESHOLD_SCOPE`](data/entropy_src.hjson#conf) to `kMuBi4False` will apply thresholds on a line-by-line basis which allows the ENTROPY_SRC to detect single line failures.
-
-The following waveform shows how a sampling of a data pattern will be tested by the Adaptive Proportion test.
-In this example, the sum is taken over all RNG lines (i.e., [`CONF.THRESHOLD_SCOPE`](data/entropy_src.hjson#conf) is True).
-
-```wavejson
-{signal: [
-   {name: 'rng_valid'      , wave: 'p...............'},
-  ['rng bits',
-   {name: 'rng_bus[3]'     , wave: '1.0.10..1...0101'},
-   {name: 'rng_bus[2]'     , wave: '01.0.10..1...010'},
-   {name: 'rng_bus[1]'     , wave: '101.0.10..1...01'},
-   {name: 'rng_bus[0]'     , wave: '10.10..1...0101.'},
-   ],
-   {name: 'Column-wise sum'   , wave: '3333333333333333',data: ['3','2','2','2','1','1','1','1','2','3', '4', '3', '3', '2', '2','3']},
-   {name: 'test_cnt_q (hex)'   , wave: '4444444444444444',data: ['0','3','5','7','9','a','b','c','d','f','12','16','19','1c','1e','20']},
-   {name: 'window_cnt_q (hex)' , wave: '5555555555555555',data: ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']},
-], head:{
-   text:'Adaptive Proportion Test',
-   tick:0,
-  },}
-```
-
-### Bucket Test
-The following waveform shows how a sampling of a data pattern will be tested by the Bucket test.
-Operating on all four bit streams, this test will identify the symbol and sort it into bin counters, or "buckets".
-This test is intended to find bias with a symbol or symbols.
-
-```wavejson
-{signal: [
-   {name: 'rng_valid'      , wave: 'p...............'},
-  ['rng bits',
-   {name: 'rng_bus[3]'     , wave: '1.0.10..1...0101'},
-   {name: 'rng_bus[2]'     , wave: '01.0.10..1...010'},
-   {name: 'rng_bus[1]'     , wave: '101.0.10..1...01'},
-   {name: 'rng_bus[0]'     , wave: '10.10..1...0101.'},
-   ],
-   {name: 'thresh_i (hex)'       , wave: '3...............',data: ['3']},
-   {name: 'bin_cntr_q[0] (hex)'  , wave: '4...............',data: ['0']},
-   {name: 'bin_cntr_q[1] (hex)'  , wave: '4........4......',data: ['0','1']},
-   {name: 'bin_cntr_q[2] (hex)'  , wave: '4.......4.......',data: ['0','1']},
-   {name: 'bin_cntr_q[13] (hex)' , wave: '4..........4....',data: ['0','1']},
-   {name: 'bin_cntr_q[14] (hex)' , wave: '4............4..',data: ['0','1']},
-   {name: 'bin_cntr_q[15] (hex)' , wave: '4...........4...',data: ['0','1']},
-   {name: 'test_cnt_q (hex)'     , wave: '4...............',data: ['0']},
-   {name: 'window_cnt_q (hex)' , wave: '5555555555555555',data: ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']},
-], head:{
-   text:'Bucket Test',
-   tick:0,
-  },}
-```
-
-### Markov Test
-The following waveform shows how a sampling of a data pattern will be tested by the Markov test.
-
-The test aims to detect either:
-
-1. Oversampling of AST/RNG outputs leading to "clustered" input values that eventually change, but often are just repeats of the previous sample.
-For example the string: "00111111000011000111000111000001111" has roughly equal numbers of 1's and 0's, but no good entropy source should generate such strings, because each bit is likely just a repeat of the previous one.
-
-2. Wild oscillations of the RNG, in a distinctly non-random way.
-For instance the string: "010101010101010101" has almost zero entropy, even though the number of 1's and 0's appears unbiased.
-
-The test counts the number of changes in the a fixed number of RNG samples, and comparing the number of "01"/"10" pairs to the number of "00"/"11" pairs.
-On average, the number of switching (e.g., "01") vs. non-switching (e.g., "00") pairs should be 50% of the total, with a variance proportional to the sample size.
-
-Like the Adaptive Proportion test, the Markov Test can be computed either cumulatively (summing the results over all RNG lines) or on a per-line basis.
-In this example, the RNG lines are scored individually (i.e., [`CONF.THRESHOLD_SCOPE`](data/entropy_src.hjson#conf) is False).
-
-```wavejson
-{signal: [
-   {name: 'rng_valid'      , wave: 'p...............'},
-  ['rng bits',
-   {name: 'rng_bus[3]'     , wave: '1.0.10..1...0101'},
-   {name: 'rng_bus[2]'     , wave: '01.0.10..1...010'},
-   {name: 'rng_bus[1]'     , wave: '101.0.10..1...01'},
-   {name: 'rng_bus[0]'     , wave: '10.10..1...0101.'},
-   ],
-   {name: 'pair_cntr_q[3] (hex)', wave: '4.4.4.4.4.4.4.4.',data: ['0','0','0','1','1','1','1','2']},
-   {name: 'pair_cntr_q[2] (hex)', wave: '4.4.4.4.4.4.4.4.',data: ['0','1','2','3','3','4','4','5']},
-   {name: 'pair_cntr_q[1] (hex)', wave: '4.4.4.4.4.4.4.4.',data: ['0','1','1','1','2','2','2','2']},
-   {name: 'pair_cntr_q[0] (hex)', wave: '4.4.4.4.4.4.4.4.',data: ['0','1','2','2','3','3','4','5']},
-   {name: 'window_cnt_q (hex)'  , wave: '5555555555555555',data: ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']},
-], head:{
-   text:'Markov Test',
-   tick:0,
-  },}
-```
-
-
-# Programmers Guide
-
-## Initialization
-
-To initialize the ENTROPY_SRC block, see the Device Interface Functions (DIFs) section.
-
-
-## Entropy Processing
-
-Once entropy has been prepared for delivery, it can be consumed by either hardware (CSRNG block hardware instance) or by a software interface (CSRNG software instance).
-
-Note that when software makes frequent re-seed requests to CSRNG, any stored up entropy seeds in the final entropy FIFO will quickly consumed.
-Once the FIFO is empty, subsequent entropy seed requests will have to wait the worst case latency time while new entropy is being created.
-
-
-## Entropy Source Module Disable
-
-A useful feature for the ENTROPY_SRC block is the ability to disable it in a graceful matter.
-Since there exists another feature to avoid power spikes between ENTROPY_SRC and CSRNG, software needs to monitor the disabling process.
-Bit 16 in the [`DEBUG_STATUS`](data/entropy_src.hjson#debug_status) should be polled after the ENTROPY_SRC enable bits are cleared in the [`CONF`](data/entropy_src.hjson#conf) register.
-After the handshakes with CSRNG are finished, the above bit should be set and the ENTROPY_SRC block can be safely enabled again.
-
-ENTROPY_SRC may only be disabled if CSRNG is disabled.
-
-
-## Error conditions
-
-Need to alert the system of a FIFO overflow condition.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_entropy_src.h)
-
-## Register Table
-
-* [Register Table](data/entropy_src.hjson#registers)
diff --git a/hw/ip/entropy_src/doc/programmers_guide.md b/hw/ip/entropy_src/doc/programmers_guide.md
new file mode 100644
index 0000000000000..6ce90d60f72d8
--- /dev/null
+++ b/hw/ip/entropy_src/doc/programmers_guide.md
@@ -0,0 +1,36 @@
+# Programmer's Guide
+
+## Initialization
+
+To initialize the ENTROPY_SRC block, see the Device Interface Functions (DIFs) section.
+
+
+## Entropy Processing
+
+Once entropy has been prepared for delivery, it can be consumed by either hardware (CSRNG block hardware instance) or by a software interface (CSRNG software instance).
+
+Note that when software makes frequent re-seed requests to CSRNG, any stored up entropy seeds in the final entropy FIFO will quickly be consumed.
+Once the FIFO is empty, subsequent entropy seed requests will have to wait the worst case latency time while new entropy is being created.
+
+
+## Entropy Source Module Disable
+
+A useful feature for the ENTROPY_SRC block is the ability to disable it in a graceful manner.
+Since there exists another feature to avoid power spikes between ENTROPY_SRC and CSRNG, software needs to monitor the disabling process.
+Bit 16 in the [`DEBUG_STATUS`](../data/entropy_src.hjson#debug_status) should be polled after the ENTROPY_SRC enable bits are cleared in the [`CONF`](../data/entropy_src.hjson#conf) register.
+After the handshakes with CSRNG are finished, the above bit should be set and the ENTROPY_SRC block can be safely enabled again.
+
+ENTROPY_SRC may only be disabled if CSRNG is disabled.
+
+
+## Error conditions
+
+Need to alert the system of a FIFO overflow condition.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_entropy_src.h)
+
+## Register Table
+
+* [Register Table](../data/entropy_src.hjson#registers)
diff --git a/hw/ip/entropy_src/doc/theory_of_operation.md b/hw/ip/entropy_src/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..7500eb6d5978e
--- /dev/null
+++ b/hw/ip/entropy_src/doc/theory_of_operation.md
@@ -0,0 +1,313 @@
+# Theory of Operation
+
+As already described, this IP block will collect bits of entropy for firmware or hardware consumption.
+This revision supports only an external interface for a PTRNG noise source implementation.
+
+The first step is initialization and enabling.
+The PTRNG noise source mode is selected when the `ENABLE` field is set.
+After the block is enabled and initialized, entropy bits will be collected indefinitely until the block is disabled.
+
+
+After a reset, the ENTROPY_SRC block will start up in boot-time mode by default.
+This feature is designed to provide an initial seed's worth of entropy with lower latency than the normal FIPS/CC compliant health check process.
+Health testing will still be performed on boot-time mode entropy, but the window of checking is, by default, 384 bits instead of 2048 bits.
+When entropy is delivered to the downstream hardware block, a signal will indicate what type of entropy it is - FIPS compliant or not.
+Boot-time mode can be completely disabled in the [`CONF`](../data/entropy_src.hjson#conf) register.
+
+Once the initial boot-time mode phase has completed, the ENTROPY_SRC block will switch to FIPS compliant mode.
+In this mode, once the raw entropy has been health checked, it will be passed into a conditioner block.
+This block will compress the bits such that the entropy bits/physical bits, or min-entropy value, should be improved over the raw data source min-entropy value.
+The compression operation, by default, will compress every 2048 tested bits into 384 full-entropy bits.
+
+The hardware conditioning can also be bypassed and replaced in normal operation with a firmware-defined conditioning algorithm.
+This firmware conditioning algorithm can be disabled on boot for security purposes.
+
+The firmware override function has the capability to completely override the hardware health tests and the conditioner paths.
+In the case of health tests, firmware can turn off one or all of the health tests and perform the tests in firmware.
+A data path is provided in the hardware such that the inbound entropy can be trapped in the pre-conditioner FIFO.
+Once a pre-determined threshold of entropy has been reached in this FIFO, the firmware can then read the entropy bits out of the FIFO.
+The exact mechanism for this functionality starts with setting the `FW_OV_MODE` field in the [`FW_OV_CONTROL`](../data/entropy_src.hjson#fw_ov_control) register.
+This will enable firmware to monitor post-health test entropy bits by reading from the [`FW_OV_RD_DATA`](../data/entropy_src.hjson#fw_ov_rd_data) register.
+Firmware can use the [`OBSERVE_FIFO_THRESH`](../data/entropy_src.hjson#observe_fifo_thresh) and [`OBSERVE_FIFO_DEPTH`](../data/entropy_src.hjson#observe_fifo_depth) registers to determine the state of the OBSERVE FIFO.
+At this point, firmware can do additional health checks on the entropy.
+Optionally, firmware can do the conditioning function, assuming the hardware is configured to bypass the conditioner block.
+Once firmware has processed the entropy, it can then write the results back into the [`FW_OV_WR_DATA`](../data/entropy_src.hjson#fw_ov_wr_data) register (pre-conditioner FIFO).
+The `FW_OV_ENTROPY_INSERT` in the [`FW_OV_CONTROL`](../data/entropy_src.hjson#fw_ov_control) register will enable inserting entropy bits back into the entropy flow.
+The firmware override control fields will be set such that the new entropy will resume normal flow operation.
+
+An additional feature of the firmware override function is to insert entropy bits into the flow and still use the conditioning function in the hardware.
+Setting the `FW_OV_INSERT_START` field in the [`FW_OV_SHA3_START`](../data/entropy_src.hjson#fw_ov_sha3_start) register will prepare the hardware for this flow.
+Once this field is set true, the [`FW_OV_WR_DATA`](../data/entropy_src.hjson#fw_ov_wr_data) register can be written with entropy bits.
+The [`FW_OV_WR_FIFO_FULL`](../data/entropy_src.hjson#fw_ov_wr_fifo_full) register should be monitored after each write to ensure data is not dropped.
+Once all of the data has been written, the `FW_OV_INSERT_START` field should be set to false.
+The normal SHA3 processing will continue and finally push the conditioned entropy through the module.
+
+Health checks are performed on the input raw data from the PTRNG noise source when in that mode.
+There are four health tests that will be performed: repetition count, adaptive proportion, bucket, and Markov tests.
+Each test has a pair of threshold values that determine the pass/fail of the test, one threshold for boot-time mode, and one for normal mode.
+By default, all tests are enabled, but can be turned off in the [`CONF`](../data/entropy_src.hjson#conf) register.
+Because of the variability of the PTRNG noise source, there are several registers that log statistics associated with the health tests.
+For example, the adaptive proportion test has a high watermark register that logs the highest measured number of ones.
+The [`ADAPTP_HI_WATERMARKS`](../data/entropy_src.hjson#adaptp_hi_watermarks) register has an entry for both normal and boot-time modes.
+This register allows for determining how close the threshold value should be set to the fail over value.
+Specific to the adaptive proportion test, there is also the [`ADAPTP_LO_WATERMARKS`](../data/entropy_src.hjson#adaptp_lo_watermarks) register, which will hold the lowest number of ones measured.
+To help understand how well the thresholds work through time, a running count of test fails is kept in the [`ADAPTP_HI_TOTAL_FAILS`](../data/entropy_src.hjson#adaptp_hi_total_fails) register.
+The above example for the adaptive proportion test also applies to the other health tests, with the exception of the low watermark registers.
+See the timing diagrams below for more details on how the health tests work.
+It should be noted that for all error counter registers, they are sized for 16 bits, which prevents any case where counters might wrap.
+
+
+Vendor-specific tests are supported through an external health test interface (xht).
+This is the same interface that is used for the internal health tests.
+Below is a description of this interface:
+- entropy_bit: 4-bit wide bus of entropy to be tested.
+- entropy_bit_valid: indication of when the entropy is valid.
+- clear: signal to clear counters, and is register driven.
+- active: signal to indicate when the test should run, and is register driven.
+- thresh_hi: field to indicate what high threshold the test should use, and is register driven.
+- thresh_lo: field to indicate what low threshold the test should use, and is register driven.
+- window_wrap_pulse: field to indicate the end of the current window.
+- threshold_scope: field to indicate whether the thresholds are intended to be applied to all entropy lines collectively or on a line-by-line basis, to be read from a register.
+- test_cnt: generic test count result, to be read from a register.
+- test_fail_hi_pulse: indication that a high threshold comparison failed, to be read from a register.
+- test_fail_lo_pulse: indication that a low threshold comparison failed, to be read from a register.
+
+
+The [`ALERT_THRESHOLD`](../data/entropy_src.hjson#alert_threshold) register determines how many fails can occur before an alert is issued.
+By default, the current threshold is set to two, such that the occurrence of two failing test cycles back-to-back would provide a very low &alpha; value.
+The [`ALERT_FAIL_COUNTS`](../data/entropy_src.hjson#alert_fail_counts) register holds the total number of fails, plus all of the individual contributing failing tests.
+Setting the [`ALERT_THRESHOLD`](../data/entropy_src.hjson#alert_threshold) register to zero will disable alert generation.
+
+Firmware has a path to read entropy from the ENTROPY_SRC block.
+The [`ENTROPY_CONTROL`](../data/entropy_src.hjson#entropy_control) register allows firmware to set the internal multiplexers to steer entropy data to the [`ENTROPY_DATA`](../data/entropy_src.hjson#entropy_data) register.
+The control bit `ES_TYPE` sets whether the entropy will come from the conditioning block or be sourced through the bypass path.
+A status bit will be set that can either be polled or generate an interrupt when the entropy bits are available to be read from the [`ENTROPY_DATA`](../data/entropy_src.hjson#entropy_data) register.
+The firmware needs to read the [`ENTROPY_DATA`](../data/entropy_src.hjson#entropy_data) register twelve times in order to cleanly evacuate the 384-bit seed from the hardware path (12*32bits=384bits total).
+The firmware will directly read out of the main entropy FIFO, and when the control bit `ES_ROUTE` is set, no entropy is being passed to the block hardware interface.
+
+If the `esfinal` FIFO fills up, additional entropy that has been health checked will be dropped before entering the conditioner.
+This drop point will save on conditioner power, and still preserve `esfinal` FIFO entropy that has already been collected.
+
+The above process will be repeated for as long as entropy bits are to be collected and processed.
+
+At any time, the `ENABLE` field can be cleared to halt the entropy generation (and health check testing).
+See the [Programmer's Guide](programmers_guide.md) for more details on the ENTROPY_SRC block disable sequence.
+
+## Block Diagram
+
+![ENTROPY_SRC Block Diagram](../doc/entsrc_blk_diag.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/entropy_src.hjson#interfaces)
+
+## Design Details
+
+### Initialization
+
+After power-up, the ENTROPY_SRC block is disabled.
+
+For simplicity of initialization, only a single register write is needed to start functional operation of the ENTROPY_SRC block.
+This assumes that proper defaults are chosen for thresholds, sampling rate, and other registers.
+
+For security reasons, a configuration and control register locking function is performed by the [`REGEN`](../data/entropy_src.hjson#regen) register.
+Clearing the bit in this register will prevent future modification of the [`CONF`](../data/entropy_src.hjson#conf) register or other writeable registers by firmware.
+
+### Entropy Processing
+
+When enabled, the ENTROPY_SRC block will generate entropy bits continuously.
+The `es_entropy_valid` bit in the `ENTROPY_SRC_INTR_STATE` register will indicate to the firmware when entropy bits can be read from the [`ENTROPY_DATA`](../data/entropy_src.hjson#entropy_data) register.
+The firmware will do 32-bit register reads of the [`ENTROPY_DATA`](../data/entropy_src.hjson#entropy_data) register to retrieve the entropy bits.
+Each read will automatically pop an entry from the entropy unpacker block.
+A full twelve 32-bit words need to be read at a time.
+
+The hardware entropy interface will move entropy bits out of the ENTROPY FIFO when it is not empty, and the downstream hardware is ready.
+If firmware is not currently reading entropy bits, all processed entropy bits will flow to the hardware entropy interface.
+
+### Security
+
+All module assets and countermeasures performed by hardware are listed in the hjson countermeasures section.
+Labels for each instance of asset and countermeasure are located throughout the RTL source code.
+
+For all of the health test threshold registers, these registers could be protected with shadow registers.
+A design choice was made here to not use shadow registers and save on silicon cost.
+The threshold registers are protected by software.
+It is expected that software will read the threshold registers on a periodic basis, and compare these values to what was originally programmed into the threshold registers.
+
+Bus integrity checking is performed for the final seed delivery to CSRNG.
+This is done to make sure repeated values are not occurring.
+Only 64 bits (out of 384 bits) are checked, since this is statistically significant, and more checking would cost more silicon.
+
+
+
+### Interrupts
+
+The ENTROPY_SRC module has several interrupts: `es_entropy_valid`, `es_health_test_failed`, and `es_fifo_err`.
+
+The `es_entropy_valid` interrupt should be asserted when an entropy source has been implemented that is relatively slow.
+
+The `es_health_test_failed` interrupt will trigger when the internal health test fails and exceeds the alert threshold.
+
+The `es_fifo_err` interrupt will fire when an internal FIFO has a malfunction.
+The conditions that cause this to happen are either when there is a push to a full FIFO or a pull from an empty FIFO.
+
+
+## Main State Machine Diagram
+The following diagram shows how the main state machine state is constructed.
+The larger circles show how the overall state machine transitions.
+The sub-state machines with smaller circles show more detail about how the large circles operate.
+
+![ENTROPY_SRC State Diagram](../doc/es_main_sm.svg)
+
+
+### Entropy Source Hardware Interface
+The following waveform shows an example of how the entropy source hardware interface works, which is much like a FIFO.
+
+
+```wavejson
+{signal: [
+   {name: 'clk'           , wave: 'p...|.........|.......'},
+   {name: 'es_req'        , wave: '0..1|..01.0..1|.....0.'},
+   {name: 'es_ack'        , wave: '0...|.10.10...|....10.'},
+   {name: 'es_bus[383:0]' , wave: '0...|.30.30...|....30.', data: ['es0','es1','es2']},
+   {name: 'es_fips'       , wave: '0...|....10...|....10.'},
+]}
+```
+
+
+### PTRNG Hardware Interface
+The following waveform shows an example of what the PTRNG timing looks like.
+
+
+```wavejson
+{signal: [
+   {name: 'clk'             , wave: 'p.|......|......|......'},
+   {name: 'rng_enable'      , wave: '01|......|......|......'},
+   {name: 'rng_valid'       , wave: '0.|..10..|..10..|..10..'},
+   {name: 'rng_b'           , wave: 'x.|..3...|..4...|..5.....', data: ['es0','es1','es2']},
+]}
+```
+
+### Repetition Count Test
+The following waveform shows how a sampling of a data pattern will be tested by the Repetition Count test.
+Operating on each bit stream, this test will count when a signal is at a stuck level.
+This NIST test is intended to signal a catastrophic failure with the PTRNG noise source.
+
+
+```wavejson
+{signal: [
+   {name: 'rng_valid'      , wave: 'p...............'},
+  ['rng bits',
+   {name: 'rng_bus[3]'     , wave: '1.0.10..1...0101'},
+   {name: 'rng_bus[2]'     , wave: '01.0.10..1...010'},
+   {name: 'rng_bus[1]'     , wave: '101.0.10..1...01'},
+   {name: 'rng_bus[0]'     , wave: '10.10..1...0101.'},
+   ],
+   {name: 'thresh_i (hex)'      , wave: '3...............',data: ['3']},
+   {name: 'rep_cntr_q[3] (hex)' , wave: '4444444444444444',data: ['0','0','1','0','1','0','0','1','2','0','1','2','3','0','0','0']},
+   {name: 'rep_cntr_q[2] (hex)' , wave: '4444444444444444',data: ['0','1','0','1','0','1','0','0','1','2','0','1','2','3','0','0']},
+   {name: 'rep_cntr_q[1] (hex)' , wave: '4444444444444444',data: ['0','0','0','0','1','0','1','0','0','1','2','0','1','2','3','0']},
+   {name: 'rep_cntr_q[0] (hex)' , wave: '4444444444444444',data: ['0','0','0','1','0','0','1','2','0','1','2','3','0','0','0','0']},
+   {name: 'test_cnt_q (hex)'    , wave: '4444444444444444',data: ['0','0','0','0','0','0','0','0','0','0','0','1','2','3','4','0']},
+   {name: 'window_cnt_q (hex)'  , wave: '5555555555555555',data: ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']},
+], head:{
+   text:'Repetition Count Test',
+   tick:0,
+  },}
+```
+
+### Adaptive Proportion Test
+This NIST-defined test is intended to detect statistical bias in the raw entropy data.
+The test counts the number of 1's in a given sample, and applies thresholds to reject samples which deviate too far from the ideal mean of 50%.
+
+Depending on the value of the [`CONF.THRESHOLD_SCOPE`](../data/entropy_src.hjson#conf) field, the thresholds can either be applied collectively to all RNG inputs, or the thresholds can be applied on a line-by-line basis.
+Setting [`CONF.THRESHOLD_SCOPE`](../data/entropy_src.hjson#conf) to `kMuBi4True` will apply the thresholds to the aggregated RNG stream.
+This can be useful for lowering the likelihood of coincidental test failures (higher &alpha;).
+Meanwhile, setting [`CONF.THRESHOLD_SCOPE`](../data/entropy_src.hjson#conf) to `kMuBi4False` will apply thresholds on a line-by-line basis which allows the ENTROPY_SRC to detect single line failures.
+
+The following waveform shows how a sampling of a data pattern will be tested by the Adaptive Proportion test.
+In this example, the sum is taken over all RNG lines (i.e., [`CONF.THRESHOLD_SCOPE`](../data/entropy_src.hjson#conf) is True).
+
+```wavejson
+{signal: [
+   {name: 'rng_valid'      , wave: 'p...............'},
+  ['rng bits',
+   {name: 'rng_bus[3]'     , wave: '1.0.10..1...0101'},
+   {name: 'rng_bus[2]'     , wave: '01.0.10..1...010'},
+   {name: 'rng_bus[1]'     , wave: '101.0.10..1...01'},
+   {name: 'rng_bus[0]'     , wave: '10.10..1...0101.'},
+   ],
+   {name: 'Column-wise sum'   , wave: '3333333333333333',data: ['3','2','2','2','1','1','1','1','2','3', '4', '3', '3', '2', '2','3']},
+   {name: 'test_cnt_q (hex)'   , wave: '4444444444444444',data: ['0','3','5','7','9','a','b','c','d','f','12','16','19','1c','1e','20']},
+   {name: 'window_cnt_q (hex)' , wave: '5555555555555555',data: ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']},
+], head:{
+   text:'Adaptive Proportion Test',
+   tick:0,
+  },}
+```
+
+### Bucket Test
+The following waveform shows how a sampling of a data pattern will be tested by the Bucket test.
+Operating on all four bit streams, this test will identify the symbol and sort it into bin counters, or "buckets".
+This test is intended to find bias with a symbol or symbols.
+
+```wavejson
+{signal: [
+   {name: 'rng_valid'      , wave: 'p...............'},
+  ['rng bits',
+   {name: 'rng_bus[3]'     , wave: '1.0.10..1...0101'},
+   {name: 'rng_bus[2]'     , wave: '01.0.10..1...010'},
+   {name: 'rng_bus[1]'     , wave: '101.0.10..1...01'},
+   {name: 'rng_bus[0]'     , wave: '10.10..1...0101.'},
+   ],
+   {name: 'thresh_i (hex)'       , wave: '3...............',data: ['3']},
+   {name: 'bin_cntr_q[0] (hex)'  , wave: '4...............',data: ['0']},
+   {name: 'bin_cntr_q[1] (hex)'  , wave: '4........4......',data: ['0','1']},
+   {name: 'bin_cntr_q[2] (hex)'  , wave: '4.......4.......',data: ['0','1']},
+   {name: 'bin_cntr_q[13] (hex)' , wave: '4..........4....',data: ['0','1']},
+   {name: 'bin_cntr_q[14] (hex)' , wave: '4............4..',data: ['0','1']},
+   {name: 'bin_cntr_q[15] (hex)' , wave: '4...........4...',data: ['0','1']},
+   {name: 'test_cnt_q (hex)'     , wave: '4...............',data: ['0']},
+   {name: 'window_cnt_q (hex)' , wave: '5555555555555555',data: ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']},
+], head:{
+   text:'Bucket Test',
+   tick:0,
+  },}
+```
+
+### Markov Test
+The following waveform shows how a sampling of a data pattern will be tested by the Markov test.
+
+The test aims to detect either:
+
+1. Oversampling of AST/RNG outputs leading to "clustered" input values that eventually change, but often are just repeats of the previous sample.
+For example the string: "00111111000011000111000111000001111" has roughly equal numbers of 1's and 0's, but no good entropy source should generate such strings, because each bit is likely just a repeat of the previous one.
+
+2. Wild oscillations of the RNG, in a distinctly non-random way.
+For instance the string: "010101010101010101" has almost zero entropy, even though the number of 1's and 0's appears unbiased.
+
+The test counts the number of changes in a fixed number of RNG samples, comparing the number of "01"/"10" pairs to the number of "00"/"11" pairs.
+On average, the number of switching (e.g., "01") vs. non-switching (e.g., "00") pairs should be 50% of the total, with a variance proportional to the sample size.
+
+Like the Adaptive Proportion test, the Markov Test can be computed either cumulatively (summing the results over all RNG lines) or on a per-line basis.
+In this example, the RNG lines are scored individually (i.e., [`CONF.THRESHOLD_SCOPE`](../data/entropy_src.hjson#conf) is False).
+
+```wavejson
+{signal: [
+   {name: 'rng_valid'      , wave: 'p...............'},
+  ['rng bits',
+   {name: 'rng_bus[3]'     , wave: '1.0.10..1...0101'},
+   {name: 'rng_bus[2]'     , wave: '01.0.10..1...010'},
+   {name: 'rng_bus[1]'     , wave: '101.0.10..1...01'},
+   {name: 'rng_bus[0]'     , wave: '10.10..1...0101.'},
+   ],
+   {name: 'pair_cntr_q[3] (hex)', wave: '4.4.4.4.4.4.4.4.',data: ['0','0','0','1','1','1','1','2']},
+   {name: 'pair_cntr_q[2] (hex)', wave: '4.4.4.4.4.4.4.4.',data: ['0','1','2','3','3','4','4','5']},
+   {name: 'pair_cntr_q[1] (hex)', wave: '4.4.4.4.4.4.4.4.',data: ['0','1','1','1','2','2','2','2']},
+   {name: 'pair_cntr_q[0] (hex)', wave: '4.4.4.4.4.4.4.4.',data: ['0','1','2','2','3','3','4','5']},
+   {name: 'window_cnt_q (hex)'  , wave: '5555555555555555',data: ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f']},
+], head:{
+   text:'Markov Test',
+   tick:0,
+  },}
+```
diff --git a/hw/ip/flash_ctrl/README.md b/hw/ip/flash_ctrl/README.md
index a0ba40bee69b8..acdb4153fe9ff 100644
--- a/hw/ip/flash_ctrl/README.md
+++ b/hw/ip/flash_ctrl/README.md
@@ -189,616 +189,3 @@ During production and RMA states, the isolated page is also readable.
 * Both `lc_iso_part_sw_wr_en` and `lc_iso_part_sw_rd_en` are set.
 
 See [life cycle](../lc_ctrl/README.md#iso_part_sw_rd_en-and-iso_part_sw_wr_en) for more details
-
-
-# Theory of Operation
-
-## Block Diagram
-
-![Flash Block Diagram](./doc/flash_block_diagram.svg)
-
-### Flash Protocol Controller
-
-The Flash Protocol Controller sits between the host software interface, other hardware components and the flash physical controller.
-Its primary functions are two fold
-*  Translate software program, erase and read requests into a high level protocol for the actual flash physical controller
-*  Act as communication interface between flash and other components in the system, such as life cycle and key manager.
-
-The flash protocol controller is not responsible for the detailed timing and waveform control of the flash, nor is it responsible for data scrambling and reliability metadata such as ICV and ECC.
-Instead, it maintains FIFOs / interrupts for the software to process data, as well as high level abstraction of region protection controls and error handling.
-
-The flash controller selects requests between the software and hardware interfaces.
-By default, the hardware interfaces have precedence and are used to read out seed materials from flash.
-The seed material is read twice to confirm the values are consistent.
-They are then forwarded to the key manager for processing.
-During this seed phase, software initiated activities are back-pressured until the seed reading is complete.
-It is recommended that instead of blindly issuing transactions to the flash controller, the software polls [`STATUS.INIT_WIP`](data/flash_ctrl.hjson#status) until it is 0.
-
-Once the seed phase is complete, the flash controller switches to the software interface.
-Software can then read / program / erase the flash as needed.
-
-#### RMA Entry Handling
-
-When an RMA entry request is received from the life cycle manager, the flash controller waits for any pending flash transaction to complete, then switches priority to the hardware interface.
-The flash controller then initiates RMA entry process and notifies the life cycle controller when it is complete.
-The RMA entry process wipes out all data, creator, owner and isolated partitions.
-
-After RMA completes, the flash controller is [disabled](#flash-access-disable).
-When disabled the flash protocol controller registers can still be accessed.
-However, flash memory access are not allowed, either directly by the host or indirectly through flash protocol controller initiated transactions.
-It is expected that after an RMA transition, the entire system will be rebooted.
-
-
-#### Initialization
-
-The flash protocol controller is initialized through [`INIT`](data/flash_ctrl.hjson#init).
-When initialization is invoked, the flash controller requests the address and data scrambling keys from an external entity, [otp_ctrl](../otp_ctrl/README.md#interface-to-flash-scrambler) in this case.
-
-After the scrambling keys are requested, the flash protocol controller reads the root seeds out of the [secret partitions](#secret-information-partitions) and sends them to the key manager.
-Once the above steps are completed, the read buffers in the flash physical controller are enabled for operation.
-
-#### RMA Entry
-During RMA entry, the flash controller "wipes" the contents of the following:
-- Creator partition
-- Owner partition
-- Isolated partition
-- All data partitions
-
-This process ensures that after RMA there is no sensitive information left that can be made use on the tester.
-As stated previously, once RMA entry completes, the flash memory can no longer be accessed, either directly or indirectly.
-The flash controller registers however, remain accessible for status reads and so forth, although new operations cannot be issued.
-
-#### Memory Protection
-
-Flash memory protection is handled differently depending on what type of partition is accessed.
-
-For data partitions, software can configure a number of memory protection regions such as [`MP_REGION_CFG_0`](data/flash_ctrl.hjson#mp_region_cfg_0).
-For each region, software specifies both the beginning page and the number of pages that belong to that region.
-Software then configures the access privileges for that region.
-Finally, each region can be activated or de-activated from matching through [`MP_REGION_CFG_0.EN`](data/flash_ctrl.hjson#mp_region_cfg_0).
-
-Subsequent accesses are then allowed or denied based on the defined rule set.
-Similar to RISCV pmp, if two region overlaps, the lower region index has higher priority.
-
-For information partitions, the protection is done per individual page.
-Each page can be configured with access privileges.
-As a result, software does not need to define a start and end page for information partitions.
-See [`BANK0_INFO0_PAGE_CFG_0`](data/flash_ctrl.hjson#bank0_info0_page_cfg_0) as an example.
-
-#### Bank Erase Protection
-
-Unlike read, program and page erase operations, the bank erase command is the only one that can be issued at a bank level.
-Because of this, bank erase commands are not guarded by the typical [memory protection mechanisms](#memory-protection).
-
-Instead, whether bank erase is allowed is controlled by [`MP_BANK_CFG_SHADOWED`](data/flash_ctrl.hjson#mp_bank_cfg_shadowed), where there is a separate configuration bit per bank.
-When the corresponding bit is set, that particular bank is permitted to have bank level operations.
-
-The specific behavior of what is erased when bank erase is issued is flash memory dependent and thus can vary by vendor and technology.
-[This section](#flash-bank-erase) describes the general behavior and how open source modeling is done.
-
-#### Memory Protection for Key Manager and Life Cycle
-
-While memory protection is largely under software control, certain behavior is hardwired to support key manager secret partitions and life cycle functions.
-
-Software can only control the accessibility of the creator secret seed page under the following condition(s):
-*  life cycle sets provision enable.
-*  OTP indicates the seeds are not locked.
-
-Software can only control the accessibility of the owner secret seed page under the following condition(s):
-*  life cycle sets provision enable.
-
-During life cycle RMA transition, the software configured memory protection for both data and information partitions is ignored.
-Instead, the flash controller assumes a default accessibility setting that allows it to secure the chip and transition to RMA.
-
-#### Program Resolution
-
-Certain flash memories place restrictions on the program window.
-This means the flash accepts program beats only if all beats belong to the same address window.
-Typically, this boundary is nicely aligned (for example, 16 words, 32 words) and is related to how the flash memory amortizes the program operation over nearby words.
-
-To support this function, the flash controller errors back anytime the start of the program beat is in a different window from the end of the program beat.
-The valid program range is thus the valid program resolution for a particular memory.
-
-This information is not configurable but instead decided at design time and is exposed as a readable status.
-
-#### Erase Suspend
-
-The flash controller supports erase suspend through [`ERASE_SUSPEND`](data/flash_ctrl.hjson#erase_suspend).
-This allows the software to interrupt an ongoing erase operation.
-
-The behavior of what happens to flash contents when erase is suspended is vendor defined; however, generally it can be assumed that the erase would be incomplete.
-It is then up to the controlling software to take appropriate steps to erase again at a later time.
-
-#### Additional Flash Attributes
-
-There are certain attributes provisioned in [`MP_REGION_CFG_0`](data/flash_ctrl.hjson#mp_region_cfg_0) that are not directly used by the open source protocol or physical controllers.
-
-Instead, these attributes are fed to the vendor flash module on a per-page or defined boundary basis.
-Currently there is only one such attribute [`MP_REGION_CFG_0.HE`](data/flash_ctrl.hjson#mp_region_cfg_0).
-
-#### Idle Indication to External Power Manager
-
-The flash controller provides an idle indication to an external power manager.
-This idle indication does not mean the controller is doing "nothing", but rather the controller is not doing anything "stateful", e.g. program or erase.
-
-This is because an external power manager event (such as shutting off power) while a flash stateful transaction is ongoing may be damaging to the vendor flash module.
-
-#### Flash Code Execution Handling
-
-Flash can be used to store both data and code.
-To support separate access privileges between data and code, the flash protocol controller provides [`EXEC`](data/flash_ctrl.hjson#exec) for software control.
-
-If software programs [`EXEC`](data/flash_ctrl.hjson#exec) to `0xa26a38f7`, code fetch from flash is allowed.
-If software programs [`EXEC`](data/flash_ctrl.hjson#exec) to any other value, code fetch from flash results in an error.
-
-The flash protocol controller distinguishes code / data transactions through the [instruction type attribute](../lc_ctrl/README.md#usage-of-user-bits) of the TL-UL interface.
-
-#### Flash Errors and Faults
-
-The flash protocol controller maintains 3 different categories of observed errors and faults.
-In general, errors are considered recoverable and primarily geared towards problems that could have been caused by software or that occurred during a software initiated operation.
-Errors can be found in [`ERR_CODE`](data/flash_ctrl.hjson#err_code).
-
-Faults, on the other hand, represent error events that are unlikely to have been caused by software and represent a major malfunction of the system.
-
-Faults are further divided into two categories:
-- Standard faults
-- Custom faults
-
-Standard faults represent errors that occur in the standard structures of the design, for example sparsely encoded FSMs, duplicated counters and the bus transmission integrity scheme.
-
-Custom faults represent custom errors, primarily errors generated by the life cycle management interface, the flash storage integrity interface and the flash macro itself.
-
-See (#flash-escalation) for further differentiation between standard and custom faults.
-
-#### Transmission Integrity Faults
-
-Since the flash controller has multiple interfaces for access, transmission integrity failures can manifest in different ways.
-
-There are 4 interfaces:
-- host direct access to flash controller [register files](#host-direct-reg).
-- host direct access to [flash macro](#host-direct-macro)
-- host / software initiated flash controller access to [flash macro (read / program / erase)](#host-controller-op)
-- life cycle management interface / hardware initiated flash controller access to [flash macro (read / program / erase)](#hw-controller-op)
-
-The impact of transmission integrity of each interface is described below.
-
-##### Host Direct Access to Flash Controller Register Files {#host-direct-reg}
-This category of transmission integrity behaves identically to other modules.
-A bus transaction, when received, is checked for command and data payload integrity.
-If an integrity error is seen, the issuing bus host receives an in-band error response and a fault is registered in [`STD_FAULT_STATUS.REG_INTG_ERR`](data/flash_ctrl.hjson#std_fault_status).
-
-##### Host Direct Access to Flash Macro {#host-direct-macro}
-Flash can only be read by the host.
-The transmission integrity scheme used is end-to-end, so integrity generated inside the flash is fed directly to the host.
-It is the host's responsibility to check for integrity correctness and react accordingly.
-
-##### Host / Software Initiated Access to Flash Macro {#host-controller-op}
-Since controller operations are initiated through writes to the register file, the command check is identical to host direct access to [regfiles](#host-direct-reg).
-Controller reads behave similarly to [host direct access to macro](#host-direct-macro), the read data and its associated integrity are returned through the controller read FIFO for the initiating host to handle.
-
-For program operations, the write data and its associated integrity are stored and propagated through the flash protocol and physical controllers.
-Prior to packing the data for final flash program, the data is then checked for integrity correctness.
-If the data integrity is incorrect, an in-band error response is returned to the initiating host and an error is registered in [`ERR_CODE.PROG_INTG_ERR`](data/flash_ctrl.hjson#err_code).
-An error is also registered in [`STD_FAULT_STATUS.PROG_INTG_ERR`](data/flash_ctrl.hjson#std_fault_status) to indicate that a fatal fault has occurred.
-
-The reasons a program error is registered in two locations are two-fold:
-- It is registered in [`ERR_CODE`](data/flash_ctrl.hjson#err_code) so software can discover during operation status that a program has failed.
-- It is registered in [`STD_FAULT_STATUS`](data/flash_ctrl.hjson#std_fault_status) because transmission integrity failures represent a fatal failure in the standard structure of the design, something that should never happen.
-
-##### Life Cycle Management Interface / Hardware Initiated Access to Flash Macro {#hw-controller-op}
-The life cycle management interface issues transactions directly to the flash controller and does not perform a command payload integrity check.
-
-For read operations, the read data and its associated integrity are directly checked by the life cycle management interface.
-If an integrity error is seen, it is registered in [`FAULT_STATUS.LCMGR_INTG_ERR`](data/flash_ctrl.hjson#fault_status).
-
-For program operations, the program data and its associated integrity are propagated into the flash controller.
-If an integrity error is seen, an error is registered in [`FAULT_STATUS.PROG_INTG_ERR`](data/flash_ctrl.hjson#fault_status).
-
-#### ECC and ICV Related Read Errors
-
-In addition to transmission integrity errors described above, the flash can also emit read errors based on [ECC and ICV checks](#flash-ecc-and-icv).
-
-Flash reliability ECC errors (multi-bit errors) and integrity check errors (integrity check errors) are both reflected as in-band errors to the entity that issued the transaction.
-That means if a host direct read, controller initiated read or hardware initiated read encounters one of these errors, the error is directly reflected in the operation status.
-
-Further, reliability ECC / integrity check errors are also captured in [`FAULT_STATUS`](data/flash_ctrl.hjson#fault_status) and can be used to generate fatal alerts.
-The reason these are not captured in [`STD_FAULT_STATUS`](data/flash_ctrl.hjson#std_fault_status) is because 1 or 2 bit errors can occur in real usage due to environmental conditions, thus they do not belong to the standard group of structural errors.
-If we assume 2-bit errors can occur, then software must have a mechanism to recover from the error instead of [escalation](#flash-escalation).
-
-#### Flash Escalation
-
-Flash has two sources of escalation - global and local.
-
-Global escalation is triggered by the life cycle controller through `lc_escalate_en`.
-Local escalation is triggered by a standard faults of flash, seen in [`STD_FAULT_STATUS`](data/flash_ctrl.hjson#std_fault_status).
-Local escalation is not configurable and automatically triggers when this subset of faults are seen.
-
-For the escalation behavior, see [flash access disable](#flash-access-disable) .
-
-#### Flash Access Disable
-
-Flash access can be disabled through global escalation trigger, local escalation trigger, rma process completion or software command.
-The escalation triggers are described [here](#flash-escalation).
-The software command to disable flash can be found in [`DIS`](data/flash_ctrl.hjson#dis).
-The description for rma entry can be found [here](#rma-entry-handling).
-
-When disabled, the flash has a two layered response:
-- The flash protocol controller [memory protection](#memory-protection) errors back all controller initiated operations.
-- The host-facing tlul adapter errors back all host initiated operations.
-- The flash physical controller completes any existing stateful operations (program or erase) and drops all future flash transactions.
-- The flash protocol controller arbiter completes any existing software issued commands and enters a disabled state where no new transactions can be issued.
-
-
-### Flash Physical Controller
-
-The Flash Physical Controller is the wrapper module that contains the actual flash memory instantiation.
-It is responsible for arbitrating high level protocol commands (such as read, program, erase) as well as any additional security (scrambling) and reliability (ECC) features.
-The contained vendor wrapper module is then responsible for converting high level commands into low level signaling and timing specific to a particular flash vendor.
-The vendor wrapper module is also responsible for any BIST, redundancy handling, remapping features or custom configurations required for the flash.
-
-The scramble keys are provided by an external static block such as the OTP.
-
-#### Host and Protocol Controller Handling
-
-Both the protocol controller and the system host converge on the physical controller.
-The protocol controller has read access to all partitions as well as program and erase privileges.
-The host on the other hand, can only read the data partitions.
-
-Even though the host has less access to flash, it is prioritized when competing against the protocol controller for access.
-When a host request and a protocol controller request arrive at the same time, the host is favored and granted.
-Every time the protocol controller loses such an arbitration, it increases an arbitration lost count.
-Once this lost count reaches 5, the protocol controller is favored.
-This ensures a stream of host activity cannot deny protocol controller access (for example a tight polling loop).
-
-#### Flash Bank Erase Behavior {#flash-bank-erase}
-
-This section describes the open source modeling of flash memory.
-The actual flash memory behavior may differ, and should consult the specific vendor or technology specification.
-
-When a bank erase command is issued and allowed, see [bank erase protection](#bank-erase-protection), the erase behavior is dependent on [`CONTROL.PARTITION_SEL`](data/flash_ctrl.hjson#control).
-- If data partition is selected, all data in the data partition is erased.
-- If info partition is selected, all data in the data partition is erased AND all data in the info partitions (including all info types) is also erased.
-
-#### Flash Scrambling
-
-Flash scrambling is built using the [XEX tweakable block cipher](https://en.wikipedia.org/wiki/Disk_encryption_theory#Xor%E2%80%93encrypt%E2%80%93xor_(XEX)).
-
-When a read transaction is sent to flash, the following steps are taken:
-*  The tweak is calculated using the transaction address and a secret address key through a Galois multiplier.
-*  The data content is read out of flash.
-*  If the data content is scrambled, the tweak is XOR'd with the scrambled text and then decrypted through the PRINCE block cipher using a secret data key.
-*  The output of the PRINCE cipher is XOR'd again with the tweak and the final results are presented.
-*  If the data content is not scrambled, the PRINCE cipher and XOR steps are skipped and data provided directly back to the requestor.
-
-When a program transaction is sent to flash, the same steps are taken if the address in question has scrambling enabled.
-During a program, the text is scrambled through the PRINCE block cipher.
-
-Scramble enablement is done differently depending on the type of partitions.
-*  For data partitions, the scramble enablement is done on contiguous page boundaries.
-   *  Software has the ability to configure these regions and whether scramble is enabled.
-*  For information partitions, the scramble enablement is done on a per page basis.
-   *  Software can configure for each page whether scramble is enabled.
-
-#### Flash ECC and ICV
-
-Flash supports both ECC (error correction) and ICV (integrity check value).
-While the two are used for different functions, they are implemented as two separate ECCs, thus flash supports two types of ECC.
-
-ICV is an integrity check, implemented as an ECC, used to detect whether the de-scrambled data has been modified.
-The other is a reliability ECC used for error detection and correction on the whole flash word.
-
-The key differentiation here is that ICV is used only for detection, while the real error correction can correct single bit errors.
-Both ICV and ECC are configurable based on the various page and memory property configurations.
-
-##### Overall ICV and ECC Application
-
-The following diagram shows how the various ICV / ECC tags are applied and used through the life of a transactions.
-![Flash ECC_LIFE](./doc/flash_integrity.svg).
-
-Note that the ICV (integrity ECC) is calculated over the descrambled data and is only 4-bits, while the reliability ECC is calculated over both the scrambled data and the ICV.
-
-##### ICV
-
-The purpose of the ICV (integrity check value, implemented as an ECC) is to emulate end-to-end integrity like the other memories.
-This is why the data is calculated over the descrambled data as it can be stored alongside for continuous checks.
-When descrambled data is returned to the host, the ICV is used to validate the data is correct.
-
-The flash may not always have the capacity to store both the ICV and reliability ECC, the ICV is thus truncated since it is not used for error correction.
-
-##### Reliability ECC
-
-Similar to scrambling, the reliability ECC is enabled based on an address decode.
-The ECC for flash is chosen such that a fully erased flash word has valid ECC.
-Likewise a flash word that is completely 0 is also valid ECC.
-
-Unlike the integrity ECC, the reliability ECC is actually used for error correction if an accidental bit-flip is seen, it is thus fully stored and not truncated.
-
-ECC enablement is done differently depending on the type of partitions.
-*  For data partitions, the ECC enablement is done on contiguous page boundaries.
-   *  Software has the ability to configure these regions and whether ECC is enabled.
-*  For information partitions,the ECC enablement is done on a per page basis.
-   *  Software can configure for each page whether ECC is enabled.
-
-##### Scrambling Consistency
-
-The flash physical controller does not keep a history of when a particular memory location has scrambling enabled or disabled.
-This means if a memory location was programmed while scrambled, disabling scrambling and then reading it back will result in garbage.
-Similarly, if a location was programmed while non-scrambled, enabling scrambling and then reading it back will also result in garbage.
-
-It it thus the programmer's responsibility to maintain a consistent definition of whether a location is scrambled.
-It is also highly recommended in a normal use case to setup up scramble and non-scramble regions and not change it further.
-
-#### Flash Read Pipeline
-
-Since the system host reads directly from the flash for instructions, it is critical to not add significant latency during read, especially if de-scrambling is required.
-As such, the flash read is actually a two stage pipeline, where each stage can take multiple cycles.
-
-Additionally, since the flash word size is typically larger than the bus word, recently read flash entries are locally cached.
-The cache behaves as a highly simplified read-only-cache and holds by default 4 flash words per flash bank.
-
-When a read transaction is sent to flash, the following steps are taken:
-*  A check is performed against the local cache
-   * If there is a hit (either the entry is already in cache, or the entry is currently being processed), the transaction is immediately forwarded to the response queue.
-   * If there is not a hit, an entry in the local cache is selected for allocation (round robin arbitration) and a flash read is issued.
-*  When the flash read completes, its descrambling attributes are checked:
-   * If descrambling is required, the read data begins the descrambling phase - at this time, a new flash read can be issued for the following transaction.
-   * if descrambling is not required, the descrambling phase is skipped and the transaction is pushed to the response queue.
-*  When the descrambling is complete, the descrambled text is pushed to the response queue.
-
-The following diagram shows how the flash read pipeline timing works.
-![Flash Read Pipeline](./doc/flash_read_pipeline.svg)
-
-
-In this example, the first two host requests trigger a full sequence.
-The third host requests immediately hits in the local cache and responds in order after the first two.
-
-#### Flash Buffer
-
-The flash buffer is a small read-only memory that holds multiple entries of recently read flash words.
-This is needed when the flash word is wider than a bus word.
-The flash access time is amortized across the the entire flash word if software accesses in a mostly
-linear sequence.
-
-The flash buffer has a round robin replacement policy when more flash words are read.
-When an erase / program is issued to the flash, the entries are evicted to ensure new words are fetched.
-
-When a page erase / program is issued to a flash bank, only entries that fall into that address range are evicted.
-When a bank erase is issued, then all entries are evicted.
-
-The flash buffer is only enabled after [`INIT`](data/flash_ctrl.hjson#init) is invoked.
-When an RMA entry sequence is received, the flash buffers are disabled.
-
-As an example, assume a flash word is made up of 2 bus words.
-Assume also the following address to word mapping:
-- Address 0 - flash word 0, bus word 0 / bus word 1
-- Address 2 - flash word 1, bus word 2 / bus word 3
-
-When software reads bus word 1, the entire flash word 0 is captured into the flash buffer.
-When software comes back to read bus word 0, instead of accessing the flash again, the data is retrieved directly from the buffer.
-
-The recently read entries store both the de-scrambled data and the [integrity ECC](#integrity-ecc).
-The [reliability ECC](#reliability-ecc) is not stored because the small buffer is purely flip-flop based and does not have storage reliability concerns like the main flash macro.
-
-When a read hits in the flash buffer, the integrity ECC is checked against the de-scrambled data and an error is returned to the initiating entity, whether it is a the controller itself or a host.
-
-
-#### Accessing Information Partition
-
-The information partition uses the same address scheme as the data partition - which is directly accessible by software.
-This means the address of page{N}.word{M} is the same no matter which type of partition is accessed.
-
-Which partition a specific transaction accesses is denoted through a separate field [`CONTROL.PARTITION_SEL`](data/flash_ctrl.hjson#control) in the [`CONTROL`](data/flash_ctrl.hjson#control) register.
-If [`CONTROL.PARTITION_SEL`](data/flash_ctrl.hjson#control) is set, then the information partition is accessed.
-If [`CONTROL.PARTITION_SEL`](data/flash_ctrl.hjson#control) is not set, then the corresponding word in the data partition is accessed.
-
-Flash scrambling, if enabled, also applies to information partitions.
-It may be required for manufacturers to directly inject data into specific pages flash information partitions via die contacts.
-For these pages, scramble shall be permanently disabled as the manufacturer should not be aware of scrambling functions.
-
-##### JTAG Connection
-
-The flash physical controller provides a JTAG connection to the vendor flash module.
-The vendor flash module can use this interface to build a testing setup or to provide backdoor access for debug.
-
-Due to the ability of this connection to bypass access controls, this connection is modulated by life cycle and only enabled when non-volatile debug, or `lc_nvm_debug_en` is allowed in the system.
-
-## Flash Default Configuration
-Since the flash controller is highly dependent on the specific flavor of flash memory chosen underneath, its configuration can vary widely between different integrations.
-
-This sections details the default settings used by the flash controller:
-* Number of banks: 2
-* Number of data partition pages per bank: 256
-* [Program resolution](#program-resolution): 8 flash words
-* Flash word data bits: 64
-* Flash word metadata bits: 8
-* ECC choice: Hamming code SECDED
-* Information partition types: 3
-* Number of information partition type 0 pages per bank: 10
-* Number of information partition type 1 pages per bank: 1
-* Number of information partition type 2 pages per bank: 2
-* Secret partition 0 (used for creator): Bank 0, information partition 0, page 1
-* Secret partition 1 (used for owner): Bank 0, information partition 0, page 2
-* Isolated partition: Bank 0, information partition 0, page 3
-
-## Hardware Interfaces
-
-* [Interface Tables](data/flash_ctrl.hjson#interfaces)
-
-### Signals
-
-In addition to the interrupts and bus signals, the tables below lists the flash controller functional I/Os.
-
-Signal                     | Direction      | Description
-------------------------   |-----------     |---------------
-`lc_creator_seed_sw_rw_en` | `input`        | Indication from `lc_ctrl` that software is allowed to read/write creator seed.
-`lc_owner_seed_sw_rw_en`   | `input`        | Indication from `lc_ctrl` that software is allowed to read/write owner seed.
-`lc_seed_hw_rd_en`         | `input`        | Indication from `lc_ctrl` that hardware is allowed to read creator / owner seeds.
-`lc_iso_part_sw_rd_en`     | `input`        | Indication from `lc_ctrl` that software is allowed to read the isolated partition.
-`lc_iso_part_sw_wr_en`     | `input`        | Indication from `lc_ctrl` that software is allowed to write the isolated partition.
-`lc_escalate_en`           | `input`        | Escalation indication from `lc_ctrl`.
-`lc_nvm_debug_en`          | `input`        | Indication from lc_ctrl that non-volatile memory debug is allowed.
-`core_tl`                  | `input/output` | TL-UL interface used to access `flash_ctrl` registers for activating program / erase and reads to information partitions/
-`prim_tl`                  | `input/output` | TL-UL interface used to access the vendor flash memory proprietary registers.
-`mem_tl`                   | `input/output` | TL-UL interface used by host to access the vendor flash memory directly.
-`OTP`                      | `input/output` | Interface used to request scrambling keys from `otp_ctrl`.
-`rma_req`                  | `input`        | rma entry request from `lc_ctrl`.
-`rma_ack`                  | `output`       | rma entry acknowlegement to `lc_ctrl`.
-`rma_seed`                 | `input`        | rma entry seed.
-`pwrmgr`                   | `output`       | Idle indication to `pwrmgr`.
-`keymgr`                   | `output`       | Secret seed bus to `keymgr`.
-
-In addition to the functional IOs, there are a set of signals that are directly connected to vendor flash module.
-
-Signal                     | Direction      | Description
-------------------------   |-----------     |---------------
-`scan_en`                  | `input`        | scan enable
-`scanmode`                 | `input`        | scan mode
-`scan_rst_n`               | `input`        | scan reset
-`flash_bist_enable`        | `input`        | enable flash built-in-self-test
-`flash_power_down_h`       | `input`        | flash power down indication, note this is NOT a core level signal
-`flash_power_ready_h`      | `input`        | flash power ready indication, note this is NOT a core level signal
-`flash_test_mode_a`        | `input/output` | flash test mode io, note this is NOT a core level signal
-`flash_test_voltage_h`     | `input/output` | flash test voltage, note this is NOT a core level signal
-`flash_alert`              | `output`       | flash alert outputs directly to AST
-
-
-
-## Design Details
-
-### Flash Protocol Controller Description
-
-The flash protocol controller uses a simple FIFO interface to communicate between the software and flash physical controller.
-There is a read FIFO for read operations, and a program FIFO for program operations.
-Note, this means flash can be read both through the controller and the main bus interface.
-This may prove useful if the controller wishes to allocate specific regions to HW FSMs only, but is not a necessary feature.
-
-When software initiates a read transaction of a programmable number of flash words, the flash controller will fill up the read FIFO for software to consume.
-Likewise, when software initiates a program transaction, software will fill up the program FIFO for the controller to consume.
-
-The controller is designed such that the overall number of words in a transaction can significantly exceed the FIFO depth.
-In the case of read, once the FIFO is full, the controller will cease writing more entries and wait for software to consume the contents (an interrupt will be triggered to the software to alert it to such an event).
-In the case of program, the controller will stop writing to flash once all existing data is consumed - it will likewise trigger an interrupt to software to prepare more data.
-See detailed steps in theory of operation.
-The following is a diagram of the controller construction as well as its over connectivity with the flash module.
-
-![Flash Protocol Controller](./doc/flash_protocol_controller.svg)
-
-
-### Host Read
-
-Unlike controller initiated reads, host reads have separate rdy / done signals to ensure transactions can be properly pipelined.
-As host reads are usually tied to host execution upstream, additional latency can severely harm performance and is not desired.
-The expected waveform from the perspective of the physical controller is shown below.
-
-```wavejson
-{signal: [
-  {name: 'clk_i',           wave: 'p..............'},
-  {name: 'rst_ni',          wave: '0.1............'},
-  {name: 'host_req_i',      wave: '0..10.1...0....'},
-  {name: 'host_addr_i',     wave: 'x..3x.3.33x....', data: ['Adr0', 'Adr1', 'Adr2', 'Adr3']},
-  {name: 'host_req_rdy_o',  wave: '1...0..1.......'},
-  {name: 'host_req_done_o', wave: '0...10..1110...'},
-  {name: 'host_rdata_o',    wave: 'x...4x..444x...',data: ['Dat0', 'Dat1', 'Dat2', 'Dat3']},
-]}
-```
-
-The `host_req_done_o` is always single cycle pulsed and upstream logic is expected to always accept and correctly handle the return.
-The same cycle the return data is posted a new command / address can be accepted.
-While the example shows flash reads completing in back to back cycles, this is typically not the case.
-
-### Controller Read
-
-Unlike host reads, controller reads are not as performance critical and do not have command / data pipeline requirements.
-Instead, the protocol controller will hold the read request and address lines until the done is seen.
-Once the done is seen, the controller then transitions to the next read operation.
-The expected waveform from the perspective of the physical controller is shown below.
-
-```wavejson
-{signal: [
-  {name: 'clk_i',                 wave: 'p..............'},
-  {name: 'rst_ni',                wave: '0.1............'},
-  {name: 'flash_ctrl_i.req',      wave: '0..1.....0.....'},
-  {name: 'flash_ctrl_i.addr',     wave: 'x..3..3..x.3..x', data: ['Adr0', 'Adr1', 'Adr2']},
-  {name: 'flash_ctrl_i.rd',       wave: '0..1.....0.1..0'},
-  {name: 'flash_ctrl_o.rd_done',  wave: '0....10.10...10'},
-  {name: 'flash_ctrl_o.rdata',    wave: 'x....4x.4x...4x', data: ['Dat0', 'Dat1', 'Dat2']},
-]}
-```
-
-### Controller Program
-
-Program behavior is similar to reads.
-The protocol controller will hold the request, address and data lines until the programming is complete.
-The expected waveform from the perspective of the physical controller is shown below.
-
-```wavejson
-{signal: [
-  {name: 'clk_i',                  wave: 'p..............'},
-  {name: 'rst_ni',                 wave: '0.1............'},
-  {name: 'flash_ctrl_i.req',       wave: '0..1.....0.....'},
-  {name: 'flash_ctrl_i.addr',      wave: 'x..3..3..x.3..x', data: ['Adr0', 'Adr1', 'Adr2']},
-  {name: 'flash_ctrl_i.prog',      wave: '0..1.....0.1..0'},
-  {name: 'flash_ctrl_o.prog_data', wave: 'x..4..4..x.4..x', data: ['Dat0', 'Dat1', 'Dat2']},
-  {name: 'flash_ctrl_o.prog_done', wave: '0....10.10...10'},
-]}
-```
-
-# Programmers Guide
-
-## Issuing a Controller Read
-
-To issue a flash read, the programmer must
-*  Specify the address of the first flash word to read
-*  Specify the number of total flash words to read, beginning at the supplied address
-*  Specify the operation to be 'READ' type
-*  Set the 'START' bit for the operation to begin
-
-The above fields can be set in the [`CONTROL`](data/flash_ctrl.hjson#control) and [`ADDR`](data/flash_ctrl.hjson#addr) registers.
-See [library code](https://github.com/lowRISC/opentitan/blob/master/sw/device/lib/flash_ctrl.c) for implementation.
-
-It is acceptable for total number of flash words to be significantly greater than the depth of the read FIFO.
-In this situation, the read FIFO will fill up (or hit programmable fill value), pause the flash read and trigger an interrupt to software.
-Once there is space inside the FIFO, the controller will resume reading until the appropriate number of words have been read.
-Once the total count has been reached, the flash controller will post OP_DONE in the [`OP_STATUS`](data/flash_ctrl.hjson#op_status) register.
-
-## Issuing a Controller Program
-
-To program flash, the same procedure as read is followed.
-However, instead of setting the [`CONTROL`](data/flash_ctrl.hjson#control) register for read operation, a program operation is selected instead.
-Software will then fill the program FIFO and wait for the controller to consume this data.
-Similar to the read case, the controller will automatically stall when there is insufficient data in the FIFO.
-When all desired words have been programmed, the controller will post OP_DONE in the [`OP_STATUS`](data/flash_ctrl.hjson#op_status) register.
-
-## Debugging a Read Error
-Since flash has multiple access modes, debugging read errors can be complicated.
-The following lays out the expected cases.
-
-### Error Encountered by Software Direct Read
-If software reads the flash directly, it may encounter a variety of errors (read data integrity / ECC failures, both reliability and integrity).
-ECC failures create in-band error responses and should be recognized as a bus exception.
-Read data integrity failures also create exceptions directly inside the processor as part of end-to-end transmission integrity.
-
-From these exceptions, software should be able to determine the error address through processor specific means.
-Once the address is discovered, further steps can be taken to triage the issue.
-
-### Error Encountered by Software Initiated Controller Operations
-A controller operation can encounter a much greater variety of errors, see [`ERR_CODE`](data/flash_ctrl.hjson#err_code).
-When such an error is encountered, as reflected by [`OP_STATUS`](data/flash_ctrl.hjson#op_status) when the operation is complete, software can examine the [`ERR_ADDR`](data/flash_ctrl.hjson#err_addr) to determine the error location.
-Once the address is discovered, further steps can be taken to triage the issue.
-
-### Correctable ECC Errors
-Correctable ECC errors are by nature not fatal errors and do not stop operation.
-Instead, if the error is correctable, the flash controller fixes the issue and registers the last address where a single bit error was seen.
-See [`ECC_SINGLE_ERR_CNT`](data/flash_ctrl.hjson#ecc_single_err_cnt) and [`ECC_SINGLE_ERR_ADDR`](data/flash_ctrl.hjson#ecc_single_err_addr)
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_flash_ctrl.h)
-
-## Register Table
-
-The flash protocol controller maintains two separate access windows for the FIFO.
-It is implemented this way because the access window supports transaction back-pressure should the FIFO become full (in case of write) or empty (in case of read).
-
-* [Register Table](data/flash_ctrl.hjson#registers)
diff --git a/hw/ip/flash_ctrl/doc/programmers_guide.md b/hw/ip/flash_ctrl/doc/programmers_guide.md
new file mode 100644
index 0000000000000..1d070391cf286
--- /dev/null
+++ b/hw/ip/flash_ctrl/doc/programmers_guide.md
@@ -0,0 +1,58 @@
+# Programmer's Guide
+
+## Issuing a Controller Read
+
+To issue a flash read, the programmer must
+*  Specify the address of the first flash word to read
+*  Specify the number of total flash words to read, beginning at the supplied address
+*  Specify the operation to be 'READ' type
+*  Set the 'START' bit for the operation to begin
+
+The above fields can be set in the [`CONTROL`](../data/flash_ctrl.hjson#control) and [`ADDR`](../data/flash_ctrl.hjson#addr) registers.
+See [library code](https://github.com/lowRISC/opentitan/blob/master/sw/device/lib/flash_ctrl.c) for implementation.
+
+It is acceptable for the total number of flash words to be significantly greater than the depth of the read FIFO.
+In this situation, the read FIFO will fill up (or hit the programmable fill value), pause the flash read and trigger an interrupt to software.
+Once there is space inside the FIFO, the controller will resume reading until the appropriate number of words have been read.
+Once the total count has been reached, the flash controller will post OP_DONE in the [`OP_STATUS`](../data/flash_ctrl.hjson#op_status) register.
+
+## Issuing a Controller Program
+
+To program flash, the same procedure as read is followed.
+However, instead of setting the [`CONTROL`](../data/flash_ctrl.hjson#control) register for read operation, a program operation is selected instead.
+Software will then fill the program FIFO and wait for the controller to consume this data.
+Similar to the read case, the controller will automatically stall when there is insufficient data in the FIFO.
+When all desired words have been programmed, the controller will post OP_DONE in the [`OP_STATUS`](../data/flash_ctrl.hjson#op_status) register.
+
+## Debugging a Read Error
+Since flash has multiple access modes, debugging read errors can be complicated.
+The following lays out the expected cases.
+
+### Error Encountered by Software Direct Read
+If software reads the flash directly, it may encounter a variety of errors (read data integrity / ECC failures, both reliability and integrity).
+ECC failures create in-band error responses and should be recognized as a bus exception.
+Read data integrity failures also create exceptions directly inside the processor as part of end-to-end transmission integrity.
+
+From these exceptions, software should be able to determine the error address through processor specific means.
+Once the address is discovered, further steps can be taken to triage the issue.
+
+### Error Encountered by Software Initiated Controller Operations
+A controller operation can encounter a much greater variety of errors, see [`ERR_CODE`](../data/flash_ctrl.hjson#err_code).
+When such an error is encountered, as reflected by [`OP_STATUS`](../data/flash_ctrl.hjson#op_status) when the operation is complete, software can examine the [`ERR_ADDR`](../data/flash_ctrl.hjson#err_addr) to determine the error location.
+Once the address is discovered, further steps can be taken to triage the issue.
+
+### Correctable ECC Errors
+Correctable ECC errors are by nature not fatal errors and do not stop operation.
+Instead, if the error is correctable, the flash controller fixes the issue and registers the last address where a single bit error was seen.
+See [`ECC_SINGLE_ERR_CNT`](../data/flash_ctrl.hjson#ecc_single_err_cnt) and [`ECC_SINGLE_ERR_ADDR`](../data/flash_ctrl.hjson#ecc_single_err_addr).
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_flash_ctrl.h)
+
+## Register Table
+
+The flash protocol controller maintains two separate access windows for the FIFO.
+It is implemented this way because the access window supports transaction back-pressure should the FIFO become full (in case of write) or empty (in case of read).
+
+* [Register Table](../data/flash_ctrl.hjson#registers)
diff --git a/hw/ip/flash_ctrl/doc/theory_of_operation.md b/hw/ip/flash_ctrl/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..1f59a429ccb25
--- /dev/null
+++ b/hw/ip/flash_ctrl/doc/theory_of_operation.md
@@ -0,0 +1,552 @@
+# Theory of Operation
+
+## Block Diagram
+
+![Flash Block Diagram](../doc/flash_block_diagram.svg)
+
+### Flash Protocol Controller
+
+The Flash Protocol Controller sits between the host software interface, other hardware components and the flash physical controller.
+Its primary functions are two-fold:
+*  Translate software program, erase and read requests into a high level protocol for the actual flash physical controller
+*  Act as communication interface between flash and other components in the system, such as life cycle and key manager.
+
+The flash protocol controller is not responsible for the detailed timing and waveform control of the flash, nor is it responsible for data scrambling and reliability metadata such as ICV and ECC.
+Instead, it maintains FIFOs / interrupts for the software to process data, as well as high level abstraction of region protection controls and error handling.
+
+The flash controller selects requests between the software and hardware interfaces.
+By default, the hardware interfaces have precedence and are used to read out seed materials from flash.
+The seed material is read twice to confirm the values are consistent.
+They are then forwarded to the key manager for processing.
+During this seed phase, software initiated activities are back-pressured until the seed reading is complete.
+It is recommended that instead of blindly issuing transactions to the flash controller, the software polls [`STATUS.INIT_WIP`](../data/flash_ctrl.hjson#status) until it is 0.
+
+Once the seed phase is complete, the flash controller switches to the software interface.
+Software can then read / program / erase the flash as needed.
+
+#### RMA Entry Handling
+
+When an RMA entry request is received from the life cycle manager, the flash controller waits for any pending flash transaction to complete, then switches priority to the hardware interface.
+The flash controller then initiates RMA entry process and notifies the life cycle controller when it is complete.
+The RMA entry process wipes out all data, creator, owner and isolated partitions.
+
+After RMA completes, the flash controller is [disabled](#flash-access-disable).
+When disabled the flash protocol controller registers can still be accessed.
+However, flash memory accesses are not allowed, either directly by the host or indirectly through flash protocol controller initiated transactions.
+It is expected that after an RMA transition, the entire system will be rebooted.
+
+
+#### Initialization
+
+The flash protocol controller is initialized through [`INIT`](../data/flash_ctrl.hjson#init).
+When initialization is invoked, the flash controller requests the address and data scrambling keys from an external entity, [otp_ctrl](../../otp_ctrl/README.md#interface-to-flash-scrambler) in this case.
+
+After the scrambling keys are requested, the flash protocol controller reads the root seeds out of the [secret partitions](#secret-information-partitions) and sends them to the key manager.
+Once the above steps are completed, the read buffers in the flash physical controller are enabled for operation.
+
+#### RMA Entry
+During RMA entry, the flash controller "wipes" the contents of the following:
+- Creator partition
+- Owner partition
+- Isolated partition
+- All data partitions
+
+This process ensures that after RMA there is no sensitive information left that can be made use of on the tester.
+As stated previously, once RMA entry completes, the flash memory can no longer be accessed, either directly or indirectly.
+The flash controller registers however, remain accessible for status reads and so forth, although new operations cannot be issued.
+
+#### Memory Protection
+
+Flash memory protection is handled differently depending on what type of partition is accessed.
+
+For data partitions, software can configure a number of memory protection regions such as [`MP_REGION_CFG_0`](../data/flash_ctrl.hjson#mp_region_cfg_0).
+For each region, software specifies both the beginning page and the number of pages that belong to that region.
+Software then configures the access privileges for that region.
+Finally, each region can be activated or de-activated from matching through [`MP_REGION_CFG_0.EN`](../data/flash_ctrl.hjson#mp_region_cfg_0).
+
+Subsequent accesses are then allowed or denied based on the defined rule set.
+Similar to RISC-V PMP, if two regions overlap, the region with the lower index has higher priority.
+
+For information partitions, the protection is done per individual page.
+Each page can be configured with access privileges.
+As a result, software does not need to define a start and end page for information partitions.
+See [`BANK0_INFO0_PAGE_CFG_0`](../data/flash_ctrl.hjson#bank0_info0_page_cfg_0) as an example.
+
+#### Bank Erase Protection
+
+Unlike read, program and page erase operations, the bank erase command is the only one that can be issued at a bank level.
+Because of this, bank erase commands are not guarded by the typical [memory protection mechanisms](#memory-protection).
+
+Instead, whether bank erase is allowed is controlled by [`MP_BANK_CFG_SHADOWED`](../data/flash_ctrl.hjson#mp_bank_cfg_shadowed), where there is a separate configuration bit per bank.
+When the corresponding bit is set, that particular bank is permitted to have bank level operations.
+
+The specific behavior of what is erased when bank erase is issued is flash memory dependent and thus can vary by vendor and technology.
+[This section](#flash-bank-erase) describes the general behavior and how open source modeling is done.
+
+#### Memory Protection for Key Manager and Life Cycle
+
+While memory protection is largely under software control, certain behavior is hardwired to support key manager secret partitions and life cycle functions.
+
+Software can only control the accessibility of the creator secret seed page under the following condition(s):
+*  life cycle sets provision enable.
+*  OTP indicates the seeds are not locked.
+
+Software can only control the accessibility of the owner secret seed page under the following condition(s):
+*  life cycle sets provision enable.
+
+During life cycle RMA transition, the software configured memory protection for both data and information partitions is ignored.
+Instead, the flash controller assumes a default accessibility setting that allows it to secure the chip and transition to RMA.
+
+#### Program Resolution
+
+Certain flash memories place restrictions on the program window.
+This means the flash accepts program beats only if all beats belong to the same address window.
+Typically, this boundary is nicely aligned (for example, 16 words, 32 words) and is related to how the flash memory amortizes the program operation over nearby words.
+
+To support this function, the flash controller errors back anytime the start of the program beat is in a different window from the end of the program beat.
+The valid program range is thus the valid program resolution for a particular memory.
+
+This information is not configurable but instead decided at design time and is exposed as a readable status.
+
+#### Erase Suspend
+
+The flash controller supports erase suspend through [`ERASE_SUSPEND`](../data/flash_ctrl.hjson#erase_suspend).
+This allows the software to interrupt an ongoing erase operation.
+
+The behavior of what happens to flash contents when erase is suspended is vendor defined; however, generally it can be assumed that the erase would be incomplete.
+It is then up to the controlling software to take appropriate steps to erase again at a later time.
+
+#### Additional Flash Attributes
+
+There are certain attributes provisioned in [`MP_REGION_CFG_0`](../data/flash_ctrl.hjson#mp_region_cfg_0) that are not directly used by the open source protocol or physical controllers.
+
+Instead, these attributes are fed to the vendor flash module on a per-page or defined boundary basis.
+Currently there is only one such attribute [`MP_REGION_CFG_0.HE`](../data/flash_ctrl.hjson#mp_region_cfg_0).
+
+#### Idle Indication to External Power Manager
+
+The flash controller provides an idle indication to an external power manager.
+This idle indication does not mean the controller is doing "nothing", but rather the controller is not doing anything "stateful", e.g. program or erase.
+
+This is because an external power manager event (such as shutting off power) while a flash stateful transaction is ongoing may be damaging to the vendor flash module.
+
+#### Flash Code Execution Handling
+
+Flash can be used to store both data and code.
+To support separate access privileges between data and code, the flash protocol controller provides [`EXEC`](../data/flash_ctrl.hjson#exec) for software control.
+
+If software programs [`EXEC`](../data/flash_ctrl.hjson#exec) to `0xa26a38f7`, code fetch from flash is allowed.
+If software programs [`EXEC`](../data/flash_ctrl.hjson#exec) to any other value, code fetch from flash results in an error.
+
+The flash protocol controller distinguishes code / data transactions through the [instruction type attribute](../../lc_ctrl/README.md#usage-of-user-bits) of the TL-UL interface.
+
+#### Flash Errors and Faults
+
+The flash protocol controller maintains 3 different categories of observed errors and faults.
+In general, errors are considered recoverable and primarily geared towards problems that could have been caused by software or that occurred during a software initiated operation.
+Errors can be found in [`ERR_CODE`](../data/flash_ctrl.hjson#err_code).
+
+Faults, on the other hand, represent error events that are unlikely to have been caused by software and represent a major malfunction of the system.
+
+Faults are further divided into two categories:
+- Standard faults
+- Custom faults
+
+Standard faults represent errors that occur in the standard structures of the design, for example sparsely encoded FSMs, duplicated counters and the bus transmission integrity scheme.
+
+Custom faults represent custom errors, primarily errors generated by the life cycle management interface, the flash storage integrity interface and the flash macro itself.
+
+See [flash escalation](#flash-escalation) for further differentiation between standard and custom faults.
+
+#### Transmission Integrity Faults
+
+Since the flash controller has multiple interfaces for access, transmission integrity failures can manifest in different ways.
+
+There are 4 interfaces:
+- host direct access to flash controller [register files](#host-direct-reg).
+- host direct access to [flash macro](#host-direct-macro)
+- host / software initiated flash controller access to [flash macro (read / program / erase)](#host-controller-op)
+- life cycle management interface / hardware initiated flash controller access to [flash macro (read / program / erase)](#hw-controller-op)
+
+The impact of transmission integrity of each interface is described below.
+
+##### Host Direct Access to Flash Controller Register Files {#host-direct-reg}
+This category of transmission integrity behaves identically to other modules.
+A bus transaction, when received, is checked for command and data payload integrity.
+If an integrity error is seen, the issuing bus host receives an in-band error response and a fault is registered in [`STD_FAULT_STATUS.REG_INTG_ERR`](../data/flash_ctrl.hjson#std_fault_status).
+
+##### Host Direct Access to Flash Macro {#host-direct-macro}
+Flash can only be read by the host.
+The transmission integrity scheme used is end-to-end, so integrity generated inside the flash is fed directly to the host.
+It is the host's responsibility to check for integrity correctness and react accordingly.
+
+##### Host / Software Initiated Access to Flash Macro {#host-controller-op}
+Since controller operations are initiated through writes to the register file, the command check is identical to host direct access to [regfiles](#host-direct-reg).
+Controller reads behave similarly to [host direct access to macro](#host-direct-macro): the read data and its associated integrity are returned through the controller read FIFO for the initiating host to handle.
+
+For program operations, the write data and its associated integrity are stored and propagated through the flash protocol and physical controllers.
+Prior to packing the data for final flash program, the data is then checked for integrity correctness.
+If the data integrity is incorrect, an in-band error response is returned to the initiating host and an error is registered in [`ERR_CODE.PROG_INTG_ERR`](../data/flash_ctrl.hjson#err_code).
+An error is also registered in [`STD_FAULT_STATUS.PROG_INTG_ERR`](../data/flash_ctrl.hjson#std_fault_status) to indicate that a fatal fault has occurred.
+
+The reasons a program error is registered in two locations are two-fold:
+- It is registered in [`ERR_CODE`](../data/flash_ctrl.hjson#err_code) so software can discover during operation status that a program has failed.
+- It is registered in [`STD_FAULT_STATUS`](../data/flash_ctrl.hjson#std_fault_status) because transmission integrity failures represent a fatal failure in the standard structure of the design, something that should never happen.
+
+##### Life Cycle Management Interface / Hardware Initiated Access to Flash Macro {#hw-controller-op}
+The life cycle management interface issues transactions directly to the flash controller and does not perform a command payload integrity check.
+
+For read operations, the read data and its associated integrity are directly checked by the life cycle management interface.
+If an integrity error is seen, it is registered in [`FAULT_STATUS.LCMGR_INTG_ERR`](../data/flash_ctrl.hjson#fault_status).
+
+For program operations, the program data and its associated integrity are propagated into the flash controller.
+If an integrity error is seen, an error is registered in [`FAULT_STATUS.PROG_INTG_ERR`](../data/flash_ctrl.hjson#fault_status).
+
+#### ECC and ICV Related Read Errors
+
+In addition to transmission integrity errors described above, the flash can also emit read errors based on [ECC and ICV checks](#flash-ecc-and-icv).
+
+Flash reliability ECC errors (multi-bit errors) and ICV errors (integrity check errors) are both reflected as in-band errors to the entity that issued the transaction.
+That means if a host direct read, controller initiated read or hardware initiated read encounters one of these errors, the error is directly reflected in the operation status.
+
+Further, reliability ECC / integrity check errors are also captured in [`FAULT_STATUS`](../data/flash_ctrl.hjson#fault_status) and can be used to generate fatal alerts.
+The reason these are not captured in [`STD_FAULT_STATUS`](../data/flash_ctrl.hjson#std_fault_status) is because 1 or 2 bit errors can occur in real usage due to environmental conditions, thus they do not belong to the standard group of structural errors.
+If we assume 2-bit errors can occur, then software must have a mechanism to recover from the error instead of [escalation](#flash-escalation).
+
+#### Flash Escalation
+
+Flash has two sources of escalation - global and local.
+
+Global escalation is triggered by the life cycle controller through `lc_escalate_en`.
+Local escalation is triggered by the standard faults of flash, seen in [`STD_FAULT_STATUS`](../data/flash_ctrl.hjson#std_fault_status).
+Local escalation is not configurable and automatically triggers when any fault in this subset is seen.
+
+For the escalation behavior, see [flash access disable](#flash-access-disable).
+
+#### Flash Access Disable
+
+Flash access can be disabled through global escalation trigger, local escalation trigger, rma process completion or software command.
+The escalation triggers are described [here](#flash-escalation).
+The software command to disable flash can be found in [`DIS`](../data/flash_ctrl.hjson#dis).
+The description for rma entry can be found [here](#rma-entry-handling).
+
+When disabled, the flash has a layered response:
+- The flash protocol controller [memory protection](#memory-protection) errors back all controller initiated operations.
+- The host-facing tlul adapter errors back all host initiated operations.
+- The flash physical controller completes any existing stateful operations (program or erase) and drops all future flash transactions.
+- The flash protocol controller arbiter completes any existing software issued commands and enters a disabled state where no new transactions can be issued.
+
+
+### Flash Physical Controller
+
+The Flash Physical Controller is the wrapper module that contains the actual flash memory instantiation.
+It is responsible for arbitrating high level protocol commands (such as read, program, erase) as well as any additional security (scrambling) and reliability (ECC) features.
+The contained vendor wrapper module is then responsible for converting high level commands into low level signaling and timing specific to a particular flash vendor.
+The vendor wrapper module is also responsible for any BIST, redundancy handling, remapping features or custom configurations required for the flash.
+
+The scramble keys are provided by an external static block such as the OTP.
+
+#### Host and Protocol Controller Handling
+
+Both the protocol controller and the system host converge on the physical controller.
+The protocol controller has read access to all partitions as well as program and erase privileges.
+The host on the other hand, can only read the data partitions.
+
+Even though the host has less access to flash, it is prioritized when competing against the protocol controller for access.
+When a host request and a protocol controller request arrive at the same time, the host is favored and granted.
+Every time the protocol controller loses such an arbitration, it increases an arbitration lost count.
+Once this lost count reaches 5, the protocol controller is favored.
+This ensures a stream of host activity cannot deny protocol controller access (for example a tight polling loop).
+
+#### Flash Bank Erase Behavior {#flash-bank-erase}
+
+This section describes the open source modeling of flash memory.
+The actual flash memory behavior may differ; consult the specific vendor or technology specification for details.
+
+When a bank erase command is issued and allowed, see [bank erase protection](#bank-erase-protection), the erase behavior is dependent on [`CONTROL.PARTITION_SEL`](../data/flash_ctrl.hjson#control).
+- If data partition is selected, all data in the data partition is erased.
+- If info partition is selected, all data in the data partition is erased AND all data in the info partitions (including all info types) is also erased.
+
+#### Flash Scrambling
+
+Flash scrambling is built using the [XEX tweakable block cipher](https://en.wikipedia.org/wiki/Disk_encryption_theory#Xor%E2%80%93encrypt%E2%80%93xor_(XEX)).
+
+When a read transaction is sent to flash, the following steps are taken:
+*  The tweak is calculated using the transaction address and a secret address key through a Galois multiplier.
+*  The data content is read out of flash.
+*  If the data content is scrambled, the tweak is XOR'd with the scrambled text and then decrypted through the PRINCE block cipher using a secret data key.
+*  The output of the PRINCE cipher is XOR'd again with the tweak and the final results are presented.
+*  If the data content is not scrambled, the PRINCE cipher and XOR steps are skipped and data provided directly back to the requestor.
+
+When a program transaction is sent to flash, the same steps are taken if the address in question has scrambling enabled.
+During a program, the text is scrambled through the PRINCE block cipher.
+
+Scramble enablement is done differently depending on the type of partitions.
+*  For data partitions, the scramble enablement is done on contiguous page boundaries.
+   *  Software has the ability to configure these regions and whether scramble is enabled.
+*  For information partitions, the scramble enablement is done on a per page basis.
+   *  Software can configure for each page whether scramble is enabled.
+
+#### Flash ECC and ICV
+
+Flash supports both ECC (error correction) and ICV (integrity check value).
+While the two are used for different functions, they are implemented as two separate ECCs, thus flash supports two types of ECC.
+
+ICV is an integrity check, implemented as an ECC, used to detect whether the de-scrambled data has been modified.
+The other is a reliability ECC used for error detection and correction on the whole flash word.
+
+The key differentiation here is that ICV is used only for detection, while the real error correction can correct single bit errors.
+Both ICV and ECC are configurable based on the various page and memory property configurations.
+
+##### Overall ICV and ECC Application
+
+The following diagram shows how the various ICV / ECC tags are applied and used through the life of a transaction.
+![Flash ECC_LIFE](../doc/flash_integrity.svg).
+
+Note that the ICV (integrity ECC) is calculated over the descrambled data and is only 4-bits, while the reliability ECC is calculated over both the scrambled data and the ICV.
+
+##### ICV
+
+The purpose of the ICV (integrity check value, implemented as an ECC) is to emulate end-to-end integrity like the other memories.
+This is why the ICV is calculated over the descrambled data, as it can be stored alongside for continuous checks.
+When descrambled data is returned to the host, the ICV is used to validate the data is correct.
+
+Since the flash may not always have the capacity to store both the ICV and the reliability ECC, the ICV is truncated, as it is not used for error correction.
+
+##### Reliability ECC
+
+Similar to scrambling, the reliability ECC is enabled based on an address decode.
+The ECC for flash is chosen such that a fully erased flash word has valid ECC.
+Likewise a flash word that is completely 0 is also valid ECC.
+
+Unlike the integrity ECC, the reliability ECC is actually used for error correction if an accidental bit-flip is seen; it is thus fully stored and not truncated.
+
+ECC enablement is done differently depending on the type of partitions.
+*  For data partitions, the ECC enablement is done on contiguous page boundaries.
+   *  Software has the ability to configure these regions and whether ECC is enabled.
+*  For information partitions, the ECC enablement is done on a per page basis.
+   *  Software can configure for each page whether ECC is enabled.
+
+##### Scrambling Consistency
+
+The flash physical controller does not keep a history of when a particular memory location has scrambling enabled or disabled.
+This means if a memory location was programmed while scrambled, disabling scrambling and then reading it back will result in garbage.
+Similarly, if a location was programmed while non-scrambled, enabling scrambling and then reading it back will also result in garbage.
+
+It is thus the programmer's responsibility to maintain a consistent definition of whether a location is scrambled.
+It is also highly recommended in a normal use case to set up scramble and non-scramble regions and not change them further.
+
+#### Flash Read Pipeline
+
+Since the system host reads directly from the flash for instructions, it is critical to not add significant latency during read, especially if de-scrambling is required.
+As such, the flash read is actually a two stage pipeline, where each stage can take multiple cycles.
+
+Additionally, since the flash word size is typically larger than the bus word, recently read flash entries are locally cached.
+The cache behaves as a highly simplified read-only-cache and holds by default 4 flash words per flash bank.
+
+When a read transaction is sent to flash, the following steps are taken:
+*  A check is performed against the local cache
+   * If there is a hit (either the entry is already in cache, or the entry is currently being processed), the transaction is immediately forwarded to the response queue.
+   * If there is not a hit, an entry in the local cache is selected for allocation (round robin arbitration) and a flash read is issued.
+*  When the flash read completes, its descrambling attributes are checked:
+   * If descrambling is required, the read data begins the descrambling phase - at this time, a new flash read can be issued for the following transaction.
+   * If descrambling is not required, the descrambling phase is skipped and the transaction is pushed to the response queue.
+*  When the descrambling is complete, the descrambled text is pushed to the response queue.
+
+The following diagram shows how the flash read pipeline timing works.
+![Flash Read Pipeline](../doc/flash_read_pipeline.svg)
+
+
+In this example, the first two host requests trigger a full sequence.
+The third host request immediately hits in the local cache and responds in order after the first two.
+
+#### Flash Buffer
+
+The flash buffer is a small read-only memory that holds multiple entries of recently read flash words.
+This is needed when the flash word is wider than a bus word.
+The flash access time is amortized across the entire flash word if software accesses in a mostly
+linear sequence.
+
+The flash buffer has a round robin replacement policy when more flash words are read.
+When an erase / program is issued to the flash, the entries are evicted to ensure new words are fetched.
+
+When a page erase / program is issued to a flash bank, only entries that fall into that address range are evicted.
+When a bank erase is issued, then all entries are evicted.
+
+The flash buffer is only enabled after [`INIT`](../data/flash_ctrl.hjson#init) is invoked.
+When an RMA entry sequence is received, the flash buffers are disabled.
+
+As an example, assume a flash word is made up of 2 bus words.
+Assume also the following address to word mapping:
+- Address 0 - flash word 0, bus word 0 / bus word 1
+- Address 2 - flash word 1, bus word 2 / bus word 3
+
+When software reads bus word 1, the entire flash word 0 is captured into the flash buffer.
+When software comes back to read bus word 0, instead of accessing the flash again, the data is retrieved directly from the buffer.
+
+The recently read entries store both the de-scrambled data and the [integrity ECC](#icv).
+The [reliability ECC](#reliability-ecc) is not stored because the small buffer is purely flip-flop based and does not have storage reliability concerns like the main flash macro.
+
+When a read hits in the flash buffer, the integrity ECC is checked against the de-scrambled data and, if the check fails, an error is returned to the initiating entity, whether it is the controller itself or a host.
+
+
+#### Accessing Information Partition
+
+The information partition uses the same address scheme as the data partition - which is directly accessible by software.
+This means the address of page{N}.word{M} is the same no matter which type of partition is accessed.
+
+Which partition a specific transaction accesses is denoted through a separate field [`CONTROL.PARTITION_SEL`](../data/flash_ctrl.hjson#control) in the [`CONTROL`](../data/flash_ctrl.hjson#control) register.
+If [`CONTROL.PARTITION_SEL`](../data/flash_ctrl.hjson#control) is set, then the information partition is accessed.
+If [`CONTROL.PARTITION_SEL`](../data/flash_ctrl.hjson#control) is not set, then the corresponding word in the data partition is accessed.
+
+Flash scrambling, if enabled, also applies to information partitions.
+It may be required for manufacturers to directly inject data into specific pages of flash information partitions via die contacts.
+For these pages, scramble shall be permanently disabled as the manufacturer should not be aware of scrambling functions.
+
+##### JTAG Connection
+
+The flash physical controller provides a JTAG connection to the vendor flash module.
+The vendor flash module can use this interface to build a testing setup or to provide backdoor access for debug.
+
+Due to the ability of this connection to bypass access controls, this connection is modulated by life cycle and only enabled when non-volatile memory debug (`lc_nvm_debug_en`) is allowed in the system.
+
+## Flash Default Configuration
+Since the flash controller is highly dependent on the specific flavor of flash memory chosen underneath, its configuration can vary widely between different integrations.
+
+This section details the default settings used by the flash controller:
+* Number of banks: 2
+* Number of data partition pages per bank: 256
+* [Program resolution](#program-resolution): 8 flash words
+* Flash word data bits: 64
+* Flash word metadata bits: 8
+* ECC choice: Hamming code SECDED
+* Information partition types: 3
+* Number of information partition type 0 pages per bank: 10
+* Number of information partition type 1 pages per bank: 1
+* Number of information partition type 2 pages per bank: 2
+* Secret partition 0 (used for creator): Bank 0, information partition 0, page 1
+* Secret partition 1 (used for owner): Bank 0, information partition 0, page 2
+* Isolated partition: Bank 0, information partition 0, page 3
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/flash_ctrl.hjson#interfaces)
+
+### Signals
+
+In addition to the interrupts and bus signals, the table below lists the flash controller functional I/Os.
+
+Signal                     | Direction      | Description
+------------------------   |-----------     |---------------
+`lc_creator_seed_sw_rw_en` | `input`        | Indication from `lc_ctrl` that software is allowed to read/write creator seed.
+`lc_owner_seed_sw_rw_en`   | `input`        | Indication from `lc_ctrl` that software is allowed to read/write owner seed.
+`lc_seed_hw_rd_en`         | `input`        | Indication from `lc_ctrl` that hardware is allowed to read creator / owner seeds.
+`lc_iso_part_sw_rd_en`     | `input`        | Indication from `lc_ctrl` that software is allowed to read the isolated partition.
+`lc_iso_part_sw_wr_en`     | `input`        | Indication from `lc_ctrl` that software is allowed to write the isolated partition.
+`lc_escalate_en`           | `input`        | Escalation indication from `lc_ctrl`.
+`lc_nvm_debug_en`          | `input`        | Indication from `lc_ctrl` that non-volatile memory debug is allowed.
+`core_tl`                  | `input/output` | TL-UL interface used to access `flash_ctrl` registers for activating program / erase and reads to information partitions.
+`prim_tl`                  | `input/output` | TL-UL interface used to access the vendor flash memory proprietary registers.
+`mem_tl`                   | `input/output` | TL-UL interface used by host to access the vendor flash memory directly.
+`OTP`                      | `input/output` | Interface used to request scrambling keys from `otp_ctrl`.
+`rma_req`                  | `input`        | rma entry request from `lc_ctrl`.
+`rma_ack`                  | `output`       | rma entry acknowledgement to `lc_ctrl`.
+`rma_seed`                 | `input`        | rma entry seed.
+`pwrmgr`                   | `output`       | Idle indication to `pwrmgr`.
+`keymgr`                   | `output`       | Secret seed bus to `keymgr`.
+
+In addition to the functional IOs, there is a set of signals that are directly connected to the vendor flash module.
+
+Signal                     | Direction      | Description
+------------------------   |-----------     |---------------
+`scan_en`                  | `input`        | scan enable
+`scanmode`                 | `input`        | scan mode
+`scan_rst_n`               | `input`        | scan reset
+`flash_bist_enable`        | `input`        | enable flash built-in-self-test
+`flash_power_down_h`       | `input`        | flash power down indication, note this is NOT a core level signal
+`flash_power_ready_h`      | `input`        | flash power ready indication, note this is NOT a core level signal
+`flash_test_mode_a`        | `input/output` | flash test mode io, note this is NOT a core level signal
+`flash_test_voltage_h`     | `input/output` | flash test voltage, note this is NOT a core level signal
+`flash_alert`              | `output`       | flash alert outputs directly to AST
+
+
+
+## Design Details
+
+### Flash Protocol Controller Description
+
+The flash protocol controller uses a simple FIFO interface to communicate between the software and flash physical controller.
+There is a read FIFO for read operations, and a program FIFO for program operations.
+Note, this means flash can be read both through the controller and the main bus interface.
+This may prove useful if the controller wishes to allocate specific regions to HW FSMs only, but is not a necessary feature.
+
+When software initiates a read transaction of a programmable number of flash words, the flash controller will fill up the read FIFO for software to consume.
+Likewise, when software initiates a program transaction, software will fill up the program FIFO for the controller to consume.
+
+The controller is designed such that the overall number of words in a transaction can significantly exceed the FIFO depth.
+In the case of read, once the FIFO is full, the controller will cease writing more entries and wait for software to consume the contents (an interrupt will be triggered to the software to alert it to such an event).
+In the case of program, the controller will stop writing to flash once all existing data is consumed - it will likewise trigger an interrupt to software to prepare more data.
+See detailed steps in theory of operation.
+The following is a diagram of the controller construction as well as its overall connectivity with the flash module.
+
+![Flash Protocol Controller](../doc/flash_protocol_controller.svg)
+
+
+### Host Read
+
+Unlike controller initiated reads, host reads have separate rdy / done signals to ensure transactions can be properly pipelined.
+As host reads are usually tied to host execution upstream, additional latency can severely harm performance and is not desired.
+The expected waveform from the perspective of the physical controller is shown below.
+
+```wavejson
+{signal: [
+  {name: 'clk_i',           wave: 'p..............'},
+  {name: 'rst_ni',          wave: '0.1............'},
+  {name: 'host_req_i',      wave: '0..10.1...0....'},
+  {name: 'host_addr_i',     wave: 'x..3x.3.33x....', data: ['Adr0', 'Adr1', 'Adr2', 'Adr3']},
+  {name: 'host_req_rdy_o',  wave: '1...0..1.......'},
+  {name: 'host_req_done_o', wave: '0...10..1110...'},
+  {name: 'host_rdata_o',    wave: 'x...4x..444x...',data: ['Dat0', 'Dat1', 'Dat2', 'Dat3']},
+]}
+```
+
+The `host_req_done_o` is always single cycle pulsed and upstream logic is expected to always accept and correctly handle the return.
+In the same cycle that the return data is posted, a new command / address can be accepted.
+While the example shows flash reads completing in back to back cycles, this is typically not the case.
+
+### Controller Read
+
+Unlike host reads, controller reads are not as performance critical and do not have command / data pipeline requirements.
+Instead, the protocol controller will hold the read request and address lines until the done is seen.
+Once the done is seen, the controller then transitions to the next read operation.
+The expected waveform from the perspective of the physical controller is shown below.
+
+```wavejson
+{signal: [
+  {name: 'clk_i',                 wave: 'p..............'},
+  {name: 'rst_ni',                wave: '0.1............'},
+  {name: 'flash_ctrl_i.req',      wave: '0..1.....0.....'},
+  {name: 'flash_ctrl_i.addr',     wave: 'x..3..3..x.3..x', data: ['Adr0', 'Adr1', 'Adr2']},
+  {name: 'flash_ctrl_i.rd',       wave: '0..1.....0.1..0'},
+  {name: 'flash_ctrl_o.rd_done',  wave: '0....10.10...10'},
+  {name: 'flash_ctrl_o.rdata',    wave: 'x....4x.4x...4x', data: ['Dat0', 'Dat1', 'Dat2']},
+]}
+```
+
+### Controller Program
+
+Program behavior is similar to reads.
+The protocol controller will hold the request, address and data lines until the programming is complete.
+The expected waveform from the perspective of the physical controller is shown below.
+
+```wavejson
+{signal: [
+  {name: 'clk_i',                  wave: 'p..............'},
+  {name: 'rst_ni',                 wave: '0.1............'},
+  {name: 'flash_ctrl_i.req',       wave: '0..1.....0.....'},
+  {name: 'flash_ctrl_i.addr',      wave: 'x..3..3..x.3..x', data: ['Adr0', 'Adr1', 'Adr2']},
+  {name: 'flash_ctrl_i.prog',      wave: '0..1.....0.1..0'},
+  {name: 'flash_ctrl_i.prog_data', wave: 'x..4..4..x.4..x', data: ['Dat0', 'Dat1', 'Dat2']},
+  {name: 'flash_ctrl_o.prog_done', wave: '0....10.10...10'},
+]}
+```
diff --git a/hw/ip/gpio/README.md b/hw/ip/gpio/README.md
index 50b7802794e0e..238f80413c455 100644
--- a/hw/ip/gpio/README.md
+++ b/hw/ip/gpio/README.md
@@ -49,236 +49,3 @@ in output value being reflected in the input register.
 
 See the Design Details section for more details on output, input, and
 interrupt control.
-
-# Theory of Operations
-
-## Block Diagram
-
-![GPIO Block Diagram](./doc/gpio_blockdiagram.svg)
-
-The block diagram above shows the `DATA_OUT` and `DATA_OE` registers
-managed by hardware outside of the auto-generated register file.
-For reference, it also shows the assumed connections to pads in
-the top level netlist.
-
-## Hardware Interfaces
-
-* [Interface Tables](data/gpio.hjson#interfaces)
-
-## Design Details
-
-### GPIO Output logic
-
-![GPIO Output Diagram](./doc/gpio_output.svg)
-
-The GPIO module maintains one 32-bit output register `DATA_OUT` with two
-ways to write to it. Direct write access uses [`DIRECT_OUT`](data/gpio.hjson#direct_out), and
-masked access uses [`MASKED_OUT_UPPER`](data/gpio.hjson#masked_out_upper) and
-[`MASKED_OUT_LOWER`](data/gpio.hjson#masked_out_lower). Direct access provides full write and read
-access for all 32 bits in one register.
-
-For masked access the bits to modify are given as a mask in the upper
-16 bits of the [`MASKED_OUT_UPPER`](data/gpio.hjson#masked_out_upper) and
-[`MASKED_OUT_LOWER`](data/gpio.hjson#masked_out_lower) register write, while the data to write is
-provided in the lower 16 bits of the register write.  The hardware updates
-`DATA_OUT` with the mask so that the modification is done without software
-requiring a Read-Modify-Write.
-
-Reads of masked registers return the lower/upper 16 bits of the `DATA_OUT`
-contents. Zeros are returned in the upper 16 bits (mask field). To read
-what is on the pins, software should read the [`DATA_IN`](data/gpio.hjson#data_in) register.
-(See [GPIO Input](#gpio-input) section below).
-
-The same concept is duplicated for the output enable register `DATA_OE`.
-Direct access uses [`DIRECT_OE`](data/gpio.hjson#direct_oe), and masked access is available
-using [`MASKED_OE_UPPER`](data/gpio.hjson#masked_oe_upper) and [`MASKED_OE_LOWER`](data/gpio.hjson#masked_oe_lower).
-
-The output enable is sent to the pad control block to determine if the
-pad should drive the `DATA_OUT` value to the associated pin or not.
-
-A typical use pattern is for initialization and suspend/resume code to
-use the full access registers to set the output enables and current output
-values, then switch to masked access for both `DATA_OUT` and `DATA_OE`.
-
-For GPIO outputs that are not used (either not wired to a pin output or
-not selected for pin multiplexing), the output values are disconnected
-and have no effect on the GPIO input, regardless of output enable values.
-
-### GPIO Input
-
-The [`DATA_IN`](data/gpio.hjson#data_in) register returns the contents as seen on the
-peripheral input, typically from the pads connected to those inputs.  In the
-presence of a pin-multiplexing unit, GPIO peripheral inputs that are
-not connected to a chip input will be tied to a constant zero input.
-
-The GPIO module provides optional independent noise filter control for
-each of the 32 input signals. Each input can be independently enabled with
-the [`CTRL_EN_INPUT_FILTER`](data/gpio.hjson#ctrl_en_input_filter) (one bit per input).  This 16-cycle
-filter is applied to both the [`DATA_IN`](data/gpio.hjson#data_in) register and
-the interrupt detection logic. The timing for [`DATA_IN`](data/gpio.hjson#data_in) is still
-not instantaneous if [`CTRL_EN_INPUT_FILTER`](data/gpio.hjson#ctrl_en_input_filter) is false as there is
-top-level routing involved, but no flops are between the chip input and the
-[`DATA_IN`](data/gpio.hjson#data_in) register.
-
-The contents of [`DATA_IN`](data/gpio.hjson#data_in) are always readable and reflect the
-value seen at the chip input pad regardless of the output enable setting from
-DATA_OE. If the output enable is true (and the GPIO is connected to a
-chip-level pad), the value read from [`DATA_IN`](data/gpio.hjson#data_in) includes the
-effect of the peripheral's driven output (so will only differ from DATA_OUT if
-the output driver is unable to switch the pin or during the delay imposed
-if the noise filter is enabled).
-
-### Interrupts
-
-The GPIO module provides 32 interrupt signals to the main processor.
-Each interrupt can be independently enabled, tested, and configured.
-Following the standard interrupt guidelines in the [Comportability
-Specification](../../../doc/contributing/hw/comportability/README.md),
-the 32 bits of the [`INTR_ENABLE`](data/gpio.hjson#intr_enable) register determines whether the
-associated inputs are configured to detect interrupt events. If enabled
-via the various `INTR_CTRL_EN` registers, their current state can be
-read in the [`INTR_STATE`](data/gpio.hjson#intr_state) register. Clearing is done by writing a
-`1` into the associated [`INTR_STATE`](data/gpio.hjson#intr_state) bit field.
-
-For configuration, there are 4 types of interrupts available per bit,
-controlled with four control registers. [`INTR_CTRL_EN_RISING`](data/gpio.hjson#intr_ctrl_en_rising)
-configures the associated input for rising-edge detection.
-Similarly, [`INTR_CTRL_EN_FALLING`](data/gpio.hjson#intr_ctrl_en_falling) detects falling edge inputs.
-[`INTR_CTRL_EN_LVLHIGH`](data/gpio.hjson#intr_ctrl_en_lvlhigh) and [`INTR_CTRL_EN_LVLLOW`](data/gpio.hjson#intr_ctrl_en_lvllow)
-allow the input to be level sensitive interrupts. In theory an input can be
-configured to detect both a rising and falling edge, but there is no hardware
-assistance to indicate which edge caused the output interrupt.
-
-**Note #1:** all inputs are sent through optional noise filtering before
-being sent into interrupt detection. **Note #2:** all output interrupts to
-the processor are level interrupts as per the Comportability Specification
-guidelines. The GPIO module, if configured, converts an edge detection
-into a level interrupt to the processor core.
-
-# Programmers Guide
-
-## Initialization
-
-Initialization of the GPIO module includes the setting up of the interrupt
-configuration for each GPIO input, as well as the configuration of
-the required noise filtering. These do not provide masked access since
-they are not expected to be done frequently.
-
-```cpp
-// enable inputs 0 and 1 for rising edge detection with filtering,
-// inputs 2 and 3 for falling edge detection with filtering,
-// input 4 for both rising edge detection (no filtering)
-// and inputs 6 and 7 for active low interrupt detection
-*GPIO_INTR_ENABLE =          0b11011111;
-*GPIO_INTR_CTRL_EN_RISING =  0b00010011;
-*GPIO_INTR_CTRL_EN_FALLING = 0b00011100;
-*GPIO_INTR_CTRL_EN_LVLLOW  = 0b11000000;
-*GPIO_INTR_CTRL_EN_LVLHIGH = 0b00000000;
-*GPIO_CTRL_EN_INPUT_FILTER = 0b00001111;
-```
-
-## Common Examples
-
-This section below shows the interaction between the direct access
-and mask access for data output and data enable.
-
-```cpp
-// assume all GPIO are connected to chip pads.
-// assume a weak pullup on all pads, returning 1 if undriven.
-printf("0x%x", *GPIO_DATA_IN);          // 0xffffffff
-
-*DIRECT_OUT = 0x11223344;
-printf("0x%x", *GPIO_DIRECT_OUT);       // 0x11223344
-
-*DIRECT_OE  = 0x00ff00ff;
-printf("0x%x", *GPIO_DIRECT_OE);        // 0x00ff00ff
-
-// weak pullup still applies to undriven signals
-printf("0x%x", *GPIO_DATA_IN);          // 0xff22ff44
-
-// read of direct_out still returns DATA_OUT contents
-printf("0x%x", *GPIO_DIRECT_OUT);       // 0x11223344
-
-// try masked accesses to DATA_OUT
-*GPIO_MASKED_OUT_LOWER = 0x0f0f5566
-printf("0x%x", *GPIO_MASKED_OUT_LOWER); // 0x00003546
-printf("0x%x", *GPIO_DIRECT_OUT);       // 0x11223546
-
-*GPIO_MASKED_OUT_UPPER = 0x0f0f7788
-printf("0x%x", *GPIO_MASKED_OUT_UPPER); // 0x00001728
-printf("0x%x", *GPIO_DIRECT_OUT);       // 0x17283546
-
-// OE still applies
-printf("0x%x", *GPIO_DATA_IN);          // 0xff28ff46
-
-// manipulate OE
-*GPIO_DIRECT_OE = 0xff00ff00;
-printf("0x%x", *GPIO_DIRECT_OE);        // 0xff00ff00
-printf("0x%x", *GPIO_DATA_IN);          // 0x17ff35ff
-
-*GPIO_MASKED_OE_LOWER = 0x0f0f0f0f;
-printf("0x%x", *GPIO_MASKED_OE_LOWER);  // 0x00000f0f
-printf("0x%x", *GPIO_DIRECT_OE);        // 0xff000f0f
-printf("0x%x", *GPIO_DATA_IN);          // 0x17fff5f6
-
-*GPIO_MASKED_OE_UPPER = 0x0f0f0f0f;
-printf("0x%x", *GPIO_MASKED_OE_UPPER);  // 0x00000f0f
-printf("0x%x", *GPIO_DIRECT_OE);        // 0x0f0f0f0f
-printf("0x%x", *GPIO_DATA_IN);          // 0xf7f8f5f6
-```
-
-## Interrupt Handling
-
-This section below gives an example of how interrupt clearing works,
-assuming some events have occurred as shown in comments.
-
-```cpp
-*INTR_ENABLE = 0x000000ff;              // interrupts enabled GPIO[7:0] inputs
-printf("0b%x", *GPIO_DATA_IN);          // assume 0b00000000
-printf("0b%x", *GPIO_INTR_STATE);       // 0b00000000
-
-*INTR_CTRL_EN_RISING  = 0b00010001;     // rising detect on GPIO[0], GPIO[4]
-*INTR_CTRL_EN_FALLING = 0b00010010;     // falling detect on GPIO[1], GPIO[4]
-*INTR_CTRL_EN_LVLLOW  = 0b00001100;     // falling detect on GPIO[2], GPIO[3]
-*INTR_CTRL_EN_LVLHIGH = 0b11000000;     // falling detect on GPIO[6], GPIO[7]
-
-// already detected intr[3,2] (level low)
-printf("0b%b", *GPIO_INTR_STATE);       // 0b00001100
-
-// try and clear [3:2], fails since still active low
-*GPIO_INTR_STATE = 0b00001100;
-printf("0b%b", *GPIO_INTR_STATE);       // 0b00001100
-
-// EVENT: all bits [7:0] rising, triggers [7,6,4,0], [3,2] still latched
-printf("0b%b", *GPIO_DATA_IN);          // 0b11111111
-printf("0b%b", *GPIO_INTR_STATE);       // 0b11011101
-
-// try and clear all bits, [7,6] still detecting level high
-*GPIO_INTR_STATE = 0b11111111;
-printf("0b%b", *GPIO_INTR_STATE);       // 0b11000000
-
-// EVENT: all bits [7:0] falling, triggers [4,3,2,1], [7,6] still latched
-printf("0b%b", *GPIO_DATA_IN);          // 0b00000000
-printf("0b%b", *GPIO_INTR_STATE);       // 0b11011110
-
-// try and clear all bits, [3,2] still detecting level low
-*GPIO_INTR_STATE = 0b11111111;
-printf("0b%b", *GPIO_INTR_STATE);       // 0b00001100
-
-// write test register for all 8 events, trigger regardless of external events
-*GPIO_INTR_TEST = 0b11111111;
-printf("0b%b", *GPIO_INTR_STATE);       // 0b11111111
-
-// try and clear all bits, [3,2] still detecting level low
-*GPIO_INTR_STATE = 0b11111111;
-printf("0b%b", *GPIO_INTR_STATE);       // 0b00001100
-
-```
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_gpio.h)
-
-## Register Table
-
-* [Register Table](data/gpio.hjson#registers)
diff --git a/hw/ip/gpio/doc/programmers_guide.md b/hw/ip/gpio/doc/programmers_guide.md
new file mode 100644
index 0000000000000..b72af09cec294
--- /dev/null
+++ b/hw/ip/gpio/doc/programmers_guide.md
@@ -0,0 +1,127 @@
+# Programmer's Guide
+
+## Initialization
+
+Initialization of the GPIO module includes the setting up of the interrupt
+configuration for each GPIO input, as well as the configuration of
+the required noise filtering. These do not provide masked access since
+they are not expected to be done frequently.
+
+```cpp
+// enable inputs 0 and 1 for rising edge detection with filtering,
+// inputs 2 and 3 for falling edge detection with filtering,
+// input 4 for both rising and falling edge detection (no filtering)
+// and inputs 6 and 7 for active low interrupt detection
+*GPIO_INTR_ENABLE =          0b11011111;
+*GPIO_INTR_CTRL_EN_RISING =  0b00010011;
+*GPIO_INTR_CTRL_EN_FALLING = 0b00011100;
+*GPIO_INTR_CTRL_EN_LVLLOW  = 0b11000000;
+*GPIO_INTR_CTRL_EN_LVLHIGH = 0b00000000;
+*GPIO_CTRL_EN_INPUT_FILTER = 0b00001111;
+```
+
+## Common Examples
+
+The section below shows the interaction between the direct access
+and mask access for data output and data enable.
+
+```cpp
+// assume all GPIO are connected to chip pads.
+// assume a weak pullup on all pads, returning 1 if undriven.
+printf("0x%x", *GPIO_DATA_IN);          // 0xffffffff
+
+*GPIO_DIRECT_OUT = 0x11223344;
+printf("0x%x", *GPIO_DIRECT_OUT);       // 0x11223344
+
+*GPIO_DIRECT_OE  = 0x00ff00ff;
+printf("0x%x", *GPIO_DIRECT_OE);        // 0x00ff00ff
+
+// weak pullup still applies to undriven signals
+printf("0x%x", *GPIO_DATA_IN);          // 0xff22ff44
+
+// read of direct_out still returns DATA_OUT contents
+printf("0x%x", *GPIO_DIRECT_OUT);       // 0x11223344
+
+// try masked accesses to DATA_OUT
+*GPIO_MASKED_OUT_LOWER = 0x0f0f5566;
+printf("0x%x", *GPIO_MASKED_OUT_LOWER); // 0x00003546
+printf("0x%x", *GPIO_DIRECT_OUT);       // 0x11223546
+
+*GPIO_MASKED_OUT_UPPER = 0x0f0f7788;
+printf("0x%x", *GPIO_MASKED_OUT_UPPER); // 0x00001728
+printf("0x%x", *GPIO_DIRECT_OUT);       // 0x17283546
+
+// OE still applies
+printf("0x%x", *GPIO_DATA_IN);          // 0xff28ff46
+
+// manipulate OE
+*GPIO_DIRECT_OE = 0xff00ff00;
+printf("0x%x", *GPIO_DIRECT_OE);        // 0xff00ff00
+printf("0x%x", *GPIO_DATA_IN);          // 0x17ff35ff
+
+*GPIO_MASKED_OE_LOWER = 0x0f0f0f0f;
+printf("0x%x", *GPIO_MASKED_OE_LOWER);  // 0x0000ff0f
+printf("0x%x", *GPIO_DIRECT_OE);        // 0xff00ff0f
+printf("0x%x", *GPIO_DATA_IN);          // 0x17ff35f6
+
+*GPIO_MASKED_OE_UPPER = 0x0f0f0f0f;
+printf("0x%x", *GPIO_MASKED_OE_UPPER);  // 0x0000ff0f
+printf("0x%x", *GPIO_DIRECT_OE);        // 0xff0fff0f
+printf("0x%x", *GPIO_DATA_IN);          // 0x17f835f6
+```
+
+## Interrupt Handling
+
+The section below gives an example of how interrupt clearing works,
+assuming some events have occurred as shown in comments.
+
+```cpp
+*INTR_ENABLE = 0x000000ff;              // interrupts enabled GPIO[7:0] inputs
+printf("0b%b", *GPIO_DATA_IN);          // assume 0b00000000
+printf("0b%b", *GPIO_INTR_STATE);       // 0b00000000
+
+*INTR_CTRL_EN_RISING  = 0b00010001;     // rising detect on GPIO[0], GPIO[4]
+*INTR_CTRL_EN_FALLING = 0b00010010;     // falling detect on GPIO[1], GPIO[4]
+*INTR_CTRL_EN_LVLLOW  = 0b00001100;     // level low detect on GPIO[2], GPIO[3]
+*INTR_CTRL_EN_LVLHIGH = 0b11000000;     // level high detect on GPIO[6], GPIO[7]
+
+// already detected intr[3,2] (level low)
+printf("0b%b", *GPIO_INTR_STATE);       // 0b00001100
+
+// try and clear [3:2], fails since still active low
+*GPIO_INTR_STATE = 0b00001100;
+printf("0b%b", *GPIO_INTR_STATE);       // 0b00001100
+
+// EVENT: all bits [7:0] rising, triggers [7,6,4,0], [3,2] still latched
+printf("0b%b", *GPIO_DATA_IN);          // 0b11111111
+printf("0b%b", *GPIO_INTR_STATE);       // 0b11011101
+
+// try and clear all bits, [7,6] still detecting level high
+*GPIO_INTR_STATE = 0b11111111;
+printf("0b%b", *GPIO_INTR_STATE);       // 0b11000000
+
+// EVENT: all bits [7:0] falling, triggers [4,3,2,1], [7,6] still latched
+printf("0b%b", *GPIO_DATA_IN);          // 0b00000000
+printf("0b%b", *GPIO_INTR_STATE);       // 0b11011110
+
+// try and clear all bits, [3,2] still detecting level low
+*GPIO_INTR_STATE = 0b11111111;
+printf("0b%b", *GPIO_INTR_STATE);       // 0b00001100
+
+// write test register for all 8 events, trigger regardless of external events
+*GPIO_INTR_TEST = 0b11111111;
+printf("0b%b", *GPIO_INTR_STATE);       // 0b11111111
+
+// try and clear all bits, [3,2] still detecting level low
+*GPIO_INTR_STATE = 0b11111111;
+printf("0b%b", *GPIO_INTR_STATE);       // 0b00001100
+
+```
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_gpio.h)
+
+## Register Table
+
+* [Register Table](../data/gpio.hjson#registers)
diff --git a/hw/ip/gpio/doc/theory_of_operation.md b/hw/ip/gpio/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..bae6680995711
--- /dev/null
+++ b/hw/ip/gpio/doc/theory_of_operation.md
@@ -0,0 +1,104 @@
+# Theory of Operation
+
+## Block Diagram
+
+![GPIO Block Diagram](../doc/gpio_blockdiagram.svg)
+
+The block diagram above shows the `DATA_OUT` and `DATA_OE` registers
+managed by hardware outside of the auto-generated register file.
+For reference, it also shows the assumed connections to pads in
+the top level netlist.
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/gpio.hjson#interfaces)
+
+## Design Details
+
+### GPIO Output logic
+
+![GPIO Output Diagram](../doc/gpio_output.svg)
+
+The GPIO module maintains one 32-bit output register `DATA_OUT` with two
+ways to write to it. Direct write access uses [`DIRECT_OUT`](../data/gpio.hjson#direct_out), and
+masked access uses [`MASKED_OUT_UPPER`](../data/gpio.hjson#masked_out_upper) and
+[`MASKED_OUT_LOWER`](../data/gpio.hjson#masked_out_lower). Direct access provides full write and read
+access for all 32 bits in one register.
+
+For masked access the bits to modify are given as a mask in the upper
+16 bits of the [`MASKED_OUT_UPPER`](../data/gpio.hjson#masked_out_upper) and
+[`MASKED_OUT_LOWER`](../data/gpio.hjson#masked_out_lower) register write, while the data to write is
+provided in the lower 16 bits of the register write.  The hardware updates
+`DATA_OUT` with the mask so that the modification is done without software
+requiring a Read-Modify-Write.
+
+Reads of masked registers return the lower/upper 16 bits of the `DATA_OUT`
+contents. Zeros are returned in the upper 16 bits (mask field). To read
+what is on the pins, software should read the [`DATA_IN`](../data/gpio.hjson#data_in) register.
+(See [GPIO Input](#gpio-input) section below).
+
+The same concept is duplicated for the output enable register `DATA_OE`.
+Direct access uses [`DIRECT_OE`](../data/gpio.hjson#direct_oe), and masked access is available
+using [`MASKED_OE_UPPER`](../data/gpio.hjson#masked_oe_upper) and [`MASKED_OE_LOWER`](../data/gpio.hjson#masked_oe_lower).
+
+The output enable is sent to the pad control block to determine if the
+pad should drive the `DATA_OUT` value to the associated pin or not.
+
+A typical use pattern is for initialization and suspend/resume code to
+use the full access registers to set the output enables and current output
+values, then switch to masked access for both `DATA_OUT` and `DATA_OE`.
+
+For GPIO outputs that are not used (either not wired to a pin output or
+not selected for pin multiplexing), the output values are disconnected
+and have no effect on the GPIO input, regardless of output enable values.
+
+### GPIO Input
+
+The [`DATA_IN`](../data/gpio.hjson#data_in) register returns the contents as seen on the
+peripheral input, typically from the pads connected to those inputs.  In the
+presence of a pin-multiplexing unit, GPIO peripheral inputs that are
+not connected to a chip input will be tied to a constant zero input.
+
+The GPIO module provides optional independent noise filter control for
+each of the 32 input signals. Each input can be independently enabled with
+the [`CTRL_EN_INPUT_FILTER`](../data/gpio.hjson#ctrl_en_input_filter) (one bit per input).  This 16-cycle
+filter is applied to both the [`DATA_IN`](../data/gpio.hjson#data_in) register and
+the interrupt detection logic. The timing for [`DATA_IN`](../data/gpio.hjson#data_in) is still
+not instantaneous if [`CTRL_EN_INPUT_FILTER`](../data/gpio.hjson#ctrl_en_input_filter) is false as there is
+top-level routing involved, but no flops are between the chip input and the
+[`DATA_IN`](../data/gpio.hjson#data_in) register.
+
+The contents of [`DATA_IN`](../data/gpio.hjson#data_in) are always readable and reflect the
+value seen at the chip input pad regardless of the output enable setting from
+`DATA_OE`. If the output enable is true (and the GPIO is connected to a
+chip-level pad), the value read from [`DATA_IN`](../data/gpio.hjson#data_in) includes the
+effect of the peripheral's driven output (so will only differ from `DATA_OUT` if
+the output driver is unable to switch the pin or during the delay imposed
+if the noise filter is enabled).
+
+### Interrupts
+
+The GPIO module provides 32 interrupt signals to the main processor.
+Each interrupt can be independently enabled, tested, and configured.
+Following the standard interrupt guidelines in the [Comportability
+Specification](../../../../doc/contributing/hw/comportability/README.md),
+the 32 bits of the [`INTR_ENABLE`](../data/gpio.hjson#intr_enable) register determine whether the
+associated inputs are configured to detect interrupt events. If enabled
+via the various `INTR_CTRL_EN` registers, their current state can be
+read in the [`INTR_STATE`](../data/gpio.hjson#intr_state) register. Clearing is done by writing a
+`1` into the associated [`INTR_STATE`](../data/gpio.hjson#intr_state) bit field.
+
+For configuration, there are 4 types of interrupts available per bit,
+controlled with four control registers. [`INTR_CTRL_EN_RISING`](../data/gpio.hjson#intr_ctrl_en_rising)
+configures the associated input for rising-edge detection.
+Similarly, [`INTR_CTRL_EN_FALLING`](../data/gpio.hjson#intr_ctrl_en_falling) detects falling edge inputs.
+[`INTR_CTRL_EN_LVLHIGH`](../data/gpio.hjson#intr_ctrl_en_lvlhigh) and [`INTR_CTRL_EN_LVLLOW`](../data/gpio.hjson#intr_ctrl_en_lvllow)
+allow the input to be level sensitive interrupts. In theory an input can be
+configured to detect both a rising and falling edge, but there is no hardware
+assistance to indicate which edge caused the output interrupt.
+
+**Note #1:** all inputs are sent through optional noise filtering before
+being sent into interrupt detection. **Note #2:** all output interrupts to
+the processor are level interrupts as per the Comportability Specification
+guidelines. The GPIO module, if configured, converts an edge detection
+into a level interrupt to the processor core.
diff --git a/hw/ip/hmac/README.md b/hw/ip/hmac/README.md
index 10b807db69120..fb879966e2131 100644
--- a/hw/ip/hmac/README.md
+++ b/hw/ip/hmac/README.md
@@ -53,288 +53,3 @@ generator to derive the random number from the written seed number.
 A later update may provide an interface for external hardware IPs, such as a key
 manager, to update the secret key. It will also have
 the ability to send the digest directly to a shared internal bus.
-
-# Theory of Operations
-
-## Block Diagram
-
-![HMAC Block Diagram](./doc/hmac_block_diagram.svg)
-
-The HMAC block diagram above shows that the HMAC core converts the secret key
-registers into an inner padded key and an outer padded key which are fed to the
-hash engine when appropriate. The module also feeds the result of the first
-round message (which uses the inner padded key) from the SHA-256 hash engine
-into the 16x32b FIFO for the second round (which uses the outer padded key).
-The message length is automatically updated to reflect the size of the outer
-padded key and first round digest result for the second round. See [Design
-Details](#design-details) for more information.
-
-![SHA-256 Block Diagram](./doc/sha2_block_diagram.svg)
-
-The SHA-256 (SHA-2) block diagram shows the message FIFO inside SHA-256, hash
-registers, digest registers, and SHA-256 compression function. The message FIFO
-is not software accessible but is fed from the 16x32b FIFO seen in the HMAC
-block diagram via the HMAC core. The HMAC core can forward the message directly
-from the 16x32b FIFO if HMAC is not enabled. This message is padded with length
-appended to fit the 512-bit block size as described in the [SHA-256
-specification][sha256-spec].
-
-With the 512-bit block, the compress function runs 64 rounds to calculate the
-block hash, which is stored in the hash registers above. After 64 rounds are
-completed, the SHA-256 updates the digest registers with the addition of the
-hash result and the previous digest registers.
-
-## Hardware Interface
-
-* [Interface Tables](data/hmac.hjson#interfaces)
-
-## Design Details
-
-### SHA-256 message feed and pad
-
-A message is fed via a memory-mapped message FIFO. Any write access to the
-memory-mapped window [`MSG_FIFO`](data/hmac.hjson#msg_fifo) updates the message FIFO. If the FIFO is full,
-the HMAC block will block any writes leading to back-pressure on the
-interconnect (as opposed to dropping those writes or overwriting existing FIFO
-contents). It is recommended this back-pressure is avoided by not writing to the
-memory-mapped message FIFO when it is full. To avoid doing so, software can
-read the [`STATUS.fifo_full`](data/hmac.hjson#status) register.
-
-The logic assumes the input message is little-endian.
-It converts the byte order of the word right before writing to SHA2 storage as SHA2 treats the incoming message as big-endian.
-If SW wants to convert the message byte order, SW should set [`CFG.endian_swap`](data/hmac.hjson#cfg) to **1**.
-The byte order of the digest registers, from [`DIGEST_0`](data/hmac.hjson#digest_0) to [`DIGEST_7`](data/hmac.hjson#digest_7) can be configured with [`CFG.digest_swap`](data/hmac.hjson#cfg).
-
-See the table below:
-
-```
-Input Msg #0: 010203h
-Input Msg #1: 0405h
-```
-
-endian_swap     | 0         | 1
-----------------|-----------|-----------
-Push to SHA2 #0 | 03020105h | 01020304h
-Push to SHA2 #1 | 00000004h | 00000005h
-
-
-Small writes to [`MSG_FIFO`](data/hmac.hjson#msg_fifo) are coalesced with into 32-bit words by the [packer logic]({{< relref "hw/ip/prim/doc/prim_packer" >}}).
-These words are fed into the internal message FIFO.
-While passing writes to the packer logic, the block also counts the number of bytes that are being passed.
-This gives the received message length, which is used in HMAC and SHA-256 as part of the hash computation.
-
-The SHA-256 module computes an intermediate hash for every 512-bit block.
-The message must be padded to fill 512-bit blocks. This is done with an initial
-**1** bit after the message bits with a 64-bit message length at the end and
-enough **0** bits in the middle to result in a full block.The [SHA-256
-specification][sha256-spec] describes this in more detail. An example is shown
-below. The padding logic handles this so software only needs to write the actual
-message bits into the FIFO.
-
-![SHA-256 Message Padding](./doc/message_padding.svg)
-
-For instance, if the message is empty, the message length is 64-bit 0. In this
-case, the padding logic gives `0x80000000` into the SHA-256 module first. Then
-it sends (512 - 32 - 64)/32, 13 times of `0x00000000` for Padding `0x00`.
-Lastly, it returns the message length which is 64-bit `0x00000000_00000000`. If
-incomplete words are written, the packet logic appends `0x80` in the proper byte
-location.  Such as `0xXX800000` for the message length % 4B == 1 case.
-
-### SHA-256 computation
-
-The SHA-256 engine receives 16 32-bit words from the message FIFO or the HMAC
-core then begins 64 rounds of the hash computation which is also called
-*compression*. In each round, the compression function fetches 32 bits from the
-buffer and computes the internal variables. The first 16 rounds are fed by the
-words from the message FIFO or the HMAC core. Input for later rounds comes from
-shuffling the given 512-bit block. Details are well described in
-[Wikipedia][sha2-wikipedia] and the [SHA-256 specification][sha256-spec].
-
-[sha2-wikipedia]: https://en.wikipedia.org/wiki/SHA-2
-
-With the given hash values, 4 byte message, and round constants, the compression
-function computes the next round hash values. The 64 32-bit round constants
-are hard-wired in the design. After the compression at the last round is
-finished, the resulting hash values are added into the digest. The digest, again,
-is used as initial hash values for the next 512-bit block compression. During
-the compression rounds, it doesn't fetch data from the message FIFO. The
-software can push up to 16 entries to the FIFO for the next hash computation.
-
-### HMAC computation
-
-![Two steps of HMAC](./doc/hmac_dataflow.svg)
-
-HMAC can be used with any hash algorithm but this version of HMAC IP only uses
-SHA-256. The first phase of HMAC calculates the SHA-256 hash of the inner
-secret key concatenated with the actual message to be authenticated. This inner
-secret key is created with a 256-bit (hashed) secret key and `0x36` pad.
-
-```verilog
-    inner_pad_key = {key[255:0], 256'h0} ^ {64{8'h36}} // big-endian
-```
-
-The message length used in the SHA-256 module is calculated by the HMAC core by
-adding 512 to the original message length (to account for the length of
-`inner_pad_key`, which has been prepended to the message).
-
-The first round digest is fed into the second round in HMAC. The second round
-computes the hash of the outer secret key concatenated with the first round
-digest. As the result of SHA-256 is 256-bits, it must be padded to fit into
-512-bit block size.
-
-```verilog
-    outer_pad_key = {key[255:0], 256'h0} ^ {64{8'h5c}} // big-endian
-```
-
-In the second round, the message length is a fixed 768 bits.
-
-HMAC assumes the secret key is 256-bit. The onus is on software to shrink the
-key to 256-bit using a hash function when setting up the HMAC. For example,
-common key sizes may be 2048-bit or 4096-bit. Software must hash these and
-write the hashed results to the HMAC.
-
-### Performance in SHA-256 mode and HMAC mode
-
-The SHA-256 hash algorithm computes 512 bits of data at a time. The first 16
-rounds need the actual 16 x 32-bit message and the following 48 rounds need
-some value derived from the message.
-
-In these 48 rounds, the software can feed the next 16 x 32-bit message block.
-But, once the FIFO is full, the software cannot push more data until the
-current block is processed. This version of the IP fetches the next 16 x 32-bit
-message after completing the current block. As such, it takes 80 cycles to
-complete a block. The effective throughput considering this is `64 byte / 80
-clk` or `16 clk / 80 clk`, 20% of the maximum throughput. For instance, if the
-clock frequency is 100MHz, the SHA-256 can hash out 80MB/s at most.
-
-This throughput could be enhanced in a future version by feeding the message
-into the internal buffer when the round hits 48, eliminating the extra 16
-cycles to feed the message after completing a block.
-
-If HMAC mode is turned on, it introduces extra latency due to the second round
-of computing the final hash of the outer key and the result of the first round
-using the inner key. This adds an extra 240 cycles (80 for the inner key, 80
-for the outer key, and 80 for the result of the first round) to complete a
-message. For instance, if an empty message is given then it takes 360 cycles
-(80 for msg itself and 240 for the extra) to get the HMAC authentication token.
-
-### MSG_FIFO
-
-The MSG_FIFO in the HMAC IP has a wide address range not just one 4 byte address.
-Any writes to the address range go into the single entry point of the `prim_packer`.
-Then `prim_packer` compacts the data into the word-size if not a word-write then writes to the MSG_FIFO.
-This is different from a conventional memory-mapped FIFO.
-
-By having wide address range pointing to a single entry point, the FIFO can free software from the fixed address restriction.
-For instance, the core can use "store multiple" commands to feed the message fifo efficiently.
-Also, a DMA engine which might not have the ability to be configured to the fixed write and incremental read may benefit from this behavior.
-
-# Programmer's Guide
-
-This chapter shows how to use the HMAC-SHA256 IP by showing some snippets such
-as initialization, initiating SHA-256 or HMAC process and processing the
-interrupts. This code is not compilable but serves to demonstrate the IO
-required.
-More detailed and complete code can be found in the software under `sw/`, [ROM code](https://github.com/lowRISC/opentitan/blob/master/sw/device/silicon_creator/lib/drivers/hmac.c) and [HMAC DIF](https://github.com/lowRISC/opentitan/blob/master/sw/device/lib/dif/dif_hmac.c).
-
-## Initialization
-
-This section of the code describes initializing the HMAC-SHA256, setting up the
-interrupts, endianness, and HMAC, SHA-256 mode. [`CFG.endian_swap`](data/hmac.hjson#cfg) reverses
-the byte-order of input words when software writes into the message FIFO.
-[`CFG.digest_swap`](data/hmac.hjson#cfg) reverses the byte-order in the final HMAC or SHA hash.
-
-```c
-void hmac_init(unsigned int endianess, unsigned int digest_endian) {
-  HMAC_CFG(0) = HMAC_CFG_SHA_EN
-              | HMAC_CFG_HMAC_EN
-              | (endianess << HMAC_CFG_ENDIAN_SWAP_LSB)
-              | (digest_endian << HMAC_CFG_DIGEST_SWAP_LSB);
-
-  // Enable interrupts if needed.
-
-  // If secret key is static, you can put the key here
-  HMAC_KEY_0 = SECRET_KEY_0;
-  HMAC_KEY_1 = SECRET_KEY_1;
-  HMAC_KEY_2 = SECRET_KEY_2;
-  HMAC_KEY_3 = SECRET_KEY_3;
-  HMAC_KEY_4 = SECRET_KEY_4;
-  HMAC_KEY_5 = SECRET_KEY_5;
-  HMAC_KEY_6 = SECRET_KEY_6;
-  HMAC_KEY_7 = SECRET_KEY_7;
-}
-```
-
-## Triggering HMAC/SHA-256 engine
-
-The following code shows how to send a message to the HMAC, the procedure is
-the same whether a full HMAC or just a SHA-256 calculation is required (choose
-between them using [`CFG.hmac_en`](data/hmac.hjson#cfg)). In both cases the SHA-256 engine must be
-enabled using [`CFG.sha_en`](data/hmac.hjson#cfg) (once all other configuration has been properly set).
-If the message is bigger than 512-bit, the software must wait until the FIFO
-isn't full before writing further bits.
-
-```c
-void run_hmac(uint32_t *msg, uint32_t msg_len, uint32_t *hash) {
-  // Initiate hash: hash_start
-  REG32(HMAC_CMD(0)) = (1 << HMAC_CMD_HASH_START);
-
-  // write the message: below example assumes word-aligned access
-  for (uint32_t written = 0 ; written < (msg_len >> 3) ; written += 4) {
-    while((REG32(HMAC_STATUS(0)) >> HMAC_STATUS_FIFO_FULL) & 0x1) ;
-    // Any write data from HMAC_MSG_FIFO_OFFSET to HMAC_MSG_FIFO_SIZE
-    // is written to the message FIFO
-    REG32(HMAC_MSG_FIFO(0)) = *(msg+(written/4));
-  }
-
-  // Completes hash: hash_process
-  REG32(HMAC_CMD(0)) = (1 << HMAC_CMD_HASH_PROCESS);
-
-  while(0 == (REG32(HMAC_INTR_STATE(0)) >> HMAC_INTR_STATE_HMAC_DONE) & 0x1);
-
-  REG32(HMAC_INTR_STATE(0)) = 1 << HMAC_INTR_STATE_HMAC_DONE;
-
-  // Read the digest
-  for (int i = 0 ; i < 8 ; i++) {
-    *(hash + i) = REG32(HMAC_DIGEST_0(0) + (i << 2));
-  }
-}
-```
-
-## Updating the configurations
-
-The HMAC IP prevents [`CFG`](data/hmac.hjson#cfg) and [`KEY`](data/hmac.hjson#key) registers from updating while the engine is processing messages.
-Such attempts are discarded.
-The [`KEY`](data/hmac.hjson#key) register ignores any attempt to access the secret key in the middle of the process.
-If the software tries to update the KEY, the IP reports an error through the Error FIFO. The error code is `SwUpdateSecretKeyInProcess`, `0x0003`.
-
-## Errors
-
-When HMAC sees errors, the IP reports the error via [`INTR_STATUS.hmac_err`](data/hmac.hjson#intr_status).
-The details of the error type is stored in [`ERR_CODE`](data/hmac.hjson#err_code).
-
-Error                        | Value | Description
------------------------------|-------|---------------
-`SwPushMsgWhenShaDisabled`   | `0x1` | The error is reported when SW writes data into MSG_FIFO when SHA is disabled. It may be due to SW routine error, or FI attacks.
-`SwHashStartWhenShaDisabled` | `0x2` | When HMAC detects the CMD.start when SHA is disabled, it reports this error code.
-`SwUpdateSecretKeyInProcess` | `0x3` | Secret Key CSRs should not be modified during the hashing. This error is reported when those CSRs are revised in active.
-`SwHashStartWhenActive`      | `0x4` | The error is reported when CMD.start is received while HMAC is running.
-`SwPushMsgWhenDisallowed`    | `0x5` | After CMD.process is received, the MSG_FIFO should not by updated by SW. This error is reported in that case.
-
-
-
-### FIFO_EMPTY
-
-If the FIFO_FULL interrupt occurs, it is recommended the software does not write
-more data into [`MSG_FIFO`](data/hmac.hjson#msg_fifo) until the interrupt is cleared and the status
-[`STATUS.fifo_full`](data/hmac.hjson#status) is lowered. Whilst the FIFO is full the HMAC will block
-writes until the FIFO has space which will cause back-pressure on the
-interconnect.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_hmac.h)
-
-## Register Table
-
-* [Register Table](data/hmac.hjson#registers)
diff --git a/hw/ip/hmac/doc/programmers_guide.md b/hw/ip/hmac/doc/programmers_guide.md
new file mode 100644
index 0000000000000..d0b3dfea81bb6
--- /dev/null
+++ b/hw/ip/hmac/doc/programmers_guide.md
@@ -0,0 +1,109 @@
+# Programmer's Guide
+
+This chapter shows how to use the HMAC-SHA256 IP by showing some snippets such
+as initialization, initiating SHA-256 or HMAC process and processing the
+interrupts. This code is not compilable but serves to demonstrate the IO
+required.
+More detailed and complete code can be found in the software under `sw/`, [ROM code](https://github.com/lowRISC/opentitan/blob/master/sw/device/silicon_creator/lib/drivers/hmac.c) and [HMAC DIF](https://github.com/lowRISC/opentitan/blob/master/sw/device/lib/dif/dif_hmac.c).
+
+## Initialization
+
+This section of the code describes initializing the HMAC-SHA256, setting up the
+interrupts, endianness, and HMAC, SHA-256 mode. [`CFG.endian_swap`](../data/hmac.hjson#cfg) reverses
+the byte-order of input words when software writes into the message FIFO.
+[`CFG.digest_swap`](../data/hmac.hjson#cfg) reverses the byte-order in the final HMAC or SHA hash.
+
+```c
+void hmac_init(unsigned int endianess, unsigned int digest_endian) {
+  HMAC_CFG(0) = HMAC_CFG_SHA_EN
+              | HMAC_CFG_HMAC_EN
+              | (endianess << HMAC_CFG_ENDIAN_SWAP_LSB)
+              | (digest_endian << HMAC_CFG_DIGEST_SWAP_LSB);
+
+  // Enable interrupts if needed.
+
+  // If secret key is static, you can put the key here
+  HMAC_KEY_0 = SECRET_KEY_0;
+  HMAC_KEY_1 = SECRET_KEY_1;
+  HMAC_KEY_2 = SECRET_KEY_2;
+  HMAC_KEY_3 = SECRET_KEY_3;
+  HMAC_KEY_4 = SECRET_KEY_4;
+  HMAC_KEY_5 = SECRET_KEY_5;
+  HMAC_KEY_6 = SECRET_KEY_6;
+  HMAC_KEY_7 = SECRET_KEY_7;
+}
+```
+
+## Triggering HMAC/SHA-256 engine
+
+The following code shows how to send a message to the HMAC; the procedure is
+the same whether a full HMAC or just a SHA-256 calculation is required (choose
+between them using [`CFG.hmac_en`](../data/hmac.hjson#cfg)). In both cases the SHA-256 engine must be
+enabled using [`CFG.sha_en`](../data/hmac.hjson#cfg) (once all other configuration has been properly set).
+If the message is bigger than 512-bit, the software must wait until the FIFO
+isn't full before writing further bits.
+
+```c
+void run_hmac(uint32_t *msg, uint32_t msg_len, uint32_t *hash) {
+  // Initiate hash: hash_start
+  REG32(HMAC_CMD(0)) = (1 << HMAC_CMD_HASH_START);
+
+  // write the message: below example assumes word-aligned access
+  for (uint32_t written = 0 ; written < (msg_len >> 3) ; written += 4) {
+    while((REG32(HMAC_STATUS(0)) >> HMAC_STATUS_FIFO_FULL) & 0x1) ;
+    // Any write data from HMAC_MSG_FIFO_OFFSET to HMAC_MSG_FIFO_SIZE
+    // is written to the message FIFO
+    REG32(HMAC_MSG_FIFO(0)) = *(msg+(written/4));
+  }
+
+  // Completes hash: hash_process
+  REG32(HMAC_CMD(0)) = (1 << HMAC_CMD_HASH_PROCESS);
+
+  while (0 == ((REG32(HMAC_INTR_STATE(0)) >> HMAC_INTR_STATE_HMAC_DONE) & 0x1));
+
+  REG32(HMAC_INTR_STATE(0)) = 1 << HMAC_INTR_STATE_HMAC_DONE;
+
+  // Read the digest
+  for (int i = 0 ; i < 8 ; i++) {
+    *(hash + i) = REG32(HMAC_DIGEST_0(0) + (i << 2));
+  }
+}
+```
+
+## Updating the configurations
+
+The HMAC IP prevents [`CFG`](../data/hmac.hjson#cfg) and [`KEY`](../data/hmac.hjson#key) registers from updating while the engine is processing messages.
+Such attempts are discarded.
+The [`KEY`](../data/hmac.hjson#key) register ignores any attempt to access the secret key in the middle of the process.
+If the software tries to update the KEY, the IP reports an error through the Error FIFO. The error code is `SwUpdateSecretKeyInProcess`, `0x0003`.
+
+## Errors
+
+When HMAC sees errors, the IP reports the error via [`INTR_STATE.hmac_err`](../data/hmac.hjson#intr_state).
+The details of the error type are stored in [`ERR_CODE`](../data/hmac.hjson#err_code).
+
+Error                        | Value | Description
+-----------------------------|-------|---------------
+`SwPushMsgWhenShaDisabled`   | `0x1` | The error is reported when SW writes data into MSG_FIFO when SHA is disabled. It may be due to SW routine error, or FI attacks.
+`SwHashStartWhenShaDisabled` | `0x2` | When HMAC detects the CMD.start when SHA is disabled, it reports this error code.
+`SwUpdateSecretKeyInProcess` | `0x3` | Secret Key CSRs should not be modified during the hashing. This error is reported when those CSRs are modified while the engine is active.
+`SwHashStartWhenActive`      | `0x4` | The error is reported when CMD.start is received while HMAC is running.
+`SwPushMsgWhenDisallowed`    | `0x5` | After CMD.process is received, the MSG_FIFO should not be updated by SW. This error is reported in that case.
+
+
+
+### FIFO_EMPTY
+
+If the FIFO_FULL interrupt occurs, it is recommended the software does not write
+more data into [`MSG_FIFO`](../data/hmac.hjson#msg_fifo) until the interrupt is cleared and the status
+[`STATUS.fifo_full`](../data/hmac.hjson#status) is lowered. Whilst the FIFO is full the HMAC will block
+writes until the FIFO has space which will cause back-pressure on the
+interconnect.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_hmac.h)
+
+## Register Table
+
+* [Register Table](../data/hmac.hjson#registers)
diff --git a/hw/ip/hmac/doc/theory_of_operation.md b/hw/ip/hmac/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..a828134cb4962
--- /dev/null
+++ b/hw/ip/hmac/doc/theory_of_operation.md
@@ -0,0 +1,174 @@
+# Theory of Operation
+
+## Block Diagram
+
+![HMAC Block Diagram](../doc/hmac_block_diagram.svg)
+
+The HMAC block diagram above shows that the HMAC core converts the secret key
+registers into an inner padded key and an outer padded key which are fed to the
+hash engine when appropriate. The module also feeds the result of the first
+round message (which uses the inner padded key) from the SHA-256 hash engine
+into the 16x32b FIFO for the second round (which uses the outer padded key).
+The message length is automatically updated to reflect the size of the outer
+padded key and first round digest result for the second round. See [Design
+Details](#design-details) for more information.
+
+![SHA-256 Block Diagram](../doc/sha2_block_diagram.svg)
+
+The SHA-256 (SHA-2) block diagram shows the message FIFO inside SHA-256, hash
+registers, digest registers, and SHA-256 compression function. The message FIFO
+is not software accessible but is fed from the 16x32b FIFO seen in the HMAC
+block diagram via the HMAC core. The HMAC core can forward the message directly
+from the 16x32b FIFO if HMAC is not enabled. This message is padded with length
+appended to fit the 512-bit block size as described in the [SHA-256
+specification](https://csrc.nist.gov/publications/detail/fips/180/4/final).
+
+With the 512-bit block, the compress function runs 64 rounds to calculate the
+block hash, which is stored in the hash registers above. After 64 rounds are
+completed, the SHA-256 updates the digest registers with the addition of the
+hash result and the previous digest registers.
+
+## Hardware Interface
+
+* [Interface Tables](../data/hmac.hjson#interfaces)
+
+## Design Details
+
+### SHA-256 message feed and pad
+
+A message is fed via a memory-mapped message FIFO. Any write access to the
+memory-mapped window [`MSG_FIFO`](../data/hmac.hjson#msg_fifo) updates the message FIFO. If the FIFO is full,
+the HMAC block will block any writes leading to back-pressure on the
+interconnect (as opposed to dropping those writes or overwriting existing FIFO
+contents). It is recommended this back-pressure is avoided by not writing to the
+memory-mapped message FIFO when it is full. To avoid doing so, software can
+read the [`STATUS.fifo_full`](../data/hmac.hjson#status) register.
+
+The logic assumes the input message is little-endian.
+It converts the byte order of the word right before writing to SHA2 storage as SHA2 treats the incoming message as big-endian.
+If SW wants to convert the message byte order, SW should set [`CFG.endian_swap`](../data/hmac.hjson#cfg) to **1**.
+The byte order of the digest registers, from [`DIGEST_0`](../data/hmac.hjson#digest_0) to [`DIGEST_7`](../data/hmac.hjson#digest_7) can be configured with [`CFG.digest_swap`](../data/hmac.hjson#cfg).
+
+See the table below:
+
+```
+Input Msg #0: 010203h
+Input Msg #1: 0405h
+```
+
+endian_swap     | 0         | 1
+----------------|-----------|-----------
+Push to SHA2 #0 | 03020105h | 01020304h
+Push to SHA2 #1 | 00000004h | 00000005h
+
+
+Small writes to [`MSG_FIFO`](../data/hmac.hjson#msg_fifo) are coalesced into 32-bit words by the [packer logic](../../prim/doc/prim_packer.md).
+These words are fed into the internal message FIFO.
+While passing writes to the packer logic, the block also counts the number of bytes that are being passed.
+This gives the received message length, which is used in HMAC and SHA-256 as part of the hash computation.
+
+The SHA-256 module computes an intermediate hash for every 512-bit block.
+The message must be padded to fill 512-bit blocks. This is done with an initial
+**1** bit after the message bits with a 64-bit message length at the end and
+enough **0** bits in the middle to result in a full block. The [SHA-256
+specification](https://csrc.nist.gov/publications/detail/fips/180/4/final) describes this in more detail. An example is shown
+below. The padding logic handles this so software only needs to write the actual
+message bits into the FIFO.
+
+![SHA-256 Message Padding](../doc/message_padding.svg)
+
+For instance, if the message is empty, the message length is 64-bit 0. In this
+case, the padding logic gives `0x80000000` into the SHA-256 module first. Then
+it sends (512 - 32 - 64)/32, 13 times of `0x00000000` for Padding `0x00`.
+Lastly, it returns the message length which is 64-bit `0x00000000_00000000`. If
+incomplete words are written, the packet logic appends `0x80` in the proper byte
+location, such as `0xXX800000` for the message length % 4B == 1 case.
+
+### SHA-256 computation
+
+The SHA-256 engine receives 16 32-bit words from the message FIFO or the HMAC
+core then begins 64 rounds of the hash computation which is also called
+*compression*. In each round, the compression function fetches 32 bits from the
+buffer and computes the internal variables. The first 16 rounds are fed by the
+words from the message FIFO or the HMAC core. Input for later rounds comes from
+shuffling the given 512-bit block. Details are well described in
+[Wikipedia][sha2-wikipedia] and the [SHA-256 specification](https://csrc.nist.gov/publications/detail/fips/180/4/final).
+
+[sha2-wikipedia]: https://en.wikipedia.org/wiki/SHA-2
+
+With the given hash values, 4 byte message, and round constants, the compression
+function computes the next round hash values. The 64 32-bit round constants
+are hard-wired in the design. After the compression at the last round is
+finished, the resulting hash values are added into the digest. The digest, again,
+is used as initial hash values for the next 512-bit block compression. During
+the compression rounds, it doesn't fetch data from the message FIFO. The
+software can push up to 16 entries to the FIFO for the next hash computation.
+
+### HMAC computation
+
+![Two steps of HMAC](../doc/hmac_dataflow.svg)
+
+HMAC can be used with any hash algorithm but this version of HMAC IP only uses
+SHA-256. The first phase of HMAC calculates the SHA-256 hash of the inner
+secret key concatenated with the actual message to be authenticated. This inner
+secret key is created with a 256-bit (hashed) secret key and `0x36` pad.
+
+```verilog
+    inner_pad_key = {key[255:0], 256'h0} ^ {64{8'h36}} // big-endian
+```
+
+The message length used in the SHA-256 module is calculated by the HMAC core by
+adding 512 to the original message length (to account for the length of
+`inner_pad_key`, which has been prepended to the message).
+
+The first round digest is fed into the second round in HMAC. The second round
+computes the hash of the outer secret key concatenated with the first round
+digest. As the result of SHA-256 is 256-bits, it must be padded to fit into
+512-bit block size.
+
+```verilog
+    outer_pad_key = {key[255:0], 256'h0} ^ {64{8'h5c}} // big-endian
+```
+
+In the second round, the message length is a fixed 768 bits.
+
+HMAC assumes the secret key is 256-bit. The onus is on software to shrink the
+key to 256-bit using a hash function when setting up the HMAC. For example,
+common key sizes may be 2048-bit or 4096-bit. Software must hash these and
+write the hashed results to the HMAC.
+
+### Performance in SHA-256 mode and HMAC mode
+
+The SHA-256 hash algorithm computes 512 bits of data at a time. The first 16
+rounds need the actual 16 x 32-bit message and the following 48 rounds need
+some value derived from the message.
+
+In these 48 rounds, the software can feed the next 16 x 32-bit message block.
+But, once the FIFO is full, the software cannot push more data until the
+current block is processed. This version of the IP fetches the next 16 x 32-bit
+message after completing the current block. As such, it takes 80 cycles to
+complete a block. The effective throughput considering this is `64 byte / 80
+clk`; the extra `16 clk / 80 clk` costs 20% of the maximum throughput. For instance, if the
+clock frequency is 100MHz, the SHA-256 can hash out 80MB/s at most.
+
+This throughput could be enhanced in a future version by feeding the message
+into the internal buffer when the round hits 48, eliminating the extra 16
+cycles to feed the message after completing a block.
+
+If HMAC mode is turned on, it introduces extra latency due to the second round
+of computing the final hash of the outer key and the result of the first round
+using the inner key. This adds an extra 240 cycles (80 for the inner key, 80
+for the outer key, and 80 for the result of the first round) to complete a
+message. For instance, if an empty message is given then it takes 320 cycles
+(80 for msg itself and 240 for the extra) to get the HMAC authentication token.
+
+### MSG_FIFO
+
+The MSG_FIFO in the HMAC IP has a wide address range not just one 4 byte address.
+Any writes to the address range go into the single entry point of the `prim_packer`.
+Then `prim_packer` compacts the data into word-size entries, if the access is not a word write, and then writes them to the MSG_FIFO.
+This is different from a conventional memory-mapped FIFO.
+
+By having a wide address range pointing to a single entry point, the FIFO can free software from the fixed address restriction.
+For instance, the core can use "store multiple" commands to feed the message fifo efficiently.
+Also, a DMA engine which might not have the ability to be configured to the fixed write and incremental read may benefit from this behavior.
diff --git a/hw/ip/i2c/README.md b/hw/ip/i2c/README.md
index c84dd0c763154..cd635c5e6e4bd 100644
--- a/hw/ip/i2c/README.md
+++ b/hw/ip/i2c/README.md
@@ -74,421 +74,3 @@ This IP block should be compatible with any target device covered by the [I2C sp
 This IP in the host mode issues addresses in 7-bit encoding, and in the target mode, receives addresses in 7-bit encoding.
 (It remains the obligation of system designers to ensure that devices remain in a 7-bit address space.)
 This IP also supports clock-stretching, should that be required by target devices.
-
-# Theory of Operations
-
-## Block Diagram
-
-![](./doc/I2C_block_diagram.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](data/i2c.hjson#interfaces)
-
-## Design Details
-
-### Functional Modes
-
-I2C IP is a host-target combo that can function as either an I2C host or an I2C target.
-Although it is conceivable that an I2C combo can optionally function as both a host and a target at the same time, we do not support this feature at this time.
-These functional modes are enabled at runtime by setting the register fields [`CTRL.ENABLEHOST`](data/i2c.hjson#ctrl) and [`CTRL.ENABLETARGET`](data/i2c.hjson#ctrl).
-
-### Virtual Open Drain
-
-In devices which lack a true open drain buffer functionality, this IP implements a "virtual Open Drain" functionality.
-The SDA and SCL outputs are assumed to be connected to a tri-state buffer, with independent enable outputs for both signals.
-
-Rather than toggling the buffer inputs, the buffer inputs are *continuously asserted low*, and instead the buffer *enable* signals are toggled.
-The SDA or SCL buffers are enabled for a logical "Low" output on the respective signal, and are disabled for logical "High" outputs.
-This arrangement allows the the output pins to float high if there is no conflict from external devices, or to be pulled low if there is a conflict (as is required for clock-stretching or--in future revisions-- multi-host functionality).
-
-This arrangement is necessary for FPGA builds.
-
-
-### Override Mode for Direct Pin Access
-
-The I2C hardware interface consists of two external pins, SCL and SDA, whose behavior is described in the [I2C specification](https://www.nxp.com/docs/en/user-guide/UM10204.pdf).
-These pins are typically controlled by an internal state machine.
-However, there is a simpler "override" mode, by which these pins can be directly manipulated by software.
-This override mode is useful for troubleshooting or error-recovery.
-
-To enter override mode, the register field [`OVRD.TXOVRDEN`](data/i2c.hjson#ovrd) is asserted by software.
-In this state the output drivers `scl_tx_o` and `sda_tx_o` are controlled directly by the register fields [`OVRD.SCLVAL`](data/i2c.hjson#ovrd) and [`OVRD.SDAVAL`](data/i2c.hjson#ovrd).
-When [`OVRD.SCLVAL`](data/i2c.hjson#ovrd) and [`OVRD.SDAVAL`](data/i2c.hjson#ovrd) are set high, the virtual open drain configuration will leave the output resistively pulled high, and controllable by remote targets.
-In this state, with SCL or SDA asserted high, the register fields [`VAL.SCL_RX`](data/i2c.hjson#val) and [`VAL.SDA_RX`](data/i2c.hjson#val) can be used to receive inputs (including remote acknowledgments) from target devices.
-
-#### FSM control of SCL and SDA
-
-While in host mode, SCL and SDA are generated through the internal state machine.
-Since SCL is directly decoded from the states, it can have short glitches during transition which the external target may be sensitive to if it is not using an over-sampling scheme.
-To counter this, the SCL and SDA outputs from the internal state machine are flopped before they are emitted.
-
-This adds a one cycle module clock delay to both signals.
-If the module clock is sufficiently faster than I2C line speeds (for example 20MHz), this is not an issue.
-However if the line speeds and the module clock speeds become very close (2x), the 1 cycle delay may have an impact, as the internal state machine may mistakenly think it has sampled an SDA that has not yet been updated.
-
-It it thus recommended to run clock ratios such that the internal module clock is at least 5x-10x the line speeds.
-
-### Byte-Formatted Programming Mode
-
-This section applies to I2C in the host mode.
-The state machine-controlled mode allows for higher-speed operation with less frequent software interaction.
-In this mode, the I2C pins are controlled by the I2C state machine, which in turn is controlled by a sequence of formatting indicators.
-The formatting indicators indicate:
-- The sequence of bytes which should be transmitted on the SDA and SCL pins.
-- The periods between transmitted bytes when the state-machine should stop transmission and instead read back a fixed number of bytes.
-- Which bytes should be preceded by a START symbol.
-- Which bytes should be followed by a STOP symbol
-The format indicator consists of 13-bits.
-That is of one single Format Byte (entered into the format FIFO through [`FDATA.FBYTE`](data/i2c.hjson#fdata)), and five (5) 1-bit flags (entered into the format FIFO through registers [`FDATA.READ`](data/i2c.hjson#fdata), [`FDATA.RCONT`](data/i2c.hjson#fdata), [`FDATA.START`](data/i2c.hjson#fdata), [`FDATA.STOP`](data/i2c.hjson#fdata) and [`FDATA.NAKOK`](data/i2c.hjson#fdata))
-
-The I2C reads each format indicator from the head of FMT_FIFO, and processes them in turn.
-If none of the flags are set for the format indicator, the I2C FSM simply transmits the Format Byte onto the SCL and SDA pins according to the specification, waits for acknowledgement, and then proceeds to the next format indicator.
-The format flags modulate the behavior as follows.
-- READ (corresponds to [`FDATA.READ`](data/i2c.hjson#fdata)):
-Signifies the Format Byte ([`FDATA.FBYTE`](data/i2c.hjson#fdata)) should be treated as an unsigned number, R, and prompts the state machine to read R bytes from the target device.
-Bytes read from the bus, are inserted into the RX FIFO where they can be accessed by software.
-A value of 0 is treated as a read of 256B.
-To read a larger byte stream, multiple 256B reads can be chained together using the RCONT flag.
-- RCONT (corresponds to FIFO inputs [`FDATA.RCONT`](data/i2c.hjson#fdata), only used with READ):
-    - If RCONT is set, the Format Byte represents part of a longer sequence of reads, allowing for reads to be chained indefinitely.
-    - The RCONT flag indicates the the final byte returned with the current read should be responded to with an ACK, allowing the target to continue sending data.
-(Note that the first R-1 bytes read will still be acknowledged regardless of whether RCONT is asserted or not.)
-- START (corresponds to [`FDATA.START`](data/i2c.hjson#fdata), Ignored when used with READ):
-Issue a START condition before transmitting the Format Byte on the bus.
-    - This flag may also be used to issue a repeated start condition.
-- STOP (corresponds to [`FDATA.STOP`](data/i2c.hjson#fdata)):
-Issue a STOP signal after processing this current entry in the FMT FIFO.
-    - Note that this flag is not compatible with (READ & RCONT), and will cause bus conflicts.
-- NAKOK (corresponds to [`FDATA.NAKOK`](data/i2c.hjson#fdata), Not compatible with READ):
-Typically every byte transmitted must also receive an ACK signal, and the IP will raise an exception if no ACK is received.
-However, there are some I2C commands which do not require an ACK.
-In those cases this flag should be asserted with FBYTE indicating no ACK is expected and no interrupt should be raised if the ACK is not received.
-
-### Target Address Registers
-
-I2C target device is assigned two 7-bit address and 7-bit mask pairs.
-The target device accepts a transaction if the result of the bitwise AND operation performed on the transaction address sent by the host and a mask matches the assigned address corresponding to the mask.
-In other words, address matching is performed only for bits where the mask is "1".
-Thus, with the masks set to all ones (0x7F), the target device will respond to either of the two assigned unique addresses and no other.
-If the mask and the assigned address both have zeros in a particular bit position, that bit will be a match regardless of the value of that bit received from the host.
-Note that if, in any bit position, the mask has zero and the assigned address has one, no transaction can match and such mask/address pair is effectively disabled.
-The assigned address and mask pairs are set in registers [`TARGET_ID.ADDRESS0`](data/i2c.hjson#target_id), [`TARGET_ID.MASK0`](data/i2c.hjson#target_id), [`TARGET_ID.ADDRESS1`](data/i2c.hjson#target_id), and [`TARGET_ID.MASK1`](data/i2c.hjson#target_id).
-
-### Acquired Formatted Data
-
-This section applies to I2C in the target mode.
-When the target accepts a transaction, it inserts the transaction address, read/write bit, and START signal sent by the host into ACQ FIFO where they can be accessed by software.
-ACQ FIFO output corresponds to [`ACQDATA`](data/i2c.hjson#acqdata).
-If the transaction is a write operation (R/W bit = 0), the target proceeds to read bytes from the bus and insert them into ACQ FIFO until the host terminates the transaction by sending a STOP or a repeated START signal.
-A STOP or repeated START indicator is inserted into ACQ FIFO as the next entry following the last byte received, in which case other bits may be junk.
-The following diagram shows consecutive entries inserted into ACQ FIFO during a write operation:
-
-![](./doc/I2C_acq_fifo_write.svg)
-
-If the transaction is a read operation (R/W bit = 1), the target pulls bytes out of TX FIFO and transmits them to the bus until the host signals the end of the transfer by sending a NACK signal.
-If TX FIFO holds no data, or if the ACQ FIFO contains more than 1 entry, the target will hold SCL low to stretch the clock and give software time to write data bytes into TX FIFO or handle the available command.
-See (#stretching-during-read) for more details.
-TX FIFO input corresponds to [`TXDATA`](data/i2c.hjson#txdata).
-Typically, a NACK signal is followed by a STOP or repeated START signal and the IP will raise an exception if the host sends a STOP signal after an ACK.
-An ACK/NACK signal is inserted into the ACQ FIFO as the first bit (bit 0), in the same entry with a STOP or repeated START signal.
-For ACK and NACK signals, the value of the first bit is 0 and 1, respectively.
-The following diagram shows consecutive entries inserted into ACQ FIFO during a read operation:
-
-![](./doc/I2C_acq_fifo_read.svg)
-
-The ACQ FIFO entry consists of 10 bits:
-- Address (bits 7:1) and R/W bit (bit 0) or data byte
-- Format flags (bits 9:8)
-The format flags indicate the following signals received from the host:
-- START: 01
-- STOP: 10
-- repeated START: 11
-- No START, or STOP: 00
-
-### Timing Control Registers
-
-For standard mode, fast-mode and fast-mode plus, the timing requirements for each transaction are detailed in Table 10 of the [I2C specification](https://www.nxp.com/docs/en/user-guide/UM10204.pdf).
-In order to claim complete compatibility at each mode, the state machine timings need to be adapted to whether there are Standard-mode, Fast-mode and Fast-mode Plus targets on the bus.
-Furthermore, depending on the actual capacitance of the bus, even a bus with all Fast-mode Plus capable targets may have to operate at slower speeds than 1Mbaud.
-For example, the host may need to run at lower frequencies, as discussed in Section 5.2 of the specification, but the computation of the nominal frequency will depend on timing specifications in Table 10, in this case particularly, the limits on t<sub>LOW</sub>, t<sub>HIGH</sub>, t<sub>r</sub>, and t<sub>f</sub>.
-Assuming no clock stretching, for a given set of these four parameters the baud rate is then given to be:
-$$ 1/f\_{SCL}=t\_{LOW}+t\_{HIGH}+t\_{r}+t\_{f}. $$
-
-Thus in order to ensure compliance with the spec in any particular configuration, software will program the I2C host IP with explicit values for each of the following timing parameters, as defined in Figure 38 of the specification.
-- t<sub>LOW</sub>: set in register [`TIMING0.TLOW`](data/i2c.hjson#timing0).
-- t<sub>HIGH</sub>: set in register [`TIMING0.THIGH`](data/i2c.hjson#timing0).
-- t<sub>r</sub>: set in register [`TIMING1.T_R`](data/i2c.hjson#timing1).
-(Note: The rise time cannot be explicitly controlled by internal hardware, and will be a function of the capacitance of the bus.
-Thus this parameter is largely budgetary, meaning that it tells the state machine how much time to wait for an RC rise.)
-- t<sub>f</sub>: set in register [`TIMING1.T_F`](data/i2c.hjson#timing1).
-(Note: The fall time cannot be explicitly controlled by internal hardware, and is a function of the pin driver.
-Thus this parameter is also budgetary.
-Given that the actual fall time cannot be controlled to stay above the minimum values set in Table 10 of the specification, and so this in this regard this module currently is not strictly compliant to the I2C spec.)
-- t<sub>SU,STA</sub>: set in register [`TIMING2.TSU_STA`](data/i2c.hjson#timing2)
-- t<sub>HD,STA</sub>: set in register [`TIMING2.THD_STA`](data/i2c.hjson#timing2)
-- t<sub>SU,DAT</sub>: set in register [`TIMING3.TSU_DAT`](data/i2c.hjson#timing3).
-Taken to be synonymous with T<sub>SU,ACK</sub>
-- t<sub>HD,DAT</sub>: set in register [`TIMING3.THD_DAT`](data/i2c.hjson#timing3).
-Taken to be synonymous with T<sub>HD,ACK</sub>.
-Moreover, since the pin driver fall time is likely to be less then one clock cycle, this parameter is also taken to be synonymous with the parameters T<sub>VD,DAT</sub> and T<sub>VD,ACK</sub>
-- t<sub>SU,STO</sub>: set in register [`TIMING4.TSU_STO`](data/i2c.hjson#timing4).
-- t<sub>BUF</sub>: set in register [`TIMING4.T_BUF`](data/i2c.hjson#timing4)
-
-The values programmed into the registers [`TIMING0`](data/i2c.hjson#timing0) through [`TIMING4`](data/i2c.hjson#timing4) are to be expressed in units of the bus clock period.
-Note in order to ensure compliance with the I2C spec, firmware must program these registers with values within the ranges laid out in Table 10 of the specification.
-These values can be directly computed using DIFs given the desired speed standard, the desired operating frequency, and the actual line capacitance.
-These timing parameters are then fed directly to the I2C state machine to control the bus timing.
-
-A detailed description of the algorithm for determining these parameters--as well as a couple of concrete examples--are given in the [Programmers Guide section of this document.](#timing-parameter-tuning-algorithm)
-
-### Timeout Control
-A malfunctioning (or otherwise very slow) target device can hold SCL low indefinitely, stalling the bus.
-For this reason [`TIMEOUT_CTRL`](data/i2c.hjson#timeout_ctrl) provides a clock-stretching timeout mechanism to notify firmware of this sort of condition.
-If [`TIMEOUT_CTRL.EN`](data/i2c.hjson#timeout_ctrl) is asserted, an interrupt will be asserted when the IP detects that another device (a target or, in possible future revisions, an alternate host) has been holding SCL low for more than [`TIMEOUT_CTRL.VAL`](data/i2c.hjson#timeout_ctrl) clock ticks.
-
-
-This feature is added as a utility, though it is not required by the I2C specification.
-However, in some applications it could be used in protocols which build upon I2C.
-For instance, SMBus applications using this IP could in principle use this to support SMBus timeouts.
-(Note: This is just an example application of this feature.
-Other features may also be required for complete SMBus functionality.)
-
-### Clock Stretching
-As described in the I2C specification, a target device can pause a transaction by holding SCL low.
-There are 3 cases in which this design stretches the clock.
-In all cases described below, a target begins to stretch the clock after the ACK bit.
-In the first two scenarios, it is after the ACK bit sent by the target, in the last scenario, it is after the host's ACK bit.
-
-#### Stretching after address read
-    - When a target device receives a start, the address and R/W bit are written into the ACQ FIFO.
-    - If there is no space in the ACQ FIFO to receive such a write, the target stretches the clock after the ACK bit and waits for software to make space.
-    - The `acq_full` interrupt is generated to alert software to such a situation.
-
-#### Stretching during write
-    - Similar to the scenario above, if the host tries to write a data byte into the ACQ FIFO when there is no available space, the clock is also stretched after the ACK bit.
-    - The `acq_full` interrupt is generated to alert software to such a situation.
-
-#### Stretching during read
-    - When a target device receives a start and read command, it may stretch the clock for either of the following two reasons.
-      - If there is no data available to be sent back (TX FIFO empty case), the target stretches the clock until data is made available by software.
-      - If there is more than 1 entry in the ACQ FIFO.
-        - Having more than 1 entry in the ACQ FIFO suggests there is potentially an unhandled condition (STOP / RESTART) or an unhandled command (START) that requires software intervention before the read can proceed.
-    - The `tx_stretch` interrupt is generated to alert software to such a situation.
-
-
-### Interrupts
-The I2C module has a few interrupts including general data flow interrupts and unexpected event interrupts.
-
-#### Host Mode
-If the RX FIFO exceeds the designated depth of entries, the interrupt `rx_threshold` is raised to inform firmware.
-Firmware can configure the threshold value via the register [`FIFO_CTRL.RXILVL`](data/i2c.hjson#fifo_ctrl).
-
-Meanwhile it the FMT FIFO level falls below a designated depth of entries the `fmt_threshold` interrupt is raised.
-(Note that this behavior differs from similar interrupts in other modules, such as the UART IP module.)
-Firmware can configure the threshold value via the register [`FIFO_CTRL.FMTILVL`](data/i2c.hjson#fifo_ctrl).
-
-If either FIFO receives an additional write request when its FIFO is full, the interrupt `fmt_overflow` or `rx_overflow` is asserted and the format indicator or character is dropped.
-
-If the module transmits a byte, but receives no ACK signal, the `nak` interrupt is usually asserted.
-In cases where a byte is transmitted and no ACK is expected or required, that byte should be submitted with NAKOK flag also asserted.
-
-When the I2C module is in transmit mode, the `scl_interference` or `sda_interference` interrupts will be asserted if the IP identifies that some other device (host or target) on the bus is forcing either signal low and interfering with the transmission.
-If should be noted that the `scl_interference` interrupt is not raised in the case when the target device is stretching the clock.
-(However, it may be raised if the target allows SCL to go high and then pulls SCL down before the end of the current clock cycle.)
-
-A target device should never assert 0 on the SDA lines, and in the absence of multi-host support, the `sda_interference` interrupt is raised whenever the host IP detects that another device is pulling SDA low.
-
-On the other hand, it is legal for the a target device to assert SCL low for clock stretching purposes.
-With clock stretching, the target can delay the start of the following SCL pulse by holding SCL low between clock pulses.
-However the target device must assert SCL low before the start of the SCL pulse.
-If SCL is pulled low during an SCL pulse which has already started, this interruption of the SCL pulse will be registered as an exception by the I2C core, which will then assert the `scl_interference` interrupt.
-
-```wavejson
-{signal: [
-  {name: 'Clock', wave: 'p.....|.......|......'},
-  {name: 'SCL Host Driver', wave: '0.z..0|.z....0|..z.x.'},
-  {name: 'SCL Target Driver', wave: 'z.....|0..z...|...0..'},
-  {name: 'SCL bus', wave: '0.u..0|...u..0|..u0..'},
-  {name: 'scl_interference', wave: '0.....|.......|....1.'},
-],
-  head: {text: 'SCL pulses: Normal SCL pulse (Cycle 3),  SCL pulse with clock stretching (cycle 11), and SCL interference (interrupted SCL pulse)',tick:1}}
-```
-
-
-Though normal clock stretching does not count as SCL interference, if the module detects that a target device has held SCL low and stretched the any given SCL cycle for more than [`TIMEOUT_CTRL.VAL`](data/i2c.hjson#timeout_ctrl) clock ticks this will cause the stretch timeout interrupt to be asserted.
-This interrupt is suppressed, however, if [`TIMEOUT_CTRL.EN`](data/i2c.hjson#timeout_ctrl) is deasserted low.
-
-```wavejson
-{signal: [
-  {name: 'Clock', wave: 'p............'},
-  {name: 'SCL Host Driver', wave: '0..z.......x.'},
-  {name: 'SCL Target Driver', wave: 'z0...........'},
-  {name: 'SCL bus', wave: '0............'},
-  {name: 'TIMEOUT_CNTRL.VAL', wave: '2............', data: "8"},
-  {name: 'SCL timeout counter', wave: '2...22222222x', data: '0 1 2 3 4 5 6 7 8'},
-  {name: 'TIMEOUT_CNTRL.EN', wave: '1............'},
-  {name: 'scl_timeout', wave: '0..........1.'},
-],
-  head: {text: 'SCL Timeout Example',tick:-3}}
-```
-
-Except for START and STOP symbols, the I2C specification requires that the SDA signal remains constant whenever SCL is high.
-The `sda_unstable` interrupt is asserted if, when receiving data or acknowledgement pulse, the value of the SDA signal does not remain constant over the duration of the SCL pulse.
-
-Transactions are terminated by a STOP signal.
-The host may send a repeated START signal instead of a STOP, which also terminates the preceding transaction.
-In both cases, the `cmd_complete` interrupt is asserted, in the beginning of a repeated START or at the end of a STOP.
-
-
-#### Target Mode
-
-The interrupt `cmd_complete` is asserted whenever a RESTART or a STOP bit is observed by the target.
-
-The interrupt `tx_stretch` is asserted whenever target intends to transmit data but cannot.
-See
-
-When a host receives enough data from a target, it usually signals the end of the transaction by sending a NACK followed by a STOP or a repeated START.
-In a case when a target receives a STOP without the prerequisite NACK, the interrupt `unexp_stop` is asserted.
-This interrupt just means that a STOP was unexpectedly observed during a host read.
-It is not necessarily harmful, but software can be made aware just in case.
-
-If ACQ FIFO becomes full, the interrupt `acq_full` is asserted.
-
-If a host ceases to send SCL pulses at any point during an ongoing transaction, the target waits for a specified time period and then asserts the interrupt `host_timeout`.
-Host sending an address and R/W bit to all target devices, writing to the selected target, or reading from the target are examples of ongoing transactions.
-The time period is counted from the last low-to-high SCL transition.
-Firmware can configure the timeout value via the register [`HOST_TIMEOUT_CTRL`](data/i2c.hjson#host_timeout_ctrl).
-
-### Implementation Details: Format Flag Parsing
-
-To illustrate the behavior induced by various flags added to the formatting queue, the following figure shows a simplified version of the I2C_Host state machine.
-In this simplified view, many sequential states have been collapsed into four sub-sequences of states (shown in square brackets) or have their names abbreviated:
-- Issue start
-- Issue stop
-- Transmit Byte
-- Read Bytes
-
-Within each of these sub-sequences, state transitions depend only on the SDA/SCL inputs or internal timers.
-Each sub-sequence has a terminal event--generically labeled "[completed]" which prompts the transition to another sequence or state.
-
-However, all transitions which are dependent on formatting flags are shown explicitly in this figure.
-
-![](./doc/I2C_state_diagram.svg)
-
-Similarly, the figure below shows a simplified version of the I2C_Target state machine.
-
-![](./doc/I2C_state_diagram_target.svg)
-
-In this diagram, "R/W" stands for a R/W bit value. The host is reading when R/W bit is "1" and writing when R/W bit is "0".
-
-# Programmers guide
-
-## Initialization
-
-After reset, the initialization of the I2C HWIP primarily consists of four steps:
-1. Timing parameter initialization
-1. FIFO reset and configuration
-1. Interrupt configuration
-1. Enable I2C Host or Target functionality
-
-### Timing Parameter Tuning Algorithm
-
-Of the four initialization steps, the timing parameter initialization is the most involved.  With so many timing parameters, it is essential to have dedicated device interface functions (DIFs) to determine appropriate values for the 10 timing parameters.
-
-The values of these parameters will depend primarily on three bus details:
-- The speed mode of the slowest device on the bus: standard mode (100 kbaud), fast mode (400 kbaud) or fast-mode plus (1 Mbaud).
-- The input clock period, t<sub>clk</sub> in ns.
-- The expected signal rise time, t<sub>r</sub>, in ns.
-   - This is not a firmware-controlled parameter.
-Rather, it is a function of the capacitance and physical design of the bus.
-The specification provides detailed guidelines on how to manage capacitance in an I2C system:
-   - Section 5.2 of the I2C specification indicates that Fast-mode plus devices may operate at reduced clock speeds if the bus capacitance drives signal rise times (t<sub>r</sub>) outside the nominal 120ns limit.
-Excess capacitance can also be compensated for by reducing the size of the bus pullup resistor, so long as the total open-drain current does not exceed 20mA for fast-mode plus devices (as described in section 7.1 of the I2C specification).
-However the specification places a hard limit on rise times capping them at 1000ns.
-    - If there are standard- or fast-mode target devices on the bus, the specified open-drain current limit is reduced to 3mA (section 7.1), thus further restricting the minimum value of the pull-up resistor.
-    - In fast-mode bus designs, where the total line capacitance exceeds 200pF, the specification recommends replacing the pull-up resistor with an active current source, supplying 3mA or less (section 5.1).
-Regardless of the physical construction of the bus, the rise time (t<sub>r</sub>) is a system dependent, parameter that needs to be made known to firmware for I2C initialization.
-- The expected fall time, t<sub>f</sub>, in ns.
-   - Like t<sub>r</sub>, this parameter is not firmware controlled rather it is a function of the SCL driver, which in a strictly compliant device is expected to manage the slew-rate for the falling edge of the SDA and SCL signals, through proper design of the SCL output buffer.
-   - See table 10 of the I2C specification for more details.
-- (optional) The desired SCL cycle period, t<sub>SCL,user</sub> in ns.
-   - By default the device should operate at the maximum frequency for that mode.
-However, If the system developer wishes to operate at slower than the mode-specific maximum, a larger than minimum period  could be allowed as an additional functional parameter when calculating the timing parameters.
-
-Based on the inputs, the timing parameters may be chosen using the following algorithm:
-1. The physical timing parameters t<sub>HD,STA</sub>, t<sub>SU,STA</sub>, t<sub>HD.DAT</sub>, t<sub>SU,DAT</sub>, t<sub>BUF</sub>, and t<sub>STO</sub>, t<sub>HIGH</sub>, and t<sub>LOW</sub> all have minimum allowed values which depend on the choice of speed mode (standard-mode, fast-mode or fast-mode plus).
-Using the speed mode input, look up the appropriate minimum value (in ns) for each parameter (i.e. t<sub>HD,STA,min</sub>, t<sub>SU,STA,min</sub>, etc)
-1. For each of these eight parameters, obtain an integer minimum by dividing the physical minimum parameter by the clock frequency and rounding up to the next highest integer:
-$$ \textrm{THIGH_MIN}=\lceil{t\_{HIGH,min}/t\_{clk}}\rceil $$
-$$ \textrm{TLOW_MIN}=\lceil{t\_{LOW,min}/t\_{clk}}\rceil $$
-$$ \textrm{THD_STA_MIN}= \lceil{t\_{HD,STA,min}/t\_{clk}}\rceil $$
-$$ \textrm{TSU_STA_MIN}= \lceil{t\_{SU,STA,min}/t\_{clk}}\rceil $$
-$$ \textrm{THD_DAT_MIN}= \lceil{t\_{HD,DAT,min}/t\_{clk}}\rceil $$
-$$ \textrm{TSU_DAT_MIN}= \lceil{t\_{HD,DAT,min}/t\_{clk}}\rceil $$
-$$ \textrm{T_BUF_MIN}= \lceil{t\_{BUF,min}/t\_{clk}}\rceil $$
-$$ \textrm{T_STO_MIN}= \lceil{t\_{STO,min}/t\_{clk}}\rceil $$
-
-1. Input the integer timing parameters, THD_STA_MIN, TSU_STA_MIN, THD_DAT_MIN, TSU_DAT_MIN, T_BUF_MIN and T_STO_MIN into their corresponding registers (`TIMING2.THD_STA`, `TIMING2.TSU_STA`, `TIMING3.THD_DAT`, `TIMING3.TSU_DAT`, `TIMING4.T_BUF`, `TIMING4.T_STO`)
-    - This step allows the firmware to manage SDA signal delays to ensure that the SDA outputs are compliant with the specification.
-    - The registers `TIMING0.THIGH` and `TIMING0.TLOW` will be taken care of in a later step.
-1. Take the given values for for t<sub>f</sub> and t<sub>r</sub> and convert them to integer counts as well:
-$$ \textrm{T_R}= \lceil{t\_{r}/t\_{clk}}\rceil $$
-$$ \textrm{T_F}= \lceil{t\_{f}/t\_{clk}}\rceil $$
-1. Store T_R and T_F in their corresponding registers: `TIMING1.T_R` and `TIMING1.T_F`.
-1. Based on the input speed mode, look up the maximum permissible SCL frequency (f<sub>SCL,max</sub>)and calculate the minimum permissible SCL period:
-$$ t\_{SCL,min}= 1/f\_{SCL,max} $$
-1. As with each of the other physical parameters convert t<sub>SCL,min</sub> and, if provided, the t<sub>SCL,user</sub> to integers, MINPERIOD and USERPERIOD..
-$$ MINPERIOD = \lceil{t\_{SCL,min}/t\_{clk}}\rceil $$
-$$ USERPERIOD = \lceil{t\_{SCL,user}/t\_{clk}}\rceil $$
-1. Let PERIOD=max(MINPERIOD, USERPERIOD).
-1. Each SCL cycle will now be at least PERIOD clock cycles in duration, divided between four segments: T_R, THIGH, T_F, and TLOW.
-    - In other words: PERIOD=T_R+THIGH+T_F+TLOW.
-    - With T_R and T_F already established, the remaining integer parameters THIGH and TLOW are to be divided among the remaining clock cycles in PERIOD:
-$$ \textrm{THIGH}+\textrm{TLOW} \ge\textrm{PERIOD}-\textrm{T_F}-\textrm{T_R} $$
-    - Since t<sub>HIGH</sub> and t<sub>LOW</sub> both have minimum allowable values, which depends on the mode, high values of t<sub>r</sub> or t<sub>f</sub> may force an increase in the total SCL period, slowing down the data transit rate.
-    - The balance between t<sub>HIGH</sub> and t<sub>LOW</sub> can be manipulated in a variety of different ways (depending on the desired SCL duty cycle).
-    - It is, for instance, perfectly acceptable to simply set TLOW to the minimum possible value:
-$$ \textrm{TIMING0.TLOW}=\textrm{TLOW_MIN} $$
-1. THIGH is then set to satisfy both constraints in the desired SCL period and in the minimum permissible values for t<sub>HIGH</sub>:
-$$ \textrm{TIMING0.THIGH}=\max(\textrm{PERIOD}-\textrm{T_R} - \textrm{TIMING0.TLOW} -\textrm{T_F}, \textrm{THIGH_MIN}) $$
-
-
-#### Timing parameter examples
-
-The following tables show a couple of examples for calculating timing register parameters for Fast-mode Plus devices.
-Both examples assume a desired datarate of 1 Mbaud (the bus maximum) for an SCL period of 1us, and an internal device clock period of 3ns.
-
-| Parameter       | Spec. Min. (ns)  | Reg. Val.  | Phys. Val (ns) | Comment                                         |
-|-----------------|------------------|------------|----------------|-----------------------------------------------|
-| TIMING0.THIGH   | 260              | 120        | 360            | Chosen to satisfy SCL Period Minimum          |
-| TIMING0.TLOW    | 500              | 167        | 501            | Spec. t<sub>LOW</sub> Minimum                 |
-| TIMING1.T_F     | 20ns * (VDD/5.5V)| 7          | 21             | Signal slew-rate should be controlled         |
-| TIMING1.T_R     | 0                | 40         | 120            | Based on pull-up resistance, line capacitance |
-| SCL Period      | 1000             | N/A        | 1002           | Constraint on THIGH+TLOW+T_R+T_F              |
-| TIMING2.THD_STA | 260              | 87         | 261            | Spec. Minimum                                 |
-| TIMING2.TSU_STA | 260              | 87         | 261            | Spec. Minimum                                 |
-| TIMING3.THD_DAT | 0                | 0          | 0              | Spec. Minimum                                 |
-| TIMING3.TSU_DAT | 260              | 87         | 261            | Spec. Minimum                                 |
-| TIMING4.T_BUF   | 500              | 167        | 501            | Spec. Minimum                                 |
-| TIMING4.T_STO   | 260              | 87         | 161            | Spec. Minimum                                 |
-
-This next example shows how the first SCL timing registers: `TIMING0` and `TIMING1` are altered in a high-capacitance Fast-mode Plus bus, where the physical value of t<sub>r</sub> driven to an atypical value of 400ns.
-As in the previous example the integer register values are determined based on a system clock period, t<sub>clk</sub>, of 3ns.
-All other parameters in registers `TIMING2`, `TIMING3`, `TIMING4` are unchanged from the previous example.
-
-| Parameter       | Spec. Min. (ns)  | Reg. Val.  | Phys. Val (ns) | Comment                                       |
-|-----------------|------------------|------------|----------------|-----------------------------------------------|
-| TIMING0.THIGH   | 260              | 87         | 261            | Spec. t<sub>HIGH</sub> Minimum                |
-| TIMING0.TLOW    | 500              | 167        | 501            | Spec. t<sub>LOW</sub> Minimum                 |
-| TIMING1.T_F     | 20ns * (VDD/5.5V)| 7          | 21             | Signal slew-rate should be controlled         |
-| TIMING1.T_R     | 0                | 134        | 402            | Atypically high line capacitance             |
-| SCL Period      | 1000             | N/A        | 395            | Forced longer than minimum by long T_R        |
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_i2c.h)
-
-## Register Table
-
-* [Register Table](data/i2c.hjson#registers)
diff --git a/hw/ip/i2c/doc/programmers_guide.md b/hw/ip/i2c/doc/programmers_guide.md
new file mode 100644
index 0000000000000..63872808b4e92
--- /dev/null
+++ b/hw/ip/i2c/doc/programmers_guide.md
@@ -0,0 +1,110 @@
+# Programmer's Guide
+
+## Initialization
+
+After reset, the initialization of the I2C HWIP primarily consists of four steps:
+1. Timing parameter initialization
+1. FIFO reset and configuration
+1. Interrupt configuration
+1. Enable I2C Host or Target functionality
+
+### Timing Parameter Tuning Algorithm
+
+Of the four initialization steps, the timing parameter initialization is the most involved.  With so many timing parameters, it is essential to have dedicated device interface functions (DIFs) to determine appropriate values for the 10 timing parameters.
+
+The values of these parameters will depend primarily on three bus details:
+- The speed mode of the slowest device on the bus: standard mode (100 kbaud), fast mode (400 kbaud) or fast-mode plus (1 Mbaud).
+- The input clock period, t<sub>clk</sub> in ns.
+- The expected signal rise time, t<sub>r</sub>, in ns.
+   - This is not a firmware-controlled parameter.
+Rather, it is a function of the capacitance and physical design of the bus.
+The specification provides detailed guidelines on how to manage capacitance in an I2C system:
+   - Section 5.2 of the I2C specification indicates that Fast-mode plus devices may operate at reduced clock speeds if the bus capacitance drives signal rise times (t<sub>r</sub>) outside the nominal 120ns limit.
+Excess capacitance can also be compensated for by reducing the size of the bus pullup resistor, so long as the total open-drain current does not exceed 20mA for fast-mode plus devices (as described in section 7.1 of the I2C specification).
+However the specification places a hard limit on rise times capping them at 1000ns.
+    - If there are standard- or fast-mode target devices on the bus, the specified open-drain current limit is reduced to 3mA (section 7.1), thus further restricting the minimum value of the pull-up resistor.
+    - In fast-mode bus designs, where the total line capacitance exceeds 200pF, the specification recommends replacing the pull-up resistor with an active current source, supplying 3mA or less (section 5.1).
+Regardless of the physical construction of the bus, the rise time (t<sub>r</sub>) is a system-dependent parameter that needs to be made known to firmware for I2C initialization.
+- The expected fall time, t<sub>f</sub>, in ns.
+   - Like t<sub>r</sub>, this parameter is not firmware-controlled; rather, it is a function of the SCL driver, which in a strictly compliant device is expected to manage the slew-rate for the falling edge of the SDA and SCL signals, through proper design of the SCL output buffer.
+   - See table 10 of the I2C specification for more details.
+- (optional) The desired SCL cycle period, t<sub>SCL,user</sub> in ns.
+   - By default the device should operate at the maximum frequency for that mode.
+However, if the system developer wishes to operate at slower than the mode-specific maximum, a larger than minimum period could be allowed as an additional functional parameter when calculating the timing parameters.
+
+Based on the inputs, the timing parameters may be chosen using the following algorithm:
+1. The physical timing parameters t<sub>HD,STA</sub>, t<sub>SU,STA</sub>, t<sub>HD,DAT</sub>, t<sub>SU,DAT</sub>, t<sub>BUF</sub>, t<sub>STO</sub>, t<sub>HIGH</sub>, and t<sub>LOW</sub> all have minimum allowed values which depend on the choice of speed mode (standard-mode, fast-mode or fast-mode plus).
+Using the speed mode input, look up the appropriate minimum value (in ns) for each parameter (i.e. t<sub>HD,STA,min</sub>, t<sub>SU,STA,min</sub>, etc)
+1. For each of these eight parameters, obtain an integer minimum by dividing the physical minimum parameter by the clock period and rounding up to the next highest integer:
+$$ \textrm{THIGH_MIN}=\lceil{t\_{HIGH,min}/t\_{clk}}\rceil $$
+$$ \textrm{TLOW_MIN}=\lceil{t\_{LOW,min}/t\_{clk}}\rceil $$
+$$ \textrm{THD_STA_MIN}= \lceil{t\_{HD,STA,min}/t\_{clk}}\rceil $$
+$$ \textrm{TSU_STA_MIN}= \lceil{t\_{SU,STA,min}/t\_{clk}}\rceil $$
+$$ \textrm{THD_DAT_MIN}= \lceil{t\_{HD,DAT,min}/t\_{clk}}\rceil $$
+$$ \textrm{TSU_DAT_MIN}= \lceil{t\_{SU,DAT,min}/t\_{clk}}\rceil $$
+$$ \textrm{T_BUF_MIN}= \lceil{t\_{BUF,min}/t\_{clk}}\rceil $$
+$$ \textrm{T_STO_MIN}= \lceil{t\_{STO,min}/t\_{clk}}\rceil $$
+
+1. Input the integer timing parameters, THD_STA_MIN, TSU_STA_MIN, THD_DAT_MIN, TSU_DAT_MIN, T_BUF_MIN and T_STO_MIN into their corresponding registers (`TIMING2.THD_STA`, `TIMING2.TSU_STA`, `TIMING3.THD_DAT`, `TIMING3.TSU_DAT`, `TIMING4.T_BUF`, `TIMING4.T_STO`)
+    - This step allows the firmware to manage SDA signal delays to ensure that the SDA outputs are compliant with the specification.
+    - The registers `TIMING0.THIGH` and `TIMING0.TLOW` will be taken care of in a later step.
+1. Take the given values for t<sub>f</sub> and t<sub>r</sub> and convert them to integer counts as well:
+$$ \textrm{T_R}= \lceil{t\_{r}/t\_{clk}}\rceil $$
+$$ \textrm{T_F}= \lceil{t\_{f}/t\_{clk}}\rceil $$
+1. Store T_R and T_F in their corresponding registers: `TIMING1.T_R` and `TIMING1.T_F`.
+1. Based on the input speed mode, look up the maximum permissible SCL frequency (f<sub>SCL,max</sub>) and calculate the minimum permissible SCL period:
+$$ t\_{SCL,min}= 1/f\_{SCL,max} $$
+1. As with each of the other physical parameters, convert t<sub>SCL,min</sub> and, if provided, t<sub>SCL,user</sub> to integers, MINPERIOD and USERPERIOD.
+$$ MINPERIOD = \lceil{t\_{SCL,min}/t\_{clk}}\rceil $$
+$$ USERPERIOD = \lceil{t\_{SCL,user}/t\_{clk}}\rceil $$
+1. Let PERIOD=max(MINPERIOD, USERPERIOD).
+1. Each SCL cycle will now be at least PERIOD clock cycles in duration, divided between four segments: T_R, THIGH, T_F, and TLOW.
+    - In other words: PERIOD=T_R+THIGH+T_F+TLOW.
+    - With T_R and T_F already established, the remaining integer parameters THIGH and TLOW are to be divided among the remaining clock cycles in PERIOD:
+$$ \textrm{THIGH}+\textrm{TLOW} \ge\textrm{PERIOD}-\textrm{T_F}-\textrm{T_R} $$
+    - Since t<sub>HIGH</sub> and t<sub>LOW</sub> both have minimum allowable values, which depend on the mode, high values of t<sub>r</sub> or t<sub>f</sub> may force an increase in the total SCL period, slowing down the data transit rate.
+    - The balance between t<sub>HIGH</sub> and t<sub>LOW</sub> can be manipulated in a variety of different ways (depending on the desired SCL duty cycle).
+    - It is, for instance, perfectly acceptable to simply set TLOW to the minimum possible value:
+$$ \textrm{TIMING0.TLOW}=\textrm{TLOW_MIN} $$
+1. THIGH is then set to satisfy both constraints in the desired SCL period and in the minimum permissible values for t<sub>HIGH</sub>:
+$$ \textrm{TIMING0.THIGH}=\max(\textrm{PERIOD}-\textrm{T_R} - \textrm{TIMING0.TLOW} -\textrm{T_F}, \textrm{THIGH_MIN}) $$
+
+
+#### Timing parameter examples
+
+The following tables show a couple of examples for calculating timing register parameters for Fast-mode Plus devices.
+Both examples assume a desired datarate of 1 Mbaud (the bus maximum) for an SCL period of 1us, and an internal device clock period of 3ns.
+
+| Parameter       | Spec. Min. (ns)  | Reg. Val.  | Phys. Val (ns) | Comment                                         |
+|-----------------|------------------|------------|----------------|-----------------------------------------------|
+| TIMING0.THIGH   | 260              | 120        | 360            | Chosen to satisfy SCL Period Minimum          |
+| TIMING0.TLOW    | 500              | 167        | 501            | Spec. t<sub>LOW</sub> Minimum                 |
+| TIMING1.T_F     | 20ns * (VDD/5.5V)| 7          | 21             | Signal slew-rate should be controlled         |
+| TIMING1.T_R     | 0                | 40         | 120            | Based on pull-up resistance, line capacitance |
+| SCL Period      | 1000             | N/A        | 1002           | Constraint on THIGH+TLOW+T_R+T_F              |
+| TIMING2.THD_STA | 260              | 87         | 261            | Spec. Minimum                                 |
+| TIMING2.TSU_STA | 260              | 87         | 261            | Spec. Minimum                                 |
+| TIMING3.THD_DAT | 0                | 0          | 0              | Spec. Minimum                                 |
+| TIMING3.TSU_DAT | 260              | 87         | 261            | Spec. Minimum                                 |
+| TIMING4.T_BUF   | 500              | 167        | 501            | Spec. Minimum                                 |
+| TIMING4.T_STO   | 260              | 87         | 261            | Spec. Minimum                                 |
+
+This next example shows how the first SCL timing registers: `TIMING0` and `TIMING1` are altered in a high-capacitance Fast-mode Plus bus, where the physical value of t<sub>r</sub> is driven to an atypical value of 400ns.
+As in the previous example the integer register values are determined based on a system clock period, t<sub>clk</sub>, of 3ns.
+All other parameters in registers `TIMING2`, `TIMING3`, `TIMING4` are unchanged from the previous example.
+
+| Parameter       | Spec. Min. (ns)  | Reg. Val.  | Phys. Val (ns) | Comment                                       |
+|-----------------|------------------|------------|----------------|-----------------------------------------------|
+| TIMING0.THIGH   | 260              | 87         | 261            | Spec. t<sub>HIGH</sub> Minimum                |
+| TIMING0.TLOW    | 500              | 167        | 501            | Spec. t<sub>LOW</sub> Minimum                 |
+| TIMING1.T_F     | 20ns * (VDD/5.5V)| 7          | 21             | Signal slew-rate should be controlled         |
+| TIMING1.T_R     | 0                | 134        | 402            | Atypically high line capacitance             |
+| SCL Period      | 1000             | N/A        | 1185           | Forced longer than minimum by long T_R        |
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_i2c.h)
+
+## Register Table
+
+* [Register Table](../data/i2c.hjson#registers)
diff --git a/hw/ip/i2c/doc/theory_of_operation.md b/hw/ip/i2c/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..35feeccd5ebc3
--- /dev/null
+++ b/hw/ip/i2c/doc/theory_of_operation.md
@@ -0,0 +1,306 @@
+# Theory of Operation
+
+## Block Diagram
+
+![](../doc/I2C_block_diagram.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/i2c.hjson#interfaces)
+
+## Design Details
+
+### Functional Modes
+
+I2C IP is a host-target combo that can function as either an I2C host or an I2C target.
+Although it is conceivable that an I2C combo can optionally function as both a host and a target at the same time, we do not support this feature at this time.
+These functional modes are enabled at runtime by setting the register fields [`CTRL.ENABLEHOST`](../data/i2c.hjson#ctrl) and [`CTRL.ENABLETARGET`](../data/i2c.hjson#ctrl).
+
+### Virtual Open Drain
+
+In devices which lack a true open drain buffer functionality, this IP implements a "virtual Open Drain" functionality.
+The SDA and SCL outputs are assumed to be connected to a tri-state buffer, with independent enable outputs for both signals.
+
+Rather than toggling the buffer inputs, the buffer inputs are *continuously asserted low*, and instead the buffer *enable* signals are toggled.
+The SDA or SCL buffers are enabled for a logical "Low" output on the respective signal, and are disabled for logical "High" outputs.
+This arrangement allows the output pins to float high if there is no conflict from external devices, or to be pulled low if there is a conflict (as is required for clock-stretching or--in future revisions-- multi-host functionality).
+
+This arrangement is necessary for FPGA builds.
+
+
+### Override Mode for Direct Pin Access
+
+The I2C hardware interface consists of two external pins, SCL and SDA, whose behavior is described in the [I2C specification](https://www.nxp.com/docs/en/user-guide/UM10204.pdf).
+These pins are typically controlled by an internal state machine.
+However, there is a simpler "override" mode, by which these pins can be directly manipulated by software.
+This override mode is useful for troubleshooting or error-recovery.
+
+To enter override mode, the register field [`OVRD.TXOVRDEN`](../data/i2c.hjson#ovrd) is asserted by software.
+In this state the output drivers `scl_tx_o` and `sda_tx_o` are controlled directly by the register fields [`OVRD.SCLVAL`](../data/i2c.hjson#ovrd) and [`OVRD.SDAVAL`](../data/i2c.hjson#ovrd).
+When [`OVRD.SCLVAL`](../data/i2c.hjson#ovrd) and [`OVRD.SDAVAL`](../data/i2c.hjson#ovrd) are set high, the virtual open drain configuration will leave the output resistively pulled high, and controllable by remote targets.
+In this state, with SCL or SDA asserted high, the register fields [`VAL.SCL_RX`](../data/i2c.hjson#val) and [`VAL.SDA_RX`](../data/i2c.hjson#val) can be used to receive inputs (including remote acknowledgments) from target devices.
+
+#### FSM control of SCL and SDA
+
+While in host mode, SCL and SDA are generated through the internal state machine.
+Since SCL is directly decoded from the states, it can have short glitches during transition which the external target may be sensitive to if it is not using an over-sampling scheme.
+To counter this, the SCL and SDA outputs from the internal state machine are flopped before they are emitted.
+
+This adds a one cycle module clock delay to both signals.
+If the module clock is sufficiently faster than I2C line speeds (for example 20MHz), this is not an issue.
+However if the line speeds and the module clock speeds become very close (2x), the 1 cycle delay may have an impact, as the internal state machine may mistakenly think it has sampled an SDA that has not yet been updated.
+
+It is thus recommended to run clock ratios such that the internal module clock is at least 5x-10x the line speeds.
+
+### Byte-Formatted Programming Mode
+
+This section applies to I2C in the host mode.
+The state machine-controlled mode allows for higher-speed operation with less frequent software interaction.
+In this mode, the I2C pins are controlled by the I2C state machine, which in turn is controlled by a sequence of formatting indicators.
+The formatting indicators indicate:
+- The sequence of bytes which should be transmitted on the SDA and SCL pins.
+- The periods between transmitted bytes when the state-machine should stop transmission and instead read back a fixed number of bytes.
+- Which bytes should be preceded by a START symbol.
+- Which bytes should be followed by a STOP symbol
+The format indicator consists of 13-bits.
+That is, one single Format Byte (entered into the format FIFO through [`FDATA.FBYTE`](../data/i2c.hjson#fdata)), and five (5) 1-bit flags (entered into the format FIFO through registers [`FDATA.READ`](../data/i2c.hjson#fdata), [`FDATA.RCONT`](../data/i2c.hjson#fdata), [`FDATA.START`](../data/i2c.hjson#fdata), [`FDATA.STOP`](../data/i2c.hjson#fdata) and [`FDATA.NAKOK`](../data/i2c.hjson#fdata)).
+
+The I2C reads each format indicator from the head of FMT_FIFO, and processes them in turn.
+If none of the flags are set for the format indicator, the I2C FSM simply transmits the Format Byte onto the SCL and SDA pins according to the specification, waits for acknowledgement, and then proceeds to the next format indicator.
+The format flags modulate the behavior as follows.
+- READ (corresponds to [`FDATA.READ`](../data/i2c.hjson#fdata)):
+Signifies the Format Byte ([`FDATA.FBYTE`](../data/i2c.hjson#fdata)) should be treated as an unsigned number, R, and prompts the state machine to read R bytes from the target device.
+Bytes read from the bus are inserted into the RX FIFO where they can be accessed by software.
+A value of 0 is treated as a read of 256B.
+To read a larger byte stream, multiple 256B reads can be chained together using the RCONT flag.
+- RCONT (corresponds to FIFO inputs [`FDATA.RCONT`](../data/i2c.hjson#fdata), only used with READ):
+    - If RCONT is set, the Format Byte represents part of a longer sequence of reads, allowing for reads to be chained indefinitely.
+    - The RCONT flag indicates that the final byte returned with the current read should be responded to with an ACK, allowing the target to continue sending data.
+(Note that the first R-1 bytes read will still be acknowledged regardless of whether RCONT is asserted or not.)
+- START (corresponds to [`FDATA.START`](../data/i2c.hjson#fdata), Ignored when used with READ):
+Issue a START condition before transmitting the Format Byte on the bus.
+    - This flag may also be used to issue a repeated start condition.
+- STOP (corresponds to [`FDATA.STOP`](../data/i2c.hjson#fdata)):
+Issue a STOP signal after processing this current entry in the FMT FIFO.
+    - Note that this flag is not compatible with (READ & RCONT), and will cause bus conflicts.
+- NAKOK (corresponds to [`FDATA.NAKOK`](../data/i2c.hjson#fdata), Not compatible with READ):
+Typically every byte transmitted must also receive an ACK signal, and the IP will raise an exception if no ACK is received.
+However, there are some I2C commands which do not require an ACK.
+In those cases this flag should be asserted with FBYTE indicating no ACK is expected and no interrupt should be raised if the ACK is not received.
+
+### Target Address Registers
+
+I2C target device is assigned two 7-bit address and 7-bit mask pairs.
+The target device accepts a transaction if the result of the bitwise AND operation performed on the transaction address sent by the host and a mask matches the assigned address corresponding to the mask.
+In other words, address matching is performed only for bits where the mask is "1".
+Thus, with the masks set to all ones (0x7F), the target device will respond to either of the two assigned unique addresses and no other.
+If the mask and the assigned address both have zeros in a particular bit position, that bit will be a match regardless of the value of that bit received from the host.
+Note that if, in any bit position, the mask has zero and the assigned address has one, no transaction can match and such mask/address pair is effectively disabled.
+The assigned address and mask pairs are set in registers [`TARGET_ID.ADDRESS0`](../data/i2c.hjson#target_id), [`TARGET_ID.MASK0`](../data/i2c.hjson#target_id), [`TARGET_ID.ADDRESS1`](../data/i2c.hjson#target_id), and [`TARGET_ID.MASK1`](../data/i2c.hjson#target_id).
+
+### Acquired Formatted Data
+
+This section applies to I2C in the target mode.
+When the target accepts a transaction, it inserts the transaction address, read/write bit, and START signal sent by the host into ACQ FIFO where they can be accessed by software.
+ACQ FIFO output corresponds to [`ACQDATA`](../data/i2c.hjson#acqdata).
+If the transaction is a write operation (R/W bit = 0), the target proceeds to read bytes from the bus and insert them into ACQ FIFO until the host terminates the transaction by sending a STOP or a repeated START signal.
+A STOP or repeated START indicator is inserted into ACQ FIFO as the next entry following the last byte received, in which case other bits may be junk.
+The following diagram shows consecutive entries inserted into ACQ FIFO during a write operation:
+
+![](../doc/I2C_acq_fifo_write.svg)
+
+If the transaction is a read operation (R/W bit = 1), the target pulls bytes out of TX FIFO and transmits them to the bus until the host signals the end of the transfer by sending a NACK signal.
+If TX FIFO holds no data, or if the ACQ FIFO contains more than 1 entry, the target will hold SCL low to stretch the clock and give software time to write data bytes into TX FIFO or handle the available command.
+See [Stretching during read](#stretching-during-read) for more details.
+TX FIFO input corresponds to [`TXDATA`](../data/i2c.hjson#txdata).
+Typically, a NACK signal is followed by a STOP or repeated START signal and the IP will raise an exception if the host sends a STOP signal after an ACK.
+An ACK/NACK signal is inserted into the ACQ FIFO as the first bit (bit 0), in the same entry with a STOP or repeated START signal.
+For ACK and NACK signals, the value of the first bit is 0 and 1, respectively.
+The following diagram shows consecutive entries inserted into ACQ FIFO during a read operation:
+
+![](../doc/I2C_acq_fifo_read.svg)
+
+The ACQ FIFO entry consists of 10 bits:
+- Address (bits 7:1) and R/W bit (bit 0) or data byte
+- Format flags (bits 9:8)
+The format flags indicate the following signals received from the host:
+- START: 01
+- STOP: 10
+- repeated START: 11
+- No START, or STOP: 00
+
+### Timing Control Registers
+
+For standard mode, fast-mode and fast-mode plus, the timing requirements for each transaction are detailed in Table 10 of the [I2C specification](https://www.nxp.com/docs/en/user-guide/UM10204.pdf).
+In order to claim complete compatibility at each mode, the state machine timings need to be adapted to whether there are Standard-mode, Fast-mode and Fast-mode Plus targets on the bus.
+Furthermore, depending on the actual capacitance of the bus, even a bus with all Fast-mode Plus capable targets may have to operate at slower speeds than 1Mbaud.
+For example, the host may need to run at lower frequencies, as discussed in Section 5.2 of the specification, but the computation of the nominal frequency will depend on timing specifications in Table 10, in this case particularly, the limits on t<sub>LOW</sub>, t<sub>HIGH</sub>, t<sub>r</sub>, and t<sub>f</sub>.
+Assuming no clock stretching, for a given set of these four parameters the baud rate is then given to be:
+$$ 1/f\_{SCL}=t\_{LOW}+t\_{HIGH}+t\_{r}+t\_{f}. $$
+
+Thus in order to ensure compliance with the spec in any particular configuration, software will program the I2C host IP with explicit values for each of the following timing parameters, as defined in Figure 38 of the specification.
+- t<sub>LOW</sub>: set in register [`TIMING0.TLOW`](../data/i2c.hjson#timing0).
+- t<sub>HIGH</sub>: set in register [`TIMING0.THIGH`](../data/i2c.hjson#timing0).
+- t<sub>r</sub>: set in register [`TIMING1.T_R`](../data/i2c.hjson#timing1).
+(Note: The rise time cannot be explicitly controlled by internal hardware, and will be a function of the capacitance of the bus.
+Thus this parameter is largely budgetary, meaning that it tells the state machine how much time to wait for an RC rise.)
+- t<sub>f</sub>: set in register [`TIMING1.T_F`](../data/i2c.hjson#timing1).
+(Note: The fall time cannot be explicitly controlled by internal hardware, and is a function of the pin driver.
+Thus this parameter is also budgetary.
+The actual fall time cannot be controlled to stay above the minimum values set in Table 10 of the specification, and so in this regard this module is currently not strictly compliant to the I2C spec.)
+- t<sub>SU,STA</sub>: set in register [`TIMING2.TSU_STA`](../data/i2c.hjson#timing2)
+- t<sub>HD,STA</sub>: set in register [`TIMING2.THD_STA`](../data/i2c.hjson#timing2)
+- t<sub>SU,DAT</sub>: set in register [`TIMING3.TSU_DAT`](../data/i2c.hjson#timing3).
+Taken to be synonymous with T<sub>SU,ACK</sub>
+- t<sub>HD,DAT</sub>: set in register [`TIMING3.THD_DAT`](../data/i2c.hjson#timing3).
+Taken to be synonymous with T<sub>HD,ACK</sub>.
+Moreover, since the pin driver fall time is likely to be less than one clock cycle, this parameter is also taken to be synonymous with the parameters T<sub>VD,DAT</sub> and T<sub>VD,ACK</sub>
+- t<sub>SU,STO</sub>: set in register [`TIMING4.TSU_STO`](../data/i2c.hjson#timing4).
+- t<sub>BUF</sub>: set in register [`TIMING4.T_BUF`](../data/i2c.hjson#timing4)
+
+The values programmed into the registers [`TIMING0`](../data/i2c.hjson#timing0) through [`TIMING4`](../data/i2c.hjson#timing4) are to be expressed in units of the bus clock period.
+Note in order to ensure compliance with the I2C spec, firmware must program these registers with values within the ranges laid out in Table 10 of the specification.
+These values can be directly computed using DIFs given the desired speed standard, the desired operating frequency, and the actual line capacitance.
+These timing parameters are then fed directly to the I2C state machine to control the bus timing.
+
+A detailed description of the algorithm for determining these parameters--as well as a couple of concrete examples--are given in the [Programmer's Guide](../doc/programmers_guide.md#timing-parameter-tuning-algorithm).
+
+### Timeout Control
+A malfunctioning (or otherwise very slow) target device can hold SCL low indefinitely, stalling the bus.
+For this reason [`TIMEOUT_CTRL`](../data/i2c.hjson#timeout_ctrl) provides a clock-stretching timeout mechanism to notify firmware of this sort of condition.
+If [`TIMEOUT_CTRL.EN`](../data/i2c.hjson#timeout_ctrl) is asserted, an interrupt will be asserted when the IP detects that another device (a target or, in possible future revisions, an alternate host) has been holding SCL low for more than [`TIMEOUT_CTRL.VAL`](../data/i2c.hjson#timeout_ctrl) clock ticks.
+
+
+This feature is added as a utility, though it is not required by the I2C specification.
+However, in some applications it could be used in protocols which build upon I2C.
+For instance, SMBus applications using this IP could in principle use this to support SMBus timeouts.
+(Note: This is just an example application of this feature.
+Other features may also be required for complete SMBus functionality.)
+
+### Clock Stretching
+As described in the I2C specification, a target device can pause a transaction by holding SCL low.
+There are 3 cases in which this design stretches the clock.
+In all cases described below, a target begins to stretch the clock after the ACK bit.
+In the first two scenarios, it is after the ACK bit sent by the target, in the last scenario, it is after the host's ACK bit.
+
+#### Stretching after address read
+- When a target device receives a start, the address and R/W bit are written into the ACQ FIFO.
+- If there is no space in the ACQ FIFO to receive such a write, the target stretches the clock after the ACK bit and waits for software to make space.
+- The `acq_full` interrupt is generated to alert software to such a situation.
+
+#### Stretching during write
+- Similar to the scenario above, if the host tries to write a data byte into the ACQ FIFO when there is no available space, the clock is also stretched after the ACK bit.
+- The `acq_full` interrupt is generated to alert software to such a situation.
+
+#### Stretching during read
+- When a target device receives a start and read command, it may stretch the clock for either of the following two reasons.
+  - If there is no data available to be sent back (TX FIFO empty case), the target stretches the clock until data is made available by software.
+  - If there is more than 1 entry in the ACQ FIFO.
+    - Having more than 1 entry in the ACQ FIFO suggests there is potentially an unhandled condition (STOP / RESTART) or an unhandled command (START) that requires software intervention before the read can proceed.
+- The `tx_stretch` interrupt is generated to alert software to such a situation.
+
+
+### Interrupts
+The I2C module has a few interrupts including general data flow interrupts and unexpected event interrupts.
+
+#### Host Mode
+If the RX FIFO exceeds the designated depth of entries, the interrupt `rx_threshold` is raised to inform firmware.
+Firmware can configure the threshold value via the register [`FIFO_CTRL.RXILVL`](../data/i2c.hjson#fifo_ctrl).
+
+Meanwhile, if the FMT FIFO level falls below a designated depth of entries the `fmt_threshold` interrupt is raised.
+(Note that this behavior differs from similar interrupts in other modules, such as the UART IP module.)
+Firmware can configure the threshold value via the register [`FIFO_CTRL.FMTILVL`](../data/i2c.hjson#fifo_ctrl).
+
+If either FIFO receives an additional write request when its FIFO is full, the interrupt `fmt_overflow` or `rx_overflow` is asserted and the format indicator or character is dropped.
+
+If the module transmits a byte, but receives no ACK signal, the `nak` interrupt is usually asserted.
+In cases where a byte is transmitted and no ACK is expected or required, that byte should be submitted with NAKOK flag also asserted.
+
+When the I2C module is in transmit mode, the `scl_interference` or `sda_interference` interrupts will be asserted if the IP identifies that some other device (host or target) on the bus is forcing either signal low and interfering with the transmission.
+It should be noted that the `scl_interference` interrupt is not raised in the case when the target device is stretching the clock.
+(However, it may be raised if the target allows SCL to go high and then pulls SCL down before the end of the current clock cycle.)
+
+A target device should never assert 0 on the SDA lines, and in the absence of multi-host support, the `sda_interference` interrupt is raised whenever the host IP detects that another device is pulling SDA low.
+
+On the other hand, it is legal for a target device to assert SCL low for clock stretching purposes.
+With clock stretching, the target can delay the start of the following SCL pulse by holding SCL low between clock pulses.
+However the target device must assert SCL low before the start of the SCL pulse.
+If SCL is pulled low during an SCL pulse which has already started, this interruption of the SCL pulse will be registered as an exception by the I2C core, which will then assert the `scl_interference` interrupt.
+
+```wavejson
+{signal: [
+  {name: 'Clock', wave: 'p.....|.......|......'},
+  {name: 'SCL Host Driver', wave: '0.z..0|.z....0|..z.x.'},
+  {name: 'SCL Target Driver', wave: 'z.....|0..z...|...0..'},
+  {name: 'SCL bus', wave: '0.u..0|...u..0|..u0..'},
+  {name: 'scl_interference', wave: '0.....|.......|....1.'},
+],
+  head: {text: 'SCL pulses: Normal SCL pulse (Cycle 3),  SCL pulse with clock stretching (cycle 11), and SCL interference (interrupted SCL pulse)',tick:1}}
+```
+
+
+Though normal clock stretching does not count as SCL interference, if the module detects that a target device has held SCL low and stretched any given SCL cycle for more than [`TIMEOUT_CTRL.VAL`](../data/i2c.hjson#timeout_ctrl) clock ticks this will cause the stretch timeout interrupt to be asserted.
+This interrupt is suppressed, however, if [`TIMEOUT_CTRL.EN`](../data/i2c.hjson#timeout_ctrl) is deasserted low.
+
+```wavejson
+{signal: [
+  {name: 'Clock', wave: 'p............'},
+  {name: 'SCL Host Driver', wave: '0..z.......x.'},
+  {name: 'SCL Target Driver', wave: 'z0...........'},
+  {name: 'SCL bus', wave: '0............'},
+  {name: 'TIMEOUT_CTRL.VAL', wave: '2............', data: "8"},
+  {name: 'SCL timeout counter', wave: '2...22222222x', data: '0 1 2 3 4 5 6 7 8'},
+  {name: 'TIMEOUT_CTRL.EN', wave: '1............'},
+  {name: 'scl_timeout', wave: '0..........1.'},
+],
+  head: {text: 'SCL Timeout Example',tick:-3}}
+```
+
+Except for START and STOP symbols, the I2C specification requires that the SDA signal remains constant whenever SCL is high.
+The `sda_unstable` interrupt is asserted if, when receiving data or an acknowledgement pulse, the value of the SDA signal does not remain constant over the duration of the SCL pulse.
+
+Transactions are terminated by a STOP signal.
+The host may send a repeated START signal instead of a STOP, which also terminates the preceding transaction.
+In both cases, the `cmd_complete` interrupt is asserted, in the beginning of a repeated START or at the end of a STOP.
+
+
+#### Target Mode
+
+The interrupt `cmd_complete` is asserted whenever a RESTART or a STOP bit is observed by the target.
+
+The interrupt `tx_stretch` is asserted whenever target intends to transmit data but cannot.
+See [Stretching during read](#stretching-during-read) for more details.
+
+When a host receives enough data from a target, it usually signals the end of the transaction by sending a NACK followed by a STOP or a repeated START.
+In a case when a target receives a STOP without the prerequisite NACK, the interrupt `unexp_stop` is asserted.
+This interrupt just means that a STOP was unexpectedly observed during a host read.
+It is not necessarily harmful, but software can be made aware just in case.
+
+If ACQ FIFO becomes full, the interrupt `acq_full` is asserted.
+
+If a host ceases to send SCL pulses at any point during an ongoing transaction, the target waits for a specified time period and then asserts the interrupt `host_timeout`.
+Host sending an address and R/W bit to all target devices, writing to the selected target, or reading from the target are examples of ongoing transactions.
+The time period is counted from the last low-to-high SCL transition.
+Firmware can configure the timeout value via the register [`HOST_TIMEOUT_CTRL`](../data/i2c.hjson#host_timeout_ctrl).
+
+### Implementation Details: Format Flag Parsing
+
+To illustrate the behavior induced by various flags added to the formatting queue, the following figure shows a simplified version of the I2C_Host state machine.
+In this simplified view, many sequential states have been collapsed into four sub-sequences of states (shown in square brackets) or have their names abbreviated:
+- Issue start
+- Issue stop
+- Transmit Byte
+- Read Bytes
+
+Within each of these sub-sequences, state transitions depend only on the SDA/SCL inputs or internal timers.
+Each sub-sequence has a terminal event--generically labeled "[completed]" which prompts the transition to another sequence or state.
+
+However, all transitions which are dependent on formatting flags are shown explicitly in this figure.
+
+![](../doc/I2C_state_diagram.svg)
+
+Similarly, the figure below shows a simplified version of the I2C_Target state machine.
+
+![](../doc/I2C_state_diagram_target.svg)
+
+In this diagram, "R/W" stands for a R/W bit value. The host is reading when R/W bit is "1" and writing when R/W bit is "0".
diff --git a/hw/ip/keymgr/README.md b/hw/ip/keymgr/README.md
index 5ec7f083bcf3e..acc563d282e1e 100644
--- a/hw/ip/keymgr/README.md
+++ b/hw/ip/keymgr/README.md
@@ -19,530 +19,3 @@ This document specifies the functionality of the OpenTitan key manager.
 The key manager implements the hardware component of the [identities and root keys](https://docs.opentitan.org/doc/security/specs/identities_and_root_keys/) strategy of OpenTitan.
 
 It enables the system to shield critical assets from software directly and provides a simple model for software to use derived key and identity outputs.
-
-# Theory of Operation
-
-Key manager behavior can be summarized by the functional model below.
-
-![Key Manager Functional Model](./doc/keymgr_functional_model.svg)
-
-In the diagram, the red boxes represent the working state and the associated internal key, the black ovals represent derivation functions, the green squares represent software inputs, and the remaining green / purple shapes represent outputs to both software and hardware.
-
-In OpenTitan, the derivation method selected is [KMAC](../kmac/README.md).
-Each valid operation involves a KMAC invocation using the key manager internal key and other HW / SW supplied inputs as data.
-While KMAC can generate outputs of arbitrary length, this design fixes the size to 256b.
-
-Effectively, the key manager behavior is divided into 3 classes of functions
-*  Key manager state advancement
-   *  The results are never visible to software and not directly usable by any software controlled hardware
-
-*  Output key generation
-   *  Results can be visible to software or consumed by hardware (sideload)
-
-*  Identity / seed generation
-   *  Results are always visible to software and used for asymmetric cryptography
-
-In general, the key generation and seed generation functions are identical.
-They differ only in how software chooses to deploy the outputs.
-
-For clarity, all commands issued to the key manager by software are referred to as operations.
-Transactions refer to the interaction between key manager and KMAC if a valid operation is issued.
-
-## Key Manager State
-
-The key manager working state (red boxes in the functional model) represents both the current state of the key manager as well as its related internal key.
-Each valid state (`Initialized` / `CreatorRootKey` / `OwnerIntermediateKey` / `OwnerRootKey`), supplies its secret material as the "key" input to a KMAC operation.
-Invalid states, such as `Reset / Disabled` on the other hand, either do not honor operation requests, or supplies random data when invoked.
-
-The data input is dependent on each state, see below.
-
-### Reset
-To begin operation, the state must first transition to Initialize.
-The advancement from `Reset` to `Initialized` is irreversible during the current power cycle.
-Until the initialize command is invoked, the key manager rejects all other software commands.
-
-### Initialized
-
-When transitioning from `Reset` to `Initialized`, random values obtained from the entropy source are used to populate the internal key first.
-Then the root key stored in OTP, if valid, is loaded into the internal key.
-This ensures that the hamming delta from the previous value to the next value is non-deterministic.
-The advancement from `Initialized` to `CreatorRootKey` is irreversible during the current power cycle.
-
-### CreatorRootKey
-
-`CreatorRootKey` is the first operational state of the key manager.
-When transitioning from `Initialized` to this state, a KMAC operation is invoked using the `RootKey` as the key (from OTP), and the remaining inputs as data.
-The output of the KMAC operation replaces the previous value of the internal key, and the new value becomes the `CreatorRootKey`.
-
-Inputs to the derivation function are:
-*  `DiversificationKey`: Secret seed from flash
-*  `HealthMeasurement`: Current life cycle state
-   *  To avoid a state value corresponding to each life cycle state, the raw life cycle value is not used.
-   *  Instead, certain life cycle states diversify the same way.
-   *  Please see the life cycle controller for more details.
-*  `DeviceIdentifier`: Unique device identification.
-*  `HardwareRevisionSecret`: A global design time constant.
-
-Other than the `DiversificationKey` and `HardwareRevisionSecret`, none of the values above are considered secret.
-
-Once the `CreatorRootKey` is reached, software can request key manager to advance state, generate output key or generate output identity.
-The key used for all 3 functions is the `CreatorRootKey`.
-
-The advancement from `CreatorRootKey` to the `OwnerIntermediateKey` is irreversible during the current power cycle.
-
-Keymgr reads the root key from OTP in a single clock cycle. It assumes that when keymgr's internal FSM reaches to this clock cycle, OTP root key is already available (`valid` is set to 1). Otherwise, keymgr skips loading the root key.
-
-### OwnerIntermediateKey
-
-This is the second operational state of the key manager.
-This state is reached through another invocation of the KMAC operation using the previous internal key, and other inputs as data.
-The output of the KMAC operation replaces the previous value of the internal key, and the new value becomes the `OwnerIntermediateKey`.
-
-The relevant data inputs are:
-*  `OwnerRootSecret`: Secret seed from flash.
-*  `SoftwareBinding`: A software programmed value representing the first owner code to be run.
-
-Once the `OwnerIntermediateKey` is created, software can request key manager to advance state, generate output key or generate output identity.
-The key used for all 3 functions is the `OwnerIntermediateKey`.
-
-The advancement from `OwnerIntermediateKey` to the `OwnerRootKey` is irreversible during the current power cycle.
-
-### OwnerRootKey
-
-This is the last operational state of the key manager.
-This state is reached through another invocation of the KMAC operation using the previous internal key, and other inputs as data.
-The output of the KMAC operation replaces the previous value of the internal key, and the new value becomes the `OwnerRootKey`.
-
-The relevant inputs are:
-*   `SoftwareBinding` - A software programmed value representing the owner kernel code.
-
-Once the `OwnerRootKey` is created, software can request key manager to advance state, generate output key or generate output identity.
-An advance command invoked from `OwnerRootKey` state simply moves the state to `Disabled`.
-
-The generate output and generate identity functions use `OwnerRootKey` as the KMAC key.
-The advancement from `OwnerRootKey` to the `Disabled` is irreversible during the current power cycle.
-
-### Disabled
-`Disabled` is a state where the key manager is no longer operational.
-Upon `Disabled` entry, the internal key is updated with KMAC computed random values; however, previously generated sideload key slots and software key slots are preserved.
-This allows the software to keep the last valid keys while preventing the system from further advancing the valid key.
-
-When advance and generate calls are invoked from this state, the outputs and keys are indiscriminately updated with randomly computed values.
-Key manager enters disabled state based on direct invocation by software:
-* Advance from `OwnerRootKey`
-* Disable operation
-
-### Invalid
-`Invalid` state is entered whenever key manager is deactivated through the [life cycle connection](#life-cycle-connection) or when an operation encounters a [fault](#faults-and-operational-faults) .
-Upon `Invalid` entry, the internal key, the sideload key slots and the software keys are all wiped with entropy directly.
-
-#### Invalid Entry Wiping
-Since the life cycle controller can deactivate the key manager at any time, the key manager attempts to gracefully handle the wiping process.
-When deactivated, the key manager immediately begins wiping all keys (internal key, hardware sideload key, software key) with entropy.
-However, if an operation was already ongoing, the key manager waits for the operation to complete gracefully before transitioning to invalid state.
-
-While waiting for the operation to complete, the key manager continuously wipes all keys with entropy.
-
-### Invalid and Disabled State
-
-`Invalid` and `Disabled` states are functionally very similar.
-The main difference between the two is "how" the states were reached and the entry behavior.
-
-`Disabled` state is reached through intentional software commands where the sideload key slots and software key are not wiped, while `Invalid` state is reached through life cycle deactivation or operational faults where the internal key, sideload key slots and software key are wiped.
-
-This also means that only `Invalid` is a terminal state.
-If after entering `Disabled` life cycle is deactivated or a fault is encountered, the same [invalid entry procedure](#Invalid) is followed to bring the system to a terminal `Invalid` state.
-
-If ever multiple conditions collide (a fault is detected at the same time software issues disable command), the `Invalid` entry path always takes precedence.
-
-## Life Cycle Connection
-The function of the key manager is directly managed by the [life cycle controller](../lc_ctrl/README.md#key-manager-en).
-
-Until the life cycle controller activates the key manager, the key manager does not accept any software commands.
-Once the key manager is activated by the life cycle controller, it is then allowed to transition to the various states previously [described](#key-manager-states).
-
-When the life cycle controller deactivates the key manager, the key manager transitions to the `Invalid` state.
-
-## Commands in Each State
-During each state, there are 3 valid commands software can issue:
-*  Advance state
-*  Output generation
-*  Identity generation
-
-The software is able to select a command and trigger the key manager FSM to process one of the commands.
-If a command is valid during the current working state, it is processed and acknowledged when complete.
-
-If a command is invalid, the behavior depends on the current state.
-If the current state is `Reset`, the invalid command is immediately rejected as the key manager FSM has not yet been initialized.
-If the current state is any other state, the key manager sequences random, dummy data to the KMAC module, but does not update internal key, sideload key slots or software keys.
-For each valid command, a set of inputs are selected and sequenced to the KMAC module.
-
-During `Disable` and `Invalid` states, the internal key, sideload key slots and software key are updated based on the input commands as with normal states.
-There are however a few differences:
--  The updates are made regardless of any error status to ensure their values are further scrambled.
--  Instead of normal input data, random data is selected for KMAC processing.
--  All operations return an invalid operations error, in addition to any other error that might naturally occur.
-
-## Generating Output Key
-The generate output command is composed of 2 options
-*  Generate output key for software, referred to as `generate-output-sw`
-*  Generate output key for hardware, referred to as `generate-output-hw`
-
-The hardware option is meant specifically for symmetric sideload use cases.
-When this option is issued, the output of the KMAC invocation is not stored in software visible registers, but instead in hardware registers that directly output to symmetric primitives such as AES, KMAC and OTBN.
-
-## KMAC Operations
-All invoked KMAC operations expect the key in two shares.
-This means the internal key, even though functionally 256b, is maintained as 512b.
-The KMAC processed outputs are also in 2-shares.
-For `generate-output-sw` commands, software is responsible for determining whether the key manager output should be preserved in shares or combined.
-
-## Errors, Faults and Alerts
-
-The key manager has two overall categories of errors:
-* Recoverable errors
-* Fatal errors
-
-Recoverable errors are those likely to have been introduced by software and not fatal to the key manager or the system.
-Fatal errors are logically impossible errors that have a high likelihood of being a fault and thus fatal.
-
-Each category of error can be further divided into two:
-* Synchronous errors
-* Asynchronous errors
-
-Synchronous errors happen only during a key manager operation.
-Asynchronous errors can happen at any time.
-
-Given the above, we have 4 total categories of errors:
-* Synchronous recoverable errors
-* Asynchronous recoverable errors
-* Synchronous fatal errors
-* Asynchronous fatal errors
-
-All recoverable errors (synchronous and asynchronous) are captured in [`ERR_CODE`](data/keymgr.hjson#err_code).
-All fatal errors (synchronous and asynchronous) are captured in [`FAULT_STATUS`](data/keymgr.hjson#fault_status).
-
-Recoverable errors cause a recoverable alert to be sent from the key manager.
-Fatal errors cause a fatal alert to be sent from the key manager.
-
-Below, the behavior of each category and its constituent errors are described in detail.
-
-### Synchronous Recoverable Errors
-
-These errors can only happen when a key manager operation is invoked and are typically associated with incorrect software programming.
-At the end of the operation, key manager reports whether there was an error in [`ERR_CODE`](data/keymgr.hjson#err_code) and sends a recoverable alert.
-
-* [`ERR_CODE.INVALID_OP`](data/keymgr.hjson#err_code) Software issued an invalid operation given the current key manager state.
-* [`ERR_CODE.INVALID_KMAC_INPUT`](data/keymgr.hjson#err_code) Software supplied invalid input (for example a key greater than the max version) for a key manager operation.
-
-### Asynchronous Recoverable Errors
-
-These errors can happen at any time regardless of whether there is a key manager operation.
-The error is reported in [`ERR_CODE`](data/keymgr.hjson#err_code) and the key manager sends a recoverable alert.
-
-* [`ERR_CODE.INVALID_SHADOW_UPDATE`](data/keymgr.hjson#err_code) Software performed an invalid sequence while trying to update a key manager shadow register.
-
-### Synchronous Fatal Errors
-
-These errors can only happen when a key manager operation is invoked and receives malformed operation results that are not logically possible.
-At the end of the operation, key manager reports whether there was an error in [`FAULT_STATUS`](data/keymgr.hjson#fault_status) and continuously sends fatal alerts .
-
-Note, these errors are synchronous from the perspective of the key manager, but they may be asynchronous from the perspective of another module.
-
-### Asynchronous Fatal Errors
-
-These errors can happen at any time regardless of whether there is a key manager operation.
-The error is reported in [`FAULT_STATUS`](data/keymgr.hjson#fault_status) and the key manager continuously sends fatal alerts.
-
-
-### Faults and Operational Faults
-
-When a fatal error is encountered, the key manager transitions to the `Invalid` [state](#invalid-entry-wiping).
-The following are a few examples of when the error occurs and how the key manager behaves.
-
-#### Example 1: Fault During Operation
-The key manager is running a generate operation and a non-onehot command was observed by the KMAC interface.
-Since the non-onehot condition is a fault, it is reflected in [`FAULT_STATUS`](data/keymgr.hjson#fault_status) and a fatal alert is generated.
-The key manager transitions to `Invalid` state, wipes internal storage and reports an invalid operation in [`ERR_CODE.INVALID_OP`](data/keymgr.hjson#err_code).
-
-#### Example 2: Fault During Idle
-The key manager is NOT running an operation and is idle.
-During this time, a fault is observed on the regfile (shadow storage error) and FSM (control FSM integrity error).
-The faults are reflected in [`FAULT_STATUS`](data/keymgr.hjson#fault_status).
-The key manager transitions to `Invalid` state, wipes internal storage but does not report an invalid operation.
-
-#### Example 3: Operation after Fault Detection
-Continuing from the example above, the key manager now begins an operation.
-Since the key manager is already in `Invalid` state, it does not wipe internal storage and reports an invalid operation in [`ERR_CODE.INVALID_OP`](data/keymgr.hjson#err_code).
-
-#### Additional Details on Invalid Input
-
-What is considered invalid input changes based on current state and operation.
-
-When an advance operation is invoked:
-- The internal key is checked for all 0's and all 1's.
-- During `Initialized` state, creator seed, device ID and health state data is checked for all 0's and all 1's.
-- During `CreatorRootKey` state, the owner seed is checked for all 0's and all 1's.
-- During all other states, nothing is explicitly checked.
-
-When a generate output key operation is invoked:
-- The internal key is checked for all 0's and all 1's.
-- The key version is less than or equal to the max key version.
-
-When a generate output identity is invoked:
-- The internal key is checked for all 0's and all 1's.
-
-#### Invalid Operation
-
-The table below enumerates the legal operations in a given state.
-When an illegal operation is supplied, the error code is updated and the operation is flagged as `done with error`.
-
-| Current State    | Legal Operations               |
-| -------------    | ------------------------------ |
-| Reset            | Advance                        |
-| Initialized      | Disable / Advance              |
-| CreatorRootKey   | Disable / Advance / Generate   |
-| OwnerIntKey      | Disable / Advance / Generate   |
-| OwnerRootKey     | Disable / Advance / Generate   |
-| Invalid/Disabled | None                           |
-
-*  All operations invoked during `Invalid` and `Disabled` states lead to invalid operation error.
-
-### Error Response
-In addition to alerts and interrupts, key manager may also update the internal key and relevant outputs based on current state.
-See the tables below for an enumeration.
-
-| Current State    | Invalid States  | Invalid Output | Invalid Input | Invalid Operation   |
-| -------------    | ----------------| ---------------|---------------|---------------------|
-| Reset            | Not Possible    | Not Possible   | Not possible  | Not updated         |
-| Initialized      | Updated         | Updated        | Not updated   | Not updated         |
-| CreatorRootKey   | Updated         | Updated        | Not updated   | Not possible        |
-| OwnerIntKey      | Updated         | Updated        | Not updated   | Not possible        |
-| OwnerRootKey     | Updated         | Updated        | Not updated   | Not possible        |
-| Invalid/Disabled | Updated         | Updated        | Updated       | Updated             |
-
-*  During `Reset` state, the KMAC module is never invoked, thus certain errors are not possible.
-*  During `Initialized`, `CreatorRootKey`, `OwnerIntermediateKey` and `OwnerRootKey` states, a fault error causes the relevant key / outputs to be updated; however an operational error does not.
-*  During `Invalid` and `Disabled` states, the relevant key / outputs are updated regardless of the error.
-*  Only the relevant collateral is updated -> ie, advance / disable command leads to working key update, and generate command leads to software or sideload key update.
-*  During `Disabled` state, if life cycle deactivation or an operational fault is encountered, the key manager transitions to `Invalid` state, see [here](#invalid-and-disabled-state)
-
-## DICE Support
-
-The key manager supports [DICE open profile](https://pigweed.googlesource.com/open-dice/+/HEAD/docs/specification.md#Open-Profile-for-DICE).
-Specifically, the open profile has two compound device identifiers.
-* Attestation CDI
-* Sealing CDI
-
-The attestation CDI is used to attest hardware and software configuration and is thus expected to change between updates.
-The sealing CDI on the other hand, is used to attest the authority of the hardware and software configuration.
-The sealing version is thus expected to remain stable across software updates.
-
-To support these features, the key manager maintains two versions of the working state and associated internal key.
-There is one version for attestation and one version for sealing.
-
-The main difference between the two CDIs is the different usage of `SW_BINDING`.
-For the Sealing CDI, the [`"SEALING_SW_BINDING"`](data/keymgr.hjson#sealing_sw_binding) is used, all other inputs are the same.
-For the Attestation CDI, the [`"ATTEST_SW_BINDING"`](data/keymgr.hjson#attest_sw_binding) is used, all other inputs are the same.
-
-When invoking an advance operation, both versions are advanced, one after the other.
-There are thus two KMAC transactions.
-The first transaction uses the Sealing CDI internal key, [`"SEALING_SW_BINDING"`](data/keymgr.hjson#sealing_sw_binding) and other common inputs.
-The second transaction uses the Attestation CDI internal key, [`"ATTEST_SW_BINDING"`](data/keymgr.hjson#attest_sw_binding) and other common inputs.
-
-When invoking a generate operation, the software must specify which CDI to use as the source key.
-This is done through [`"CONTROL.CDI_SEL"`](data/keymgr.hjson#control).
-Unlike the advance operation, there is only 1 KMAC transaction since we pick a specific CDI to operate.
-
-When disabling, both versions are disabled together.
-
-
-## Block Diagram
-The following is a high level block diagram of the key manager.
-
-![Key Manager Block Diagram](./doc/keymgr_block_diagram.svg)
-
-## Design Details
-
-Key manager is primarily composed of two components:
-*  keymgr_ctrl
-*  keymgr_kmac_if
-
-### Key Manager Control
-
-The key manager control block manages the working state, sideload key updates, as well as what commands are valid in each state.
-It also handles the life cycle `keymgr_en` input, which deactivates the entire key manager function in the event of an escalation.
-
-![Key Manager Control Block Diagram](./doc/keymgr_control_diagram.svg)
-
-
-### KMAC Interface Control
-
-The KMAC interface control represents the bulk of key manager logic.
-Based on input from key manager control, this module selects the inputs for each given command and sequences the data to KMAC.
-
-![Key Manager KMAC Interface Block Diagram](./doc/keymgr_kmac_if_diagram.svg)
-
-The KMAC interface works on a simple `valid / ready` protocol.
-When there is data to send, the KMAC interface sends out a `valid` and keeps it active.
-When the destination accepts the transaction, the `ready` is asserted.
-Note just like with any bus interface, the `ready` may already be asserted when `valid` asserts, or it may assert some time later, there are no restrictions.
-Since the data to be sent is always pre-buffered in key manager, the valid, once asserted, does not de-assert until the entire transaction is complete.
-
-The data interface itself is 64b wide.
-However, there may not always be 64b multiple aligned data to be sent.
-In these situations, the last transfer beat sent to KMAC has a byte mask / strobe attached.
-The byte mask indicates on the last beat which bytes are actually valid, and which are not.
-Not beats prior to the last always have fully asserted byte masks.
-
-Once KMAC receives all the required data and the last indication, it begins processing the data into a digest.
-This process may take an arbitrary number of cycles.
-When this process is complete, a `done` indication pulse is sent back with the digest.
-Note, the acceptance of `done` has no back-pressure and `keymgr` must accept it within one cycle.
-
-See diagram below for an example transfer:
-
-```wavejson
-{signal: [
-  {name: 'kmac_data_o.valid',     wave: '01...........|....0..'},
-  {name: 'kmac_data_i.ready',     wave: '1...0..101...|.......'},
-  {name: 'kmac_data_o.data',      wave: 'x2222...2.222|2222x..'},
-  {name: 'kmac_data_o.last',      wave: '0................10..'},
-  {name: 'kmac_data_o.strb',      wave: 'x2...............2x..'},
-  {name: 'kmac_data_i.done',      wave: '0..................10'},
-  {name: 'kmac_data_i.digest*',   wave: 'x..................3x'},
-  ],
-}
-```
-
-### Sideload Keys
-
-There are three sideload keys.
-One for AES, one for KMAC and one for OTBN.
-When a sideload key is generated successfully through the `generate-output-hw` command, the derived data is loaded into key storage registers.
-There is a set of storage registers for each destination.
-
-The KMAC key however is further overloaded as it is the main derivation mechanism for key manager internal stage.
-The KMAC key thus has two possible outputs, one is the sideload key, and the other is internal state key.
-
-When a valid operation is called, the internal state key is sent over the KMAC key.
-During all other times, the sideloaded value is presented.
-Note, there may not be a valid key in the sideload register if it has been cleared or never generated.
-The sideload key can be overwritten with another generate command, or cleared with entropy through [`SIDELOAD_CLEAR`](data/keymgr.hjson#sideload_clear).
-
-The clearing can be done one slot at a time, or all at once.
-Once a clearing bit is enabled for a particular key slot, its value is continuously re-randomized every clock cycle.
-Therefore, SW is responsible for toggling this bit back to disabled state, which makes the last random value remain stable on the sideload slot.
-Otherwise, the sideload key slot is continuously randomized which prevents sideloading an actual key to the target HWIP.
-
-The following diagram illustrates an example when there is no valid key in the KMAC sideload registers and an operation is called.
-During the duration of the operation, the key is valid and shows the internal key state.
-Once the operation is complete, it falls back to the sideload key state, which is invalid in this case.
-
-```wavejson
-{signal: [
-  {name: 'u_sideload_ctrl.u_kmac_key.key_o.valid',     wave: '0................'},
-  {name: 'u_sideload_ctrl.u_kmac_key.key_o.key_share', wave: 'x................'},
-  {name: 'u_ctrl.key_o.valid',                         wave: '0................'},
-  {name: 'u_ctrl.key_o.key_share',                     wave: 'x................'},
-  {name: 'u_ctrl.op_start_i',                          wave: '0....1.....0.....'},
-  {name: 'kmac_key_o.valid',                           wave: '0....1.....0.....'},
-  {name: 'kmac_key_o.key_share*',                      wave: 'x....3.....x.....'},
-  ],
-}
-```
-
-The following diagram illustrates an example when there is a valid key in the KMAC sideload registers and an operation is called.
-During the duration of the operation, the key is valid and shows the internal key state.
-Once the operation is complete, it falls back to the sideload key state, which is valid and contains a different value.
-
-```wavejson
-{signal: [
-  {name: 'u_sideload_ctrl.u_kmac_key.key_o.valid',     wave: '01...............'},
-  {name: 'u_sideload_ctrl.u_kmac_key.key_o.key_share', wave: 'x4...............'},
-  {name: 'u_ctrl.key_o.valid',                         wave: '0....1.....0.....'},
-  {name: 'u_ctrl.key_o.key_share',                     wave: 'x................'},
-  {name: 'u_ctrl.op_start_i',                          wave: '0....1.....0.....'},
-  {name: 'kmac_key_o.valid',                           wave: '01...............'},
-  {name: 'kmac_key_o.key_share*',                      wave: 'x4...3.....4.....'},
-  ],
-}
-```
-
-
-### Software Binding
-
-The identities flow employs an idea called [software binding](https://docs.opentitan.org/doc/security/specs/identities_and_root_keys/#software-binding) to ensure that a particular key derivation scheme is only reproducible for a given software configuration.
-The binding is created through the secure boot flow, where each stage sets the binding used for the next verified stage before advancing to it.
-The software binding is used during the following state transitions only:
--  `Initialized` to `CreatorRootKey`
--  `CreatorRootKey` to `OwnerIntermedaiteKey`
--  `OwnerIntermediateKey` to `OwnerRootKey`
-
-In order to save on storage and not have a duplicate copy per stage, the software binding registers [`SOFTWARE_BINDING`](data/keymgr.hjson#software_binding) are shared between key manager stages.
-
-Software sets the appropriate values and locks it by clearing [`SOFT_BINDING_EN`](data/keymgr.hjson#soft_binding_en).
-When later a successful `advance` call is made, the key manager then unlocks by setting [`SOFT_BINDING_EN`](data/keymgr.hjson#soft_binding_en) to 1.
-An unsuccessful advance call (errors) does not unlock the binding.
-This allows the next stage of software to re-use the binding registers.
-
-### Custom Security Checks
-
-The keymgr has several custom security checks.
-
-#### One-Hot Command Check
-The command received by the KMAC interface must always be in one-hot form and unchanging during the life time of a KMAC transaction.
-If this check fails, an error is reflected in [`FAULT_STATUS.CMD`](data/keymgr.hjson#fault_status).
-
-#### Unexpected KMAC Done
-The `kmac_done` signal can only happen during the expected transaction window.
-If this check fails, an error is reflected in [`FAULT_STATUS.KMAC_DONE`](data/keymgr.hjson#fault_status).
-
-#### Control State Machine Check
-This error checks for two things:
--  The key manager can advance to one of the key states (e.g. RootKey, OwnerIntermediateKey) only when there is a legal advanced operation.
--  The key manager can issue an advance or generate operation to the KMAC interface only if the original software request is an advanced or generate command.
-
-If these checks fail, an error is reflected in [`FAULT_STATUS.CTRL_FSM_CHK`](data/keymgr.hjson#fault_status).
-
-#### Sideload Select Check
-A sideload key slot is selected for update only if the original software request targeted that key slot.
-
-If this check fails, an error is reflected in [`FAULT_STATUS.SIDE_CTRL_SEL`](data/keymgr.hjson#fault_status).
-
-####
-
-####
-
-## Hardware Interfaces
-* [Interface Tables](data/keymgr.hjson#interfaces)
-
-# Programmers Guide
-
-## Initialize
-
-## Advance or Generate
-Software selects a command and triggers a "start".
-If the command is valid and successful, key manager indicates done and no errors.
-If the command is invalid or unsuccessful, key manager indicates done with error.
-Regardless of the validity of the command, the hardware sequences are triggered to avoid leaking timing information.
-
-The software is able to read the current state of key manager, however it never has access to the associated internal key.
-
-When issuing the `generate-output-hw` command, software must select a destination primitive (AES, KMAC or OTBN).
-At the conclusion of the command, key and valid signals are forwarded by the key manager to the selected destination primitive.
-The key and valid signals remain asserted to the selected destination until software explicitly disables the output via another command, or issues another `generate-output-hw` command with a different destination primitive.
-
-## Caveats
-The keymgr [`WORKING_STATE`](data/keymgr.hjson#working_state) register allows software to discover the current state of `keymgr`.
-However, since these values are not hardened, they can be attacked.
-As such, software should be careful to not make critical system decisions based on these registers.
-They are meant generally for informational or debug purposes.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_keymgr.h)
-
-## Register Table
-
-* [Register Table](data/keymgr.hjson#registers)
diff --git a/hw/ip/keymgr/doc/programmers_guide.md b/hw/ip/keymgr/doc/programmers_guide.md
new file mode 100644
index 0000000000000..502baa401d35b
--- /dev/null
+++ b/hw/ip/keymgr/doc/programmers_guide.md
@@ -0,0 +1,29 @@
+# Programmer's Guide
+
+## Initialize
+
+## Advance or Generate
+Software selects a command and triggers a "start".
+If the command is valid and successful, key manager indicates done and no errors.
+If the command is invalid or unsuccessful, key manager indicates done with error.
+Regardless of the validity of the command, the hardware sequences are triggered to avoid leaking timing information.
+
+The software is able to read the current state of key manager, however it never has access to the associated internal key.
+
+When issuing the `generate-output-hw` command, software must select a destination primitive (AES, KMAC or OTBN).
+At the conclusion of the command, key and valid signals are forwarded by the key manager to the selected destination primitive.
+The key and valid signals remain asserted to the selected destination until software explicitly disables the output via another command, or issues another `generate-output-hw` command with a different destination primitive.
+
+## Caveats
+The keymgr [`WORKING_STATE`](../data/keymgr.hjson#working_state) register allows software to discover the current state of `keymgr`.
+However, since these values are not hardened, they can be attacked.
+As such, software should be careful to not make critical system decisions based on these registers.
+They are meant generally for informational or debug purposes.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_keymgr.h)
+
+## Register Table
+
+* [Register Table](../data/keymgr.hjson#registers)
diff --git a/hw/ip/keymgr/doc/theory_of_operation.md b/hw/ip/keymgr/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..46d3d80a1e2b8
--- /dev/null
+++ b/hw/ip/keymgr/doc/theory_of_operation.md
@@ -0,0 +1,496 @@
+# Theory of Operation
+
+Key manager behavior can be summarized by the functional model below.
+
+![Key Manager Functional Model](../doc/keymgr_functional_model.svg)
+
+In the diagram, the red boxes represent the working state and the associated internal key, the black ovals represent derivation functions, the green squares represent software inputs, and the remaining green / purple shapes represent outputs to both software and hardware.
+
+In OpenTitan, the derivation method selected is [KMAC](../../kmac/README.md).
+Each valid operation involves a KMAC invocation using the key manager internal key and other HW / SW supplied inputs as data.
+While KMAC can generate outputs of arbitrary length, this design fixes the size to 256b.
+
+Effectively, the key manager behavior is divided into 3 classes of functions
+*  Key manager state advancement
+   *  The results are never visible to software and not directly usable by any software controlled hardware
+
+*  Output key generation
+   *  Results can be visible to software or consumed by hardware (sideload)
+
+*  Identity / seed generation
+   *  Results are always visible to software and used for asymmetric cryptography
+
+In general, the key generation and seed generation functions are identical.
+They differ only in how software chooses to deploy the outputs.
+
+For clarity, all commands issued to the key manager by software are referred to as operations.
+Transactions refer to the interaction between key manager and KMAC if a valid operation is issued.
+
+## Key Manager State
+
+The key manager working state (red boxes in the functional model) represents both the current state of the key manager as well as its related internal key.
+Each valid state (`Initialized` / `CreatorRootKey` / `OwnerIntermediateKey` / `OwnerRootKey`), supplies its secret material as the "key" input to a KMAC operation.
+Invalid states, such as `Reset / Disabled` on the other hand, either do not honor operation requests, or supply random data when invoked.
+
+The data input is dependent on each state, see below.
+
+### Reset
+To begin operation, the state must first transition to `Initialized`.
+The advancement from `Reset` to `Initialized` is irreversible during the current power cycle.
+Until the initialize command is invoked, the key manager rejects all other software commands.
+
+### Initialized
+
+When transitioning from `Reset` to `Initialized`, random values obtained from the entropy source are used to populate the internal key first.
+Then the root key stored in OTP, if valid, is loaded into the internal key.
+This ensures that the hamming delta from the previous value to the next value is non-deterministic.
+The advancement from `Initialized` to `CreatorRootKey` is irreversible during the current power cycle.
+
+### CreatorRootKey
+
+`CreatorRootKey` is the first operational state of the key manager.
+When transitioning from `Initialized` to this state, a KMAC operation is invoked using the `RootKey` as the key (from OTP), and the remaining inputs as data.
+The output of the KMAC operation replaces the previous value of the internal key, and the new value becomes the `CreatorRootKey`.
+
+Inputs to the derivation function are:
+*  `DiversificationKey`: Secret seed from flash
+*  `HealthMeasurement`: Current life cycle state
+   *  To avoid a state value corresponding to each life cycle state, the raw life cycle value is not used.
+   *  Instead, certain life cycle states diversify the same way.
+   *  Please see the life cycle controller for more details.
+*  `DeviceIdentifier`: Unique device identification.
+*  `HardwareRevisionSecret`: A global design time constant.
+
+Other than the `DiversificationKey` and `HardwareRevisionSecret`, none of the values above are considered secret.
+
+Once the `CreatorRootKey` is reached, software can request key manager to advance state, generate output key or generate output identity.
+The key used for all 3 functions is the `CreatorRootKey`.
+
+The advancement from `CreatorRootKey` to the `OwnerIntermediateKey` is irreversible during the current power cycle.
+
+Keymgr reads the root key from OTP in a single clock cycle. It assumes that when keymgr's internal FSM reaches this clock cycle, the OTP root key is already available (`valid` is set to 1). Otherwise, keymgr skips loading the root key.
+
+### OwnerIntermediateKey
+
+This is the second operational state of the key manager.
+This state is reached through another invocation of the KMAC operation using the previous internal key, and other inputs as data.
+The output of the KMAC operation replaces the previous value of the internal key, and the new value becomes the `OwnerIntermediateKey`.
+
+The relevant data inputs are:
+*  `OwnerRootSecret`: Secret seed from flash.
+*  `SoftwareBinding`: A software programmed value representing the first owner code to be run.
+
+Once the `OwnerIntermediateKey` is created, software can request key manager to advance state, generate output key or generate output identity.
+The key used for all 3 functions is the `OwnerIntermediateKey`.
+
+The advancement from `OwnerIntermediateKey` to the `OwnerRootKey` is irreversible during the current power cycle.
+
+### OwnerRootKey
+
+This is the last operational state of the key manager.
+This state is reached through another invocation of the KMAC operation using the previous internal key, and other inputs as data.
+The output of the KMAC operation replaces the previous value of the internal key, and the new value becomes the `OwnerRootKey`.
+
+The relevant inputs are:
+*   `SoftwareBinding` - A software programmed value representing the owner kernel code.
+
+Once the `OwnerRootKey` is created, software can request key manager to advance state, generate output key or generate output identity.
+An advance command invoked from `OwnerRootKey` state simply moves the state to `Disabled`.
+
+The generate output and generate identity functions use `OwnerRootKey` as the KMAC key.
+The advancement from `OwnerRootKey` to the `Disabled` is irreversible during the current power cycle.
+
+### Disabled
+`Disabled` is a state where the key manager is no longer operational.
+Upon `Disabled` entry, the internal key is updated with KMAC computed random values; however, previously generated sideload key slots and software key slots are preserved.
+This allows the software to keep the last valid keys while preventing the system from further advancing the valid key.
+
+When advance and generate calls are invoked from this state, the outputs and keys are indiscriminately updated with randomly computed values.
+Key manager enters disabled state based on direct invocation by software:
+* Advance from `OwnerRootKey`
+* Disable operation
+
+### Invalid
+`Invalid` state is entered whenever key manager is deactivated through the [life cycle connection](#life-cycle-connection) or when an operation encounters a [fault](#faults-and-operational-faults).
+Upon `Invalid` entry, the internal key, the sideload key slots and the software keys are all wiped with entropy directly.
+
+#### Invalid Entry Wiping
+Since the life cycle controller can deactivate the key manager at any time, the key manager attempts to gracefully handle the wiping process.
+When deactivated, the key manager immediately begins wiping all keys (internal key, hardware sideload key, software key) with entropy.
+However, if an operation was already ongoing, the key manager waits for the operation to complete gracefully before transitioning to invalid state.
+
+While waiting for the operation to complete, the key manager continuously wipes all keys with entropy.
+
+### Invalid and Disabled State
+
+`Invalid` and `Disabled` states are functionally very similar.
+The main difference between the two is "how" the states were reached and the entry behavior.
+
+`Disabled` state is reached through intentional software commands where the sideload key slots and software key are not wiped, while `Invalid` state is reached through life cycle deactivation or operational faults where the internal key, sideload key slots and software key are wiped.
+
+This also means that only `Invalid` is a terminal state.
+If after entering `Disabled` life cycle is deactivated or a fault is encountered, the same [invalid entry procedure](#invalid) is followed to bring the system to a terminal `Invalid` state.
+
+If ever multiple conditions collide (a fault is detected at the same time software issues disable command), the `Invalid` entry path always takes precedence.
+
+## Life Cycle Connection
+The function of the key manager is directly managed by the [life cycle controller](../../lc_ctrl/README.md#key-manager-en).
+
+Until the life cycle controller activates the key manager, the key manager does not accept any software commands.
+Once the key manager is activated by the life cycle controller, it is then allowed to transition to the various states previously [described](#key-manager-state).
+
+When the life cycle controller deactivates the key manager, the key manager transitions to the `Invalid` state.
+
+## Commands in Each State
+During each state, there are 3 valid commands software can issue:
+*  Advance state
+*  Output generation
+*  Identity generation
+
+The software is able to select a command and trigger the key manager FSM to process one of the commands.
+If a command is valid during the current working state, it is processed and acknowledged when complete.
+
+If a command is invalid, the behavior depends on the current state.
+If the current state is `Reset`, the invalid command is immediately rejected as the key manager FSM has not yet been initialized.
+If the current state is any other state, the key manager sequences random, dummy data to the KMAC module, but does not update internal key, sideload key slots or software keys.
+For each valid command, a set of inputs are selected and sequenced to the KMAC module.
+
+During `Disabled` and `Invalid` states, the internal key, sideload key slots and software key are updated based on the input commands as with normal states.
+There are however a few differences:
+-  The updates are made regardless of any error status to ensure their values are further scrambled.
+-  Instead of normal input data, random data is selected for KMAC processing.
+-  All operations return an invalid operations error, in addition to any other error that might naturally occur.
+
+## Generating Output Key
+The generate output command is composed of 2 options
+*  Generate output key for software, referred to as `generate-output-sw`
+*  Generate output key for hardware, referred to as `generate-output-hw`
+
+The hardware option is meant specifically for symmetric sideload use cases.
+When this option is issued, the output of the KMAC invocation is not stored in software visible registers, but instead in hardware registers that directly output to symmetric primitives such as AES, KMAC and OTBN.
+
+## KMAC Operations
+All invoked KMAC operations expect the key in two shares.
+This means the internal key, even though functionally 256b, is maintained as 512b.
+The KMAC processed outputs are also in 2-shares.
+For `generate-output-sw` commands, software is responsible for determining whether the key manager output should be preserved in shares or combined.
+
+## Errors, Faults and Alerts
+
+The key manager has two overall categories of errors:
+* Recoverable errors
+* Fatal errors
+
+Recoverable errors are those likely to have been introduced by software and not fatal to the key manager or the system.
+Fatal errors are logically impossible errors that have a high likelihood of being a fault and thus fatal.
+
+Each category of error can be further divided into two:
+* Synchronous errors
+* Asynchronous errors
+
+Synchronous errors happen only during a key manager operation.
+Asynchronous errors can happen at any time.
+
+Given the above, we have 4 total categories of errors:
+* Synchronous recoverable errors
+* Asynchronous recoverable errors
+* Synchronous fatal errors
+* Asynchronous fatal errors
+
+All recoverable errors (synchronous and asynchronous) are captured in [`ERR_CODE`](../data/keymgr.hjson#err_code).
+All fatal errors (synchronous and asynchronous) are captured in [`FAULT_STATUS`](../data/keymgr.hjson#fault_status).
+
+Recoverable errors cause a recoverable alert to be sent from the key manager.
+Fatal errors cause a fatal alert to be sent from the key manager.
+
+Below, the behavior of each category and its constituent errors are described in detail.
+
+### Synchronous Recoverable Errors
+
+These errors can only happen when a key manager operation is invoked and are typically associated with incorrect software programming.
+At the end of the operation, key manager reports whether there was an error in [`ERR_CODE`](../data/keymgr.hjson#err_code) and sends a recoverable alert.
+
+* [`ERR_CODE.INVALID_OP`](../data/keymgr.hjson#err_code) Software issued an invalid operation given the current key manager state.
+* [`ERR_CODE.INVALID_KMAC_INPUT`](../data/keymgr.hjson#err_code) Software supplied invalid input (for example a key greater than the max version) for a key manager operation.
+
+### Asynchronous Recoverable Errors
+
+These errors can happen at any time regardless of whether there is a key manager operation.
+The error is reported in [`ERR_CODE`](../data/keymgr.hjson#err_code) and the key manager sends a recoverable alert.
+
+* [`ERR_CODE.INVALID_SHADOW_UPDATE`](../data/keymgr.hjson#err_code) Software performed an invalid sequence while trying to update a key manager shadow register.
+
+### Synchronous Fatal Errors
+
+These errors can only happen when a key manager operation is invoked and receives malformed operation results that are not logically possible.
+At the end of the operation, key manager reports whether there was an error in [`FAULT_STATUS`](../data/keymgr.hjson#fault_status) and continuously sends fatal alerts.
+
+Note, these errors are synchronous from the perspective of the key manager, but they may be asynchronous from the perspective of another module.
+
+### Asynchronous Fatal Errors
+
+These errors can happen at any time regardless of whether there is a key manager operation.
+The error is reported in [`FAULT_STATUS`](../data/keymgr.hjson#fault_status) and the key manager continuously sends fatal alerts.
+
+
+### Faults and Operational Faults
+
+When a fatal error is encountered, the key manager transitions to the `Invalid` [state](#invalid-entry-wiping).
+The following are a few examples of when the error occurs and how the key manager behaves.
+
+#### Example 1: Fault During Operation
+The key manager is running a generate operation and a non-onehot command was observed by the KMAC interface.
+Since the non-onehot condition is a fault, it is reflected in [`FAULT_STATUS`](../data/keymgr.hjson#fault_status) and a fatal alert is generated.
+The key manager transitions to `Invalid` state, wipes internal storage and reports an invalid operation in [`ERR_CODE.INVALID_OP`](../data/keymgr.hjson#err_code).
+
+#### Example 2: Fault During Idle
+The key manager is NOT running an operation and is idle.
+During this time, a fault is observed on the regfile (shadow storage error) and FSM (control FSM integrity error).
+The faults are reflected in [`FAULT_STATUS`](../data/keymgr.hjson#fault_status).
+The key manager transitions to `Invalid` state, wipes internal storage but does not report an invalid operation.
+
+#### Example 3: Operation after Fault Detection
+Continuing from the example above, the key manager now begins an operation.
+Since the key manager is already in `Invalid` state, it does not wipe internal storage and reports an invalid operation in [`ERR_CODE.INVALID_OP`](../data/keymgr.hjson#err_code).
+
+#### Additional Details on Invalid Input
+
+What is considered invalid input changes based on current state and operation.
+
+When an advance operation is invoked:
+- The internal key is checked for all 0's and all 1's.
+- During `Initialized` state, creator seed, device ID and health state data is checked for all 0's and all 1's.
+- During `CreatorRootKey` state, the owner seed is checked for all 0's and all 1's.
+- During all other states, nothing is explicitly checked.
+
+When a generate output key operation is invoked:
+- The internal key is checked for all 0's and all 1's.
+- The key version is less than or equal to the max key version.
+
+When a generate output identity is invoked:
+- The internal key is checked for all 0's and all 1's.
+
+#### Invalid Operation
+
+The table below enumerates the legal operations in a given state.
+When an illegal operation is supplied, the error code is updated and the operation is flagged as `done with error`.
+
+| Current State    | Legal Operations               |
+| -------------    | ------------------------------ |
+| Reset            | Advance                        |
+| Initialized      | Disable / Advance              |
+| CreatorRootKey   | Disable / Advance / Generate   |
+| OwnerIntKey      | Disable / Advance / Generate   |
+| OwnerRootKey     | Disable / Advance / Generate   |
+| Invalid/Disabled | None                           |
+
+*  All operations invoked during `Invalid` and `Disabled` states lead to invalid operation error.
+
+### Error Response
+In addition to alerts and interrupts, key manager may also update the internal key and relevant outputs based on current state.
+See the tables below for an enumeration.
+
+| Current State    | Invalid States  | Invalid Output | Invalid Input | Invalid Operation   |
+| -------------    | ----------------| ---------------|---------------|---------------------|
+| Reset            | Not Possible    | Not Possible   | Not possible  | Not updated         |
+| Initialized      | Updated         | Updated        | Not updated   | Not updated         |
+| CreatorRootKey   | Updated         | Updated        | Not updated   | Not possible        |
+| OwnerIntKey      | Updated         | Updated        | Not updated   | Not possible        |
+| OwnerRootKey     | Updated         | Updated        | Not updated   | Not possible        |
+| Invalid/Disabled | Updated         | Updated        | Updated       | Updated             |
+
+*  During `Reset` state, the KMAC module is never invoked, thus certain errors are not possible.
+*  During `Initialized`, `CreatorRootKey`, `OwnerIntermediateKey` and `OwnerRootKey` states, a fault error causes the relevant key / outputs to be updated; however an operational error does not.
+*  During `Invalid` and `Disabled` states, the relevant key / outputs are updated regardless of the error.
+*  Only the relevant collateral is updated -> ie, advance / disable command leads to working key update, and generate command leads to software or sideload key update.
+*  During `Disabled` state, if life cycle deactivation or an operational fault is encountered, the key manager transitions to `Invalid` state, see [here](#invalid-and-disabled-state).
+
+## DICE Support
+
+The key manager supports [DICE open profile](https://pigweed.googlesource.com/open-dice/+/HEAD/docs/specification.md#Open-Profile-for-DICE).
+Specifically, the open profile has two compound device identifiers.
+* Attestation CDI
+* Sealing CDI
+
+The attestation CDI is used to attest hardware and software configuration and is thus expected to change between updates.
+The sealing CDI on the other hand, is used to attest the authority of the hardware and software configuration.
+The sealing version is thus expected to remain stable across software updates.
+
+To support these features, the key manager maintains two versions of the working state and associated internal key.
+There is one version for attestation and one version for sealing.
+
+The main difference between the two CDIs is the different usage of `SW_BINDING`.
+For the Sealing CDI, the [`"SEALING_SW_BINDING"`](../data/keymgr.hjson#sealing_sw_binding) is used, all other inputs are the same.
+For the Attestation CDI, the [`"ATTEST_SW_BINDING"`](../data/keymgr.hjson#attest_sw_binding) is used, all other inputs are the same.
+
+When invoking an advance operation, both versions are advanced, one after the other.
+There are thus two KMAC transactions.
+The first transaction uses the Sealing CDI internal key, [`"SEALING_SW_BINDING"`](../data/keymgr.hjson#sealing_sw_binding) and other common inputs.
+The second transaction uses the Attestation CDI internal key, [`"ATTEST_SW_BINDING"`](../data/keymgr.hjson#attest_sw_binding) and other common inputs.
+
+When invoking a generate operation, the software must specify which CDI to use as the source key.
+This is done through [`"CONTROL.CDI_SEL"`](../data/keymgr.hjson#control).
+Unlike the advance operation, there is only 1 KMAC transaction since we pick a specific CDI to operate.
+
+When disabling, both versions are disabled together.
+
+
+## Block Diagram
+The following is a high level block diagram of the key manager.
+
+![Key Manager Block Diagram](../doc/keymgr_block_diagram.svg)
+
+## Design Details
+
+Key manager is primarily composed of two components:
+*  keymgr_ctrl
+*  keymgr_kmac_if
+
+### Key Manager Control
+
+The key manager control block manages the working state, sideload key updates, as well as what commands are valid in each state.
+It also handles the life cycle `keymgr_en` input, which deactivates the entire key manager function in the event of an escalation.
+
+![Key Manager Control Block Diagram](../doc/keymgr_control_diagram.svg)
+
+
+### KMAC Interface Control
+
+The KMAC interface control represents the bulk of key manager logic.
+Based on input from key manager control, this module selects the inputs for each given command and sequences the data to KMAC.
+
+![Key Manager KMAC Interface Block Diagram](../doc/keymgr_kmac_if_diagram.svg)
+
+The KMAC interface works on a simple `valid / ready` protocol.
+When there is data to send, the KMAC interface sends out a `valid` and keeps it active.
+When the destination accepts the transaction, the `ready` is asserted.
+Note just like with any bus interface, the `ready` may already be asserted when `valid` asserts, or it may assert some time later, there are no restrictions.
+Since the data to be sent is always pre-buffered in key manager, the valid, once asserted, does not de-assert until the entire transaction is complete.
+
+The data interface itself is 64b wide.
+However, there may not always be 64b multiple aligned data to be sent.
+In these situations, the last transfer beat sent to KMAC has a byte mask / strobe attached.
+The byte mask indicates on the last beat which bytes are actually valid, and which are not.
+Note, beats prior to the last always have fully asserted byte masks.
+
+Once KMAC receives all the required data and the last indication, it begins processing the data into a digest.
+This process may take an arbitrary number of cycles.
+When this process is complete, a `done` indication pulse is sent back with the digest.
+Note, the acceptance of `done` has no back-pressure and `keymgr` must accept it within one cycle.
+
+See diagram below for an example transfer:
+
+```wavejson
+{signal: [
+  {name: 'kmac_data_o.valid',     wave: '01...........|....0..'},
+  {name: 'kmac_data_i.ready',     wave: '1...0..101...|.......'},
+  {name: 'kmac_data_o.data',      wave: 'x2222...2.222|2222x..'},
+  {name: 'kmac_data_o.last',      wave: '0................10..'},
+  {name: 'kmac_data_o.strb',      wave: 'x2...............2x..'},
+  {name: 'kmac_data_i.done',      wave: '0..................10'},
+  {name: 'kmac_data_i.digest*',   wave: 'x..................3x'},
+  ],
+}
+```
+
+### Sideload Keys
+
+There are three sideload keys.
+One for AES, one for KMAC and one for OTBN.
+When a sideload key is generated successfully through the `generate-output-hw` command, the derived data is loaded into key storage registers.
+There is a set of storage registers for each destination.
+
+The KMAC key however is further overloaded as it is the main derivation mechanism for key manager internal state.
+The KMAC key thus has two possible outputs, one is the sideload key, and the other is internal state key.
+
+When a valid operation is called, the internal state key is sent over the KMAC key.
+During all other times, the sideloaded value is presented.
+Note, there may not be a valid key in the sideload register if it has been cleared or never generated.
+The sideload key can be overwritten with another generate command, or cleared with entropy through [`SIDELOAD_CLEAR`](../data/keymgr.hjson#sideload_clear).
+
+The clearing can be done one slot at a time, or all at once.
+Once a clearing bit is enabled for a particular key slot, its value is continuously re-randomized every clock cycle.
+Therefore, SW is responsible for toggling this bit back to disabled state, which makes the last random value remain stable on the sideload slot.
+Otherwise, the sideload key slot is continuously randomized which prevents sideloading an actual key to the target HWIP.
+
+The following diagram illustrates an example when there is no valid key in the KMAC sideload registers and an operation is called.
+During the duration of the operation, the key is valid and shows the internal key state.
+Once the operation is complete, it falls back to the sideload key state, which is invalid in this case.
+
+```wavejson
+{signal: [
+  {name: 'u_sideload_ctrl.u_kmac_key.key_o.valid',     wave: '0................'},
+  {name: 'u_sideload_ctrl.u_kmac_key.key_o.key_share', wave: 'x................'},
+  {name: 'u_ctrl.key_o.valid',                         wave: '0................'},
+  {name: 'u_ctrl.key_o.key_share',                     wave: 'x................'},
+  {name: 'u_ctrl.op_start_i',                          wave: '0....1.....0.....'},
+  {name: 'kmac_key_o.valid',                           wave: '0....1.....0.....'},
+  {name: 'kmac_key_o.key_share*',                      wave: 'x....3.....x.....'},
+  ],
+}
+```
+
+The following diagram illustrates an example when there is a valid key in the KMAC sideload registers and an operation is called.
+During the duration of the operation, the key is valid and shows the internal key state.
+Once the operation is complete, it falls back to the sideload key state, which is valid and contains a different value.
+
+```wavejson
+{signal: [
+  {name: 'u_sideload_ctrl.u_kmac_key.key_o.valid',     wave: '01...............'},
+  {name: 'u_sideload_ctrl.u_kmac_key.key_o.key_share', wave: 'x4...............'},
+  {name: 'u_ctrl.key_o.valid',                         wave: '0....1.....0.....'},
+  {name: 'u_ctrl.key_o.key_share',                     wave: 'x................'},
+  {name: 'u_ctrl.op_start_i',                          wave: '0....1.....0.....'},
+  {name: 'kmac_key_o.valid',                           wave: '01...............'},
+  {name: 'kmac_key_o.key_share*',                      wave: 'x4...3.....4.....'},
+  ],
+}
+```
+
+
+### Software Binding
+
+The identities flow employs an idea called [software binding](https://docs.opentitan.org/doc/security/specs/identities_and_root_keys/#software-binding) to ensure that a particular key derivation scheme is only reproducible for a given software configuration.
+The binding is created through the secure boot flow, where each stage sets the binding used for the next verified stage before advancing to it.
+The software binding is used during the following state transitions only:
+-  `Initialized` to `CreatorRootKey`
+-  `CreatorRootKey` to `OwnerIntermediateKey`
+-  `OwnerIntermediateKey` to `OwnerRootKey`
+
+In order to save on storage and not have a duplicate copy per stage, the software binding registers [`SOFTWARE_BINDING`](../data/keymgr.hjson#software_binding) are shared between key manager stages.
+
+Software sets the appropriate values and locks it by clearing [`SOFT_BINDING_EN`](../data/keymgr.hjson#soft_binding_en).
+When later a successful `advance` call is made, the key manager then unlocks by setting [`SOFT_BINDING_EN`](../data/keymgr.hjson#soft_binding_en) to 1.
+An unsuccessful advance call (errors) does not unlock the binding.
+This allows the next stage of software to re-use the binding registers.
+
+### Custom Security Checks
+
+The keymgr has several custom security checks.
+
+#### One-Hot Command Check
+The command received by the KMAC interface must always be in one-hot form and unchanging during the life time of a KMAC transaction.
+If this check fails, an error is reflected in [`FAULT_STATUS.CMD`](../data/keymgr.hjson#fault_status).
+
+#### Unexpected KMAC Done
+The `kmac_done` signal can only happen during the expected transaction window.
+If this check fails, an error is reflected in [`FAULT_STATUS.KMAC_DONE`](../data/keymgr.hjson#fault_status).
+
+#### Control State Machine Check
+This error checks for two things:
+-  The key manager can advance to one of the key states (e.g. RootKey, OwnerIntermediateKey) only when there is a legal advance operation.
+-  The key manager can issue an advance or generate operation to the KMAC interface only if the original software request is an advance or generate command.
+
+If these checks fail, an error is reflected in [`FAULT_STATUS.CTRL_FSM_CHK`](../data/keymgr.hjson#fault_status).
+
+#### Sideload Select Check
+A sideload key slot is selected for update only if the original software request targeted that key slot.
+
+If this check fails, an error is reflected in [`FAULT_STATUS.SIDE_CTRL_SEL`](../data/keymgr.hjson#fault_status).
+
+####
+
+####
+
+## Hardware Interfaces
+* [Interface Tables](../data/keymgr.hjson#interfaces)
diff --git a/hw/ip/kmac/README.md b/hw/ip/kmac/README.md
index d6ab97e7e5cae..274d1e8c4cf7f 100644
--- a/hw/ip/kmac/README.md
+++ b/hw/ip/kmac/README.md
@@ -36,489 +36,3 @@ The KMAC IP supports various SHA3 hashing functions including SHA3 Extended Outp
 
 The KMAC HWIP implements a defense mechanism to deter SCA attacks.
 It is expected to protect against 1st-order SCA attacks by implementing masked storage and [Domain-Oriented Masking (DOM)][] inside the Keccak function.
-
-# Theory of Operation
-
-## Block Diagram
-
-![](./doc/kmac-block-diagram.svg)
-
-The above figure shows the KMAC/SHA3 HWIP block diagram.
-The KMAC has register interfaces for SW to configure the module, initiate the hashing process, and acquire the result digest from the STATE memory region.
-It also has an interface to the KeyMgr to get the secret key (masked).
-The IP has N x [application interfaces](#application-interface), which allows other HWIPs to request any pre-defined hashing operations.
-
-As similar with HMAC, KMAC HWIP also has a message FIFO (MSG_FIFO) whose depth was determined based on a few criteria such as the register interface width, and its latency, the latency of hashing algorithm (Keccak).
-Based on the given criteria, the MSG_FIFO depth was determined to store the incoming message while the SHA3 core is in computation.
-
-The MSG_FIFO has a packer in front.
-It packs any partial writes into the size of internal datapath (64bit) and stores in MSG_FIFO.
-It frees the software from having to align the messages.
-It also doesn't need the message length information.
-
-The fed messages go into the KMAC core regardless of KMAC enabled or not.
-The KMAC core forwards the messages to SHA3 core in case KMAC hash functionality is disabled.
-KMAC core prepends the encoded secret key as described in the SHA3 Derived Functions specification.
-It is expected that the software writes the encoded output length at the end of the message.
-For hashing operations triggered by an IP through the application interface, the encoded output length is appended inside the AppIntf module in the KMAC HWIP.
-
-The SHA3 core is the main Keccak processing module.
-It supports SHA3 hashing functions, SHAKE128, SHAKE256 extended output functions, and also cSHAKE128, cSHAKE256 functions in order to support KMAC operation.
-To support multiple hashing functions, it has the padding logic inside.
-The padding logic mainly pads the predefined bits at the end of the message and also performs `pad10*1()` function.
-If cSHAKE mode is set, the padding logic also prepends the encoded function name `N` and the customization string `S` prior to the incoming messages according to the spec requirements.
-
-Both the internal state width and the masking of the Keccak core are configurable via compile-time Verilog parameters.
-By default, 1600 bits of internal state are used and stored in two shares (1st order masking).
-The masked Keccak core takes 4 clock cycles per round if sufficient entropy is available.
-If desired, the masking can be disabled and the internal state width can be reduced to 25, 50, or 100 bits at compile time.
-
-## Hardware Interface
-
-* [Interface Tables](data/kmac.hjson#interfaces)
-
-## Design Details
-
-### Keccak Round
-
-A Keccak round implements the Keccak_f function described in the SHA3 specification.
-Keccak round logic in KMAC/SHA3 HWIP not only supports 1600 bit internal states but also all possible values {50, 100, 200, 400, 800, 1600} based on a parameter `Width`.
-If masking is disabled via compile-time Verilog parameter `EnMasking`, also 25 can be selected as state width.
-Keccak permutations in the specification allow arbitrary number of rounds.
-This module, however, supports Keccak_f which always runs `12 + 2*L` rounds, where $$ L = log_2 {( {Width \over 25} )} $$ .
-For instance, 200 bits of internal state run 18 rounds.
-KMAC/SHA3 instantiates the Keccak round module with 1600 bit.
-
-![](./doc/keccak-round.svg)
-
-Keccak round logic has two phases inside.
-Theta, Rho, Pi functions are executed at the 1st phase.
-Chi and Iota functions run at the 2nd phase.
-If the compile-time Verilog parameter `EnMasking` is not set, i.e., if masking is not enabled, the first phase and the second phase run at the same cycle.
-
-If masking is enabled, the Keccak round logic stores the intermediate state after processing the 1st phase.
-The stored values are then fed into the 2nd phase computing the Chi and Iota functions.
-The Chi function leverages first-order [Domain-Oriented Masking (DOM)][] to aggravate SCA attacks.
-
-To balance circuit area and SCA hardening, the Chi function uses 800 instead 1600 DOM multipliers but the multipliers are fully pipelined.
-The Chi and Iota functions are thus separately applied to the two halves of the state and the 2nd phase takes in total three clock cycles to complete.
-In the first clock cycle of the 2nd phase, the first stage of Chi is computed for the first lane halves of the state.
-In the second clock cycle, the new first lane halves are output and written to state register.
-At the same time, the first stage of Chi is computed for the second lane halves.
-In the third clock cycle, the new second lane halves are output and written to the state register.
-
-The 800 DOM multipliers need 800 bits of fresh entropy for remasking.
-If fresh entropy is not available, the DOM multipliers do not move forward and the 2nd phase will take more than three clock cycles.
-Processing a Keccak_f (1600 bit state) takes a total of 96 cycles (24 rounds X 4 cycles/round) including the 1st and 2nd phases.
-
-If the masking compile time option is enabled, Keccak round logic requires an additional 3200 flip flops to store the intermediate half state inside the 800 DOM multipliers.
-In addition to that Keccak round logic needs two sets of the same Theta, Rho, and Pi functions.
-As a result, the masked Keccak round logic takes more than twice as much as area than the unmasked version of it.
-
-### Padding for Keccak
-
-Padding logic supports SHA3/SHAKE/cSHAKE algorithms.
-cSHAKE needs the extra inputs for the Function-name `N` and the Customization string `S`.
-Other than that, SHA3, SHAKE, and cSHAKE share similar datapath inside the padding module except the last part added next to the end of the message.
-SHA3 adds `2'b 10`, SHAKE adds `4'b 1111`, cSHAKE adds `2'b00` then `pad10*1()` follows.
-All are little-endian values.
-
-Interface between this padding logic and the MSG_FIFO follows the conventional FIFO interface.
-So `prim_fifo_*` can talk to the padding logic directly.
-This module talks to Keccak round logic with a more memory-like interface.
-The interface has an additional address signal on top of the valid, ready, and data signals.
-
-![](./doc/sha3-padding.svg)
-
-The hashing process begins when the software issues the start command to [`CMD`](data/kmac.hjson#cmd) .
-If cSHAKE is enabled, the padding logic expands the prefix value (`N || S` above) into a block size.
-The block size is determined by the [`CFG.kstrength`](data/kmac.hjson#cfg) .
-If the value is 128, the block size will be 168 bytes.
-If it is 256, the block size will be 136 bytes.
-The expanded prefix value is transmitted to the Keccak round logic.
-After sending the block size, the padding logic triggers the Keccak round logic to run a full 24 rounds.
-
-If the mode is not cSHAKE, or cSHAKE mode and the prefix block has been processed, the padding logic accepts the incoming message bitstream and forward the data to the Keccak round logic in a block granularity.
-The padding logic controls the data flow and makes the Keccak logic to run after sending a block size.
-
-After the software writes the message bitstream, it should issue the Process command into [`CMD`](data/kmac.hjson#cmd) register.
-The padding logic, after receiving the Process command, appends proper ending bits with respect to the [`CFG.mode`](data/kmac.hjson#cfg) value.
-The logic writes 0 up to the block size to the Keccak round logic then ends with 1 at the end of the block.
-
-![](./doc/sha3-padding-fsm.svg)
-
-After the Keccak round completes the last block, the padding logic asserts an `absorbed` signal to notify the software.
-The signal generates the `kmac_done` interrupt.
-At this point, the software is able to read the digest in [`STATE`](data/kmac.hjson#state) memory region.
-If the output length is greater than the Keccak block rate in SHAKE and cSHAKE mode, the software may run the Keccak round manually by issuing Run command to [`CMD`](data/kmac.hjson#cmd) register.
-
-The software completes the operation by issuing Done command after reading the digest.
-The padding logic clears internal variables and goes back to Idle state.
-
-### Padding for KMAC
-
-![](./doc/kmac-padding.svg)
-
-KMAC core prepends and appends additional bitstream on top of Keccak padding logic in SHA3 core.
-The [NIST SP 800-185][] defines `KMAC[128,256](K, X, L, S)` as a cSHAKE function.
-See the section 4.3 in NIST SP 800-185 for details.
-If KMAC is enabled, the software should configure [`CMD.mode`](data/kmac.hjson#cmd) to cSHAKE and the first six bytes of [`PREFIX`](data/kmac.hjson#prefix) to `0x01204B4D4143` (bigendian).
-The first six bytes of [`PREFIX`](data/kmac.hjson#prefix) represents the value of `encode_string("KMAC")`.
-
-The KMAC padding logic prepends a block containing the encoded secret key to the output message.
-The KMAC first sends the block of secret key then accepts the incoming message bitstream.
-At the end of the message, the software writes `right_encode(output_length)` to MSG_FIFO prior to issue Process command.
-
-### Message FIFO
-
-The KMAC HWIP has a compile-time configurable depth message FIFO inside.
-The message FIFO receives incoming message bitstream regardless of its byte position in a word.
-Then it packs the partial message bytes into the internal 64 bit data width.
-After packing the data, the logic stores the data into the FIFO until the internal KMAC/SHA3 engine consumes the data.
-
-#### FIFO Depth calculation
-
-The depth of the message FIFO is chosen to cover the throughput of the software or other producers such as DMA engine.
-The size of the message FIFO is enough to hold the incoming data while the SHA3 engine is processing the previous block.
-Details are in `kmac_pkg::MsgFifoDepth` parameter.
-Default design parameters assume the system characteristics as below:
-
-- `kmac_pkg::RegLatency`: The register write takes 5 cycles.
-- `kmac_pkg::Sha3Latency`: Keccak round latency takes 96 cycles, which is the masked version of the Keccak round.
-
-#### FIFO Depth and Empty status
-
-If the SW is slow and the SHA3 engine pops the data fast enough, the Message FIFO's depth may remain **0**.
-The Message FIFO's `fifo_empty` signal, however, is lowered for a cycle.
-This enables the HW to fire the interrupt even the FIFO remains empty.
-
-However, the recommended approach to write messages is:
-
-1. Check the FIFO depth [`STATUS.fifo_depth`](data/kmac.hjson#status). This represents the number of entry slots currently occupied in the FIFO.
-2. Calculate the remaining size as `<max number of fifo entries> - <STATUS.fifo_depth>) * <entry size>`.
-3. Write data to fill the remaining size.
-4. Repeat until all data is written.
-
-In code, this looks something like:
-```c
-/**
- * Absorb input data into the Keccak computation.
- *
- * Assumes that the KMAC block is in the "absorb" state; it is the caller's
- * responsibility to check before calling.
- *
- * @param in Input buffer.
- * @param in_len Length of input buffer (bytes).
- * @return Number of bytes written.
- */
-size_t kmac_absorb(const uint8_t *in, size_t in_len) {
-    // Read FIFO depth from the status register.
-    uint32_t status = abs_mmio_read32(kBase + KMAC_STATUS_REG_OFFSET);
-    uint32_t fifo_depth =
-        bitfield_field32_read(status, KMAC_STATUS_FIFO_DEPTH_FIELD);
-
-    // Calculate the remaining space in the FIFO using auto-generated KMAC
-    // parameters and take the minimum of that space and the input length.
-    size_t free_entries = (KMAC_PARAM_NUM_ENTRIES_MSG_FIFO - fifo_depth);
-    size_t max_len = free_entries * KMAC_PARAM_NUM_BYTES_MSG_FIFO_ENTRY;
-    size_t write_len = (in_len < max_len) ? in_len : max_len;
-
-    // Note: this example uses byte-writes for simplicity, but in practice it
-    // would be more efficient to use word-writes for aligned full words and
-    // byte-writes only as needed at the beginning and end of the input.
-    for (size_t i = 0; i < write_len; i++) {
-      abs_mmio_write8(kBase + KMAC_MSG_FIFO_REG_OFFSET, in[i]);
-    }
-
-    return write_len;
-}
-```
-
-The method recommended above is always safe.
-However, in specific contexts, it may be okay to skip polling `STATUS.fifo_depth`.
-Normally, KMAC will process data faster than software can write it, and back pressure on the FIFO interface, should ensure that writes from software will simply block until KMAC can process messages.
-The only reason for polling, then, is to prevent a specific deadlock scenario:
-1. Software has configured KMAC to wait forever for entropy.
-2. There is a problem with the EDN, so entropy is never coming.
-3. The FIFO is full and KMAC is waiting for entropy to process it.
-
-If either the entropy wait timer is nonzero or `kmac_en` is false (so KMAC will not be refreshing entropy), it is safe to write to the FIFO without polling `STATUS.fifo_depth`.
-However, this should be done carefully, and tests should always cover the scenario in which EDN is locked up.
-
-#### Masking
-
-The message FIFO does not generate the masked message data.
-Incoming message bitstream is not sensitive to the leakage.
-If the `EnMasking` parameter is set and [`CFG_SHADOWED.msg_mask`](data/kmac.hjson#cfg_shadowed) is enabled, the message is masked upon loading into the Keccak core using the internal entropy generator.
-The secret key, however, is stored as masked form always.
-
-If the `EnMasking` parameter is not set, the masking is disabled.
-Then, the software has to provide the key in unmasked form by default.
-Any write operations to [`KEY_SHARE1_0`](data/kmac.hjson#key_share1_0) - [`KEY_SHARE1_15`](data/kmac.hjson#key_share1_5) are ignored.
-
-If the `EnMasking` parameter is not set and the `SwKeyMasked` parameter is set, software has to provide the key in masked form.
-Internally, the design then unmasks the key by XORing the two key shares together when loading the key into the engine.
-This is useful when software interface compatibility between the masked and unmasked configuration is desirable.
-
-If the `EnMasking` parameter is set, the `SwKeyMasked` parameter has no effect: Software always provides the key in two shares.
-
-### Keccak State Access
-
-After the Keccak round completes the KMAC/SHA3 operation, the contents of the Keccak state contain the digest value.
-The software can access the 1600 bit of the Keccak state directly through the window of the KMAC/SHA3 register.
-
-If the compile-time parameter masking feature is enabled, the upper 256B of the window is the second share of the Keccak state.
-If not, the upper address space is zero value.
-The software reads both of the Keccak state shares and XORed in the software to get the unmasked digest value if masking feature is set.
-
-The Keccak state is valid after the sponge absorbing process is completed.
-While in an idle state or in the sponge absorbing stage, the value is zero.
-This ensures that the logic does not expose the secret key XORed with the keccak_f results of the prefix to the software.
-In addition to that, the KMAC/SHA3 blocks the software access to the Keccak state when it processes the request from KeyMgr for Key Derivation Function (KDF).
-
-### Application Interface
-
-![](./doc/application-interface.svg)
-
-KMAC/SHA3 HWIP has an option to receive the secret key from the KeyMgr via sideload key interface.
-The software should set [`CFG.sideload`](data/kmac.hjson#cfg) to use the KeyMgr sideloaded key for the SW-initiated KMAC operation.
-`keymgr_pkg::hw_key_t` defines the structure of the sideloaded key.
-KeyMgr provides the sideloaded key in two-share masked form regardless of the compile-time parameter `EnMasking`.
-If `EnMasking` is not defined, the KMAC merges the shared key to the unmasked form before uses the key.
-
-The IP has N number of the application interface. The apps connected to the KMAC IP may initiate the SHA3/cSHAKE/KMAC hashing operation via the application interface `kmac_pkg::app_{req|rsp}_t`.
-The type of the hashing operation is determined in the compile-time parameter `kmac_pkg::AppCfg`.
-
-| Index | App      | Algorithm | Prefix
-|:-----:|:--------:|:---------:|------------
-| 0     | KeyMgr   | KMAC      | CSR prefix
-| 1     | LC_CTRL  | cSHAKE128 | "LC_CTRL"
-| 2     | ROM_CTRL | cSHAKE256 | "ROM_CTRL"
-
-In the current version of IP, the IP has three application interfaces, which are KeyMgr, LC_CTRL, and ROM_CTRL.
-KeyMgr uses the KMAC operation with CSR prefix value.
-LC_CTRL and ROM_CTRL use the cSHAKE operation with the compile-time parameter prefixes.
-
-The app sends 64-bit data (`MsgWidth`) in a beat with the message strobe signal.
-The state machine inside the AppIntf logic starts when it receives the first valid data from any of the AppIntf.
-The AppIntf module chooses the winner based on the fixed priority.
-Then it forwards the selected App to the next stage.
-Because this logic sees the first valid data as an initiator, the Apps cannot run the hashing operation with an empty message.
-After the logic switches to accept the message bitstream from the selected App, if the hashing operation is KMAC, the logic forces the sideloaded key to be used as a secret.
-Also it ignores the command issued from the software.
-Instead it generates the commands and sends them to the KMAC core.
-
-The last beat of the App data moves the state machine to append the encoded output length if the hashing operation is KMAC.
-The output length is the digest width, which is 256 bit always.
-It means that the logic appends `0x020100` (little-endian) to the end of the message.
-The output data from this logic goes to MSG_FIFO.
-Because the MSG_FIFO handles un-aligned data inside, KeyMgr interface logic sends the encoded output length value in a separate beat.
-
-After the encoded output length is pushed to the KMAC core, the interface logic issues a Process command to run the hashing logic.
-
-After hashing operation is completed, KMAC does not raise a `kmac_done` interrupt; rather it triggers the `done` status in the App response channel.
-The result digest always comes in two shares.
-If the `EnMasking` parameter is not set, the second share is always zero.
-
-### Entropy Generator
-
-This section explains the entropy generator inside the KMAC HWIP.
-
-KMAC has an entropy generator to provide the design with pseudo-random numbers while processing the secret key block.
-The entropy is used for both remasking the DOM multipliers inside the Chi function of the Keccak core as well as for masking the message if [`CFG_SHADOWED.msg_mask`](data/kmac.hjson#cfg_shadowed) is enabled.
-
-![Entropy block](./doc/kmac-entropy.svg)
-
-The entropy generator is made up of 25 32-bit linear feedback shift registers (LFSRs).
-This allows the module to generate 800 bits of fresh, pseudo-random numbers required by the 800 DOM multipliers for remasking in every clock cycle.
-To break linear shift patterns, each LFSR features a non-linear layer.
-In addition an 800-bit wide permutation spanning across all LFSRs is used.
-
-Depending on [`CFG_SHADOWED.entropy_mode`](data/kmac.hjson#cfg_shadowed), the entropy generator fetches initial entropy from the [Entropy Distribution Network (EDN)][edn] module or software has to provide a seed by writing the [`ENTROPY_SEED_0`](data/kmac.hjson#entropy_seed_0) - [`ENTROPY_SEED_4`](data/kmac.hjson#entropy_seed_4) registers in ascending order.
-The module periodically refreshes the LFSR seeds with the new entropy from EDN.
-
-To limit the entropy consumption for reseeding, a cascaded reseeding mechanism is used.
-Per reseeding operation, the entropy generator consumes five times 32 bits of entropy from [EDN][edn], one 32-bit word at a time.
-These five 32-bit words are directly fed into LFSRs 0/5/10/15/20 for reseeding.
-At the same time, the previous states of LFSRs 0/5/10/15/20 from before the reseeding operation are permuted and then forwarded to reseed LFSRs 1/6/11/16/21.
-Similarly, the previous states of LFSRs 1/6/11/16/21 from before the reseeding operation are permuted and then forwarded to reseed LFSRs 2/7/12/17/22.
-Software can still request a complete reseed of all 25 LFSRs from EDN by subsequently triggering five reseeding operations through [`CMD.entropy_req`](data/kmac.hjson#cmd).
-
-[edn]: ../edn/README.md
-
-### Error Report
-
-This section explains the errors KMAC HWIP raises during the hashing operations, their meanings, and the error handling process.
-
-KMAC HWIP has the error checkers in its internal datapath.
-If the checkers detect errors, whether they are triggered by the SW mis-configure, or HW malfunctions, they report the error to [`ERR_CODE`](data/kmac.hjson#err_code) and raise an `kmac_error` interrupt.
-Each error code gives debugging information at the lower 24 bits of [`ERR_CODE`](data/kmac.hjson#err_code).
-
-Value | Error Code | Description
-------|------------|-------------
-0x01  | KeyNotValid | In KMAC mode with the sideloaded key, the IP raises an error if the sideloaded secret key is not ready.
-0x02  | SwPushedMsgFifo | MsgFifo is updated while not being in the Message Feed state.
-0x03  | SwIssuedCmdInAppActive | SW issued a command while the application interface is being used
-0x04  | WaitTimerExpired | EDN has not responded within the wait timer limit.
-0x05  | IncorrectEntropyMode | When SW sets `entropy_ready`, the `entropy_mode` is neither SW nor EDN.
-0x06  | UnexpectedModeStrength | SHA3 mode and Keccak Strength combination is not expected.
-0x07  | IncorrectFunctionName | In KMAC mode, the PREFIX has the value other than `encoded_string("KMAC")`
-0x08  | SwCmdSequence | SW does not follow the guided sequence, `start` -> `process` -> {`run` ->} `done`
-0x09  | SwHashingWithoutEntropyReady | SW requests KMAC op without proper config of Entropy in KMAC. This error occurs if KMAC IP masking feature is enabled.
-0x80  | Sha3Control | SW may receive Sha3Control error along with `SwCmdSequence` error. Can be ignored.
-
-#### KeyNotValid (0x01)
-
-The `KeyNotValid` error is raised in the application interface module.
-When a KMAC application requests a hashing operation, the module checks if the sideloaded key is ready.
-If the key is not ready, the module reports `KeyNotValid` error and moves to dead-end state and waits the IP reset.
-
-This error does not provide any additional information.
-
-#### SwPushedMsgFifo (0x02)
-
-The `SwPushedMsgFifo` error happens when the Message FIFO receives TL-UL transactions while the application interface is busy.
-The Message FIFO drops the request.
-
-The IP reports the error with an info field.
-
-Bits    | Name        | Description
---------|-------------|-------------
-[23:16] | reserved    | all zero
-[15:8]  | kmac_app_st | KMAC_APP FSM state.
-[7:0]   | mux_sel     | Current APP Mux selection. 0: None, 1: SW, 2: App
-
-#### SwIssuedCmdInAppActive (0x03)
-
-If the SW issues any commands while the application interface is being used, the module reports `SwIssuedCmdInAppActive` error.
-The received command does not affect the Application process.
-The request is dropped by the KMAC_APP module.
-
-The lower 3 bits of [`ERR_CODE`](data/kmac.hjson#err_code) contains the received command from the SW.
-#### WaitTimerExpired (0x04)
-
-The SW may set the EDN wait timer to exit from EDN request state if the response from EDN takes long.
-If the timer expires, the module cancels the transaction and report the `WaitTimerExpired` error.
-
-When this error happens, the state machine in KMAC_ENTROPY module moves to Wait state.
-In that state, it keeps using the pre-generated entropy and asserting the entropy valid signal.
-It asserts the entropy valid signal to complete the current hashing operation.
-If the module does not complete, or flush the pending operation, it creates the back pressure to the message FIFO.
-Then, the SW may not be able to access the KMAC IP at all, as the crossbar is stuck.
-
-The SW may move the state machine to the reset state by issuing [`CFG.err_processed`](data/kmac.hjson#cfg).
-
-#### IncorrectEntropyMode (0x05)
-
-If SW misconfigures the entropy mode and let the entropy module prepare the random data, the module reports `IncorrectEntropyMode` error.
-The state machine moves to Wait state after reporting the error.
-
-The SW may move the state machine to the reset state by issuing [`CFG.err_processed`](data/kmac.hjson#cfg).
-
-#### UnexpectedModeStrength (0x06)
-
-When the SW issues `Start` command, the KMAC_ERRCHK module checks the [`CFG.mode`](data/kmac.hjson#cfg) and [`CFG.kstrength`](data/kmac.hjson#cfg).
-The KMAC HWIP assumes the combinations of two to be **SHA3-224**, **SHA3-256**, **SHA3-384**, **SHA3-512**, **SHAKE-128**, **SHAKE-256**, **cSHAKE-128**, and **cSHAKE-256**.
-If the combination of the `mode` and `kstrength` does not fall into above, the module reports the `UnexpectedModeStrength` error.
-
-However, the KMAC HWIP proceeds the hashing operation as other combinations does not cause any malfunctions inside the IP.
-The SW may get the incorrect digest value.
-
-#### IncorrectFunctionName (0x07)
-
-If [`CFG.kmac_en`](data/kmac.hjson#cfg) is set and the SW issues the `Start` command, the KMAC_ERRCHK checks if the [`PREFIX`](data/kmac.hjson#prefix) has correct function name, `encode_string("KMAC")`.
-If the value does not match to the byte form of `encode_string("KMAC")` (`0x4341_4D4B_2001`), it reports the `IncorrectFunctionName` error.
-
-As same as `UnexpectedModeStrength` error, this error does not block the hashing operation.
-The SW may get the incorrect signature value.
-
-#### SwCmdSequence (0x08)
-
-The KMAC_ERRCHK module checks the SW issued commands if it follows the guideline.
-If the SW issues the command that is not relevant to the current context, the module reports the `SwCmdSequence` error.
-The lower 3bits of the [`ERR_CODE`](data/kmac.hjson#err_code) contains the received command.
-
-This error, however, does not stop the KMAC HWIP.
-The incorrect command is dropped at the following datapath, SHA3 core.
-
-# Programmers Guide
-
-## Initialization
-
-The software can update the KMAC/SHA3 configurations only when the IP is in the idle state.
-The software should check [`STATUS.sha3_idle`](data/kmac.hjson#status) before updating the configurations.
-The software must first program [`CFG.msg_endianness`](data/kmac.hjson#cfg) and [`CFG.state_endianness`](data/kmac.hjson#cfg) at the initialization stage.
-These determine the byte order of incoming messages (msg_endianness) and the Keccak state output (state_endianness).
-
-## Software Initiated KMAC/SHA3 process
-
-This section describes the expected software process to run the KMAC/SHA3 HWIP.
-At first, the software configures [`CFG.kmac_en`](data/kmac.hjson#cfg) for KMAC operation.
-If KMAC is enabled, the software should configure [`CFG.mode`](data/kmac.hjson#cfg) to cSHAKE and [`CFG.kstrength`](data/kmac.hjson#cfg) to 128 or 256 bit security strength.
-The software also updates [`PREFIX`](data/kmac.hjson#prefix) registers if cSHAKE mode is used.
-Current design does not convert cSHAKE mode to SHAKE even if [`PREFIX`](data/kmac.hjson#prefix) is empty string.
-It is the software's responsibility to change the [`CFG.mode`](data/kmac.hjson#cfg) to SHAKE in case of empty [`PREFIX`](data/kmac.hjson#prefix).
-The KMAC/SHA3 HWIP uses [`PREFIX`](data/kmac.hjson#prefix) registers as it is.
-It means that the software should update [`PREFIX`](data/kmac.hjson#prefix) with encoded values.
-
-If [`CFG.kmac_en`](data/kmac.hjson#cfg) is set, the software should update the secret key.
-The software prepares two shares of the secret key and selects its length in [`KEY_LEN`](data/kmac.hjson#key_len) then writes the shares of the secret key to [`KEY_SHARE0`](data/kmac.hjson#key_share0) and [`KEY_SHARE1`](data/kmac.hjson#key_share1) .
-The two shares of the secret key are the values that represent the secret key value when they are XORed together.
-The software can XOR the unmasked secret key with entropy.
-The XORed value is a share and the entropy used is the other share.
-
-After configuring, the software notifies the KMAC/SHA3 engine to accept incoming messages by issuing Start command into [`CMD`](data/kmac.hjson#cmd) .
-If Start command is not issued, the incoming message is discarded.
-If KMAC is enabled, the software pushes the `right_encode(output_length)` value at the end of the message.
-For example, if the desired output length is 256 bit, the software writes `0x00020100` to MSG_FIFO.
-
-After the software pushes all messages, it issues Process command to [`CMD`](data/kmac.hjson#cmd) for SHA3 engine to complete the sponge absorbing process.
-SHA3 hashing engine pads the incoming message as defined in the SHA3 specification.
-
-After the SHA3 engine completes the sponge absorbing step, it generates `kmac_done` interrupt.
-Or the software can poll the [`STATUS.squeeze`](data/kmac.hjson#status) bit until it becomes 1.
-In this stage, the software may run the Keccak round manually.
-
-If the desired digest length is greater than the Keccak rate, the software issues Run command for the Keccak round logic to run one full round after the software reads the current available Keccak state.
-At this stage, KMAC/SHA3 does not raise an interrupt when the Keccak round completes the software initiated manual run.
-The software should check [`STATUS.squeeze`](data/kmac.hjson#status) register field for the readiness of [`STATE`](data/kmac.hjson#state) value.
-
-After the software reads all the digest values, it issues Done command to [`CMD`](data/kmac.hjson#cmd) register to clear the internal states.
-Done command clears the Keccak state, FSM in SHA3 and KMAC, and a few internal variables.
-Secret key and other software programmed values won't be reset.
-
-
-## Endianness
-
-This KMAC HWIP operates in little-endian.
-Internal SHA3 hashing engine receives in 64-bit granularity.
-The data written to SHA3 is assumed to be little endian.
-
-The software may write/read the data in big-endian order if [`CFG.msg_endianness`](data/kmac.hjson#cfg) or [`CFG.state_endianness`](data/kmac.hjson#cfg) is set.
-If the endianness bit is 1, the data is assumed to be big-endian.
-So, the internal logic byte-swap the data.
-For example, when the software writes `0xDEADBEEF` with endianness as 1, the logic converts it to `0xEFBEADDE` then writes into MSG_FIFO.
-
-The software managed secret key, and the prefix are always little-endian values.
-For example, if the software configures the function name `N` in KMAC operation, it writes `encode_string("KMAC")`.
-The `encode_string("KMAC")` represents `0x01 0x20 0x4b 0x4d 0x41 0x43` in byte order.
-The software writes `0x4d4b2001` into [`PREFIX0`](data/kmac.hjson#prefix0) and `0x????4341` into [`PREFIX1`](data/kmac.hjson#prefix1) .
-Upper 2 bytes can vary depending on the customization input string `S`.
-
-## KMAC/SHA3 context switching
-
-This version of KMAC/SHA3 HWIP _does not_ support the software context switching.
-A context switching scheme would allow software to save the current hashing engine state and initiate a new high priority hashing operation.
-It could restore the previous hashing state later and continue the operation.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_kmac.h)
-
-## Registers
-
-* [Register Table](data/kmac.hjson#registers)
-
-[SHA3 specification, FIPS 202]: https://csrc.nist.gov/publications/detail/fips/202/final
-[NIST SP 800-185]: https://csrc.nist.gov/publications/detail/sp/800-185/final
-[Domain-Oriented Masking (DOM)]: https://eprint.iacr.org/2017/395.pdf
diff --git a/hw/ip/kmac/doc/programmers_guide.md b/hw/ip/kmac/doc/programmers_guide.md
new file mode 100644
index 0000000000000..2f7c6c091bd53
--- /dev/null
+++ b/hw/ip/kmac/doc/programmers_guide.md
@@ -0,0 +1,602 @@
+# Programmer's Guide
+
+## Initialization
+
+The software can update the KMAC/SHA3 configurations only when the IP is in the idle state.
+The software should check [`STATUS.sha3_idle`](data/kmac.hjson#status) before updating the configurations.
+The software must first program [`CFG.msg_endianness`](data/kmac.hjson#cfg) and [`CFG.state_endianness`](data/kmac.hjson#cfg) at the initialization stage.
+These determine the byte order of incoming messages (msg_endianness) and the Keccak state output (state_endianness).
+
+## Software Initiated KMAC/SHA3 process
+
+This section describes the expected software process to run the KMAC/SHA3 HWIP.
+At first, the software configures [`CFG.kmac_en`](data/kmac.hjson#cfg) for KMAC operation.
+If KMAC is enabled, the software should configure [`CFG.mode`](data/kmac.hjson#cfg) to cSHAKE and [`CFG.kstrength`](data/kmac.hjson#cfg) to 128 or 256 bit security strength.
+The software also updates [`PREFIX`](data/kmac.hjson#prefix) registers if cSHAKE mode is used.
+Current design does not convert cSHAKE mode to SHAKE even if [`PREFIX`](data/kmac.hjson#prefix) is empty string.
+It is the software's responsibility to change the [`CFG.mode`](data/kmac.hjson#cfg) to SHAKE in case of empty [`PREFIX`](data/kmac.hjson#prefix).
+The KMAC/SHA3 HWIP uses [`PREFIX`](data/kmac.hjson#prefix) registers as it is.
+It means that the software should update [`PREFIX`](data/kmac.hjson#prefix) with encoded values.
+
+If [`CFG.kmac_en`](data/kmac.hjson#cfg) is set, the software should update the secret key.
+The software prepares two shares of the secret key and selects its length in [`KEY_LEN`](data/kmac.hjson#key_len) then writes the shares of the secret key to [`KEY_SHARE0`](data/kmac.hjson#key_share0) and [`KEY_SHARE1`](data/kmac.hjson#key_share1) .
+The two shares of the secret key are the values that represent the secret key value when they are XORed together.
+The software can XOR the unmasked secret key with entropy.
+The XORed value is a share and the entropy used is the other share.
+
+After configuring, the software notifies the KMAC/SHA3 engine to accept incoming messages by issuing Start command into [`CMD`](../data/kmac.hjson#cmd) .
+If Start command is not issued, the incoming message is discarded.
+If KMAC is enabled, the software pushes the `right_encode(output_length)` value at the end of the message.
+For example, if the desired output length is 256 bit, the software writes `0x00020100` to MSG_FIFO.
+
+After the software pushes all messages, it issues Process command to [`CMD`](data/kmac.hjson#cmd) for SHA3 engine to complete the sponge absorbing process.
+SHA3 hashing engine pads the incoming message as defined in the SHA3 specification.
+
+After the SHA3 engine completes the sponge absorbing step, it generates `kmac_done` interrupt.
+Or the software can poll the [`STATUS.squeeze`](data/kmac.hjson#status) bit until it becomes 1.
+In this stage, the software may run the Keccak round manually.
+
+If the desired digest length is greater than the Keccak rate, the software issues Run command for the Keccak round logic to run one full round after the software reads the current available Keccak state.
+At this stage, KMAC/SHA3 does not raise an interrupt when the Keccak round completes the software initiated manual run.
+The software should check [`STATUS.squeeze`](data/kmac.hjson#status) register field for the readiness of [`STATE`](data/kmac.hjson#state) value.
+
+After the software reads all the digest values, it issues Done command to [`CMD`](data/kmac.hjson#cmd) register to clear the internal states.
+Done command clears the Keccak state, FSM in SHA3 and KMAC, and a few internal variables.
+Secret key and other software programmed values won't be reset.
+
+
+## Endianness
+
+This KMAC HWIP operates in little-endian.
+Internal SHA3 hashing engine receives in 64-bit granularity.
+The data written to SHA3 is assumed to be little endian.
+
+The software may write/read the data in big-endian order if [`CFG.msg_endianness`](data/kmac.hjson#cfg) or [`CFG.state_endianness`](data/kmac.hjson#cfg) is set.
+If the endianness bit is 1, the data is assumed to be big-endian.
+So, the internal logic byte-swaps the data.
+For example, when the software writes `0xDEADBEEF` with endianness as 1, the logic converts it to `0xEFBEADDE` then writes into MSG_FIFO.
+
+The software-managed secret key and the prefix are always little-endian values.
+For example, if the software configures the function name `N` in KMAC operation, it writes `encode_string("KMAC")`.
+The `encode_string("KMAC")` represents `0x01 0x20 0x4b 0x4d 0x41 0x43` in byte order.
+The software writes `0x4d4b2001` into [`PREFIX0`](data/kmac.hjson#prefix0) and `0x????4341` into [`PREFIX1`](data/kmac.hjson#prefix1) .
+Upper 2 bytes can vary depending on the customization input string `S`.
+
+## KMAC/SHA3 context switching
+
+This version of KMAC/SHA3 HWIP _does not_ support the software context switching.
+A context switching scheme would allow software to save the current hashing engine state and initiate a new high priority hashing operation.
+It could restore the previous hashing state later and continue the operation.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../sw/device/lib/dif/dif_kmac.h)
+
+## Registers
+
+* [Register Table](data/kmac.hjson#registers)
+
+[SHA3 specification, FIPS 202]: https://csrc.nist.gov/publications/detail/fips/202/final
+[NIST SP 800-185]: https://csrc.nist.gov/publications/detail/sp/800-185/final
+[Domain-Oriented Masking (DOM)]: https://eprint.iacr.org/2017/395.pdf
+
+# Overview
+
+This document specifies the Keccak Message Authentication Code (KMAC) and Secure Hashing Algorithm 3 (SHA3) hardware IP functionality.
+This module conforms to the OpenTitan guideline for peripheral device functionality.
+See that document for integration overview within the broader OpenTitan top level system.
+
+## Features
+
+- Support for SHA3-224, 256, 384, 512, SHAKE[128, 256] and cSHAKE[128, 256]
+- Support byte-granularity on input message
+- Support 128b, 192b, 256b, 384b, 512b of the secret key length in KMAC mode
+- Support arbitrary output length for SHAKE, cSHAKE, KMAC
+- Support customization input string S, and function-name N up to 36 bytes total
+- 64b x 10 depth Message FIFO
+- 1600b of internal state (internally represented in two shares for 1st-order masking)
+- Performance goals of >= 72 Mb/s @ 100MHz (when entropy is available always)
+    - SHA3-512: roughly 66 MB/s at most
+    - SHA3-224: 120 MB/s at most
+- Implement 1st-order masked Keccak permutations for Side-Channel Analysis (SCA) protection
+
+## Description
+
+The KMAC module is a Keccak based message authentication code generator to check the integrity of an incoming message and a signature signed with the same secret key.
+The secret key length can vary up to 512 bits.
+
+The KMAC generates at most 1600 bits of the digest value at a time which can be read from the STATE memory region.
+There's a way for the software to read more digest values by manually running the Keccak rounds.
+The details of the operation, known as _sponge construction_, are described in the [SHA3 specification, FIPS 202][].
+
+The KMAC HWIP also performs the SHA3 hash functions without the authentication, whose purpose is to check the correctness of the received message.
+The KMAC IP supports various SHA3 hashing functions including SHA3 Extended Output Function (XOF) known as SHAKE functions.
+
+The KMAC HWIP implements a defense mechanism to deter SCA attacks.
+It is expected to protect against 1st-order SCA attacks by implementing masked storage and [Domain-Oriented Masking (DOM)][] inside the Keccak function.
+
+# Theory of Operation
+
+## Block Diagram
+
+![](../doc/kmac-block-diagram.svg)
+
+The above figure shows the KMAC/SHA3 HWIP block diagram.
+The KMAC has register interfaces for SW to configure the module, initiate the hashing process, and acquire the result digest from the STATE memory region.
+It also has an interface to the KeyMgr to get the secret key (masked).
+The IP has N x [application interfaces](#application-interface), which allows other HWIPs to request any pre-defined hashing operations.
+
+Similar to HMAC, the KMAC HWIP also has a message FIFO (MSG_FIFO) whose depth was determined based on a few criteria such as the register interface width and its latency, and the latency of the hashing algorithm (Keccak).
+Based on the given criteria, the MSG_FIFO depth was determined to store the incoming message while the SHA3 core is in computation.
+
+The MSG_FIFO has a packer in front.
+It packs any partial writes into the size of internal datapath (64bit) and stores in MSG_FIFO.
+It frees the software from having to align the messages.
+It also doesn't need the message length information.
+
+The fed messages go into the KMAC core regardless of whether KMAC is enabled or not.
+The KMAC core forwards the messages to SHA3 core in case KMAC hash functionality is disabled.
+KMAC core prepends the encoded secret key as described in the SHA3 Derived Functions specification.
+It is expected that the software writes the encoded output length at the end of the message.
+For hashing operations triggered by an IP through the application interface, the encoded output length is appended inside the AppIntf module in the KMAC HWIP.
+
+The SHA3 core is the main Keccak processing module.
+It supports SHA3 hashing functions, SHAKE128, SHAKE256 extended output functions, and also cSHAKE128, cSHAKE256 functions in order to support KMAC operation.
+To support multiple hashing functions, it has the padding logic inside.
+The padding logic mainly pads the predefined bits at the end of the message and also performs `pad10*1()` function.
+If cSHAKE mode is set, the padding logic also prepends the encoded function name `N` and the customization string `S` prior to the incoming messages according to the spec requirements.
+
+Both the internal state width and the masking of the Keccak core are configurable via compile-time Verilog parameters.
+By default, 1600 bits of internal state are used and stored in two shares (1st order masking).
+The masked Keccak core takes 4 clock cycles per round if sufficient entropy is available.
+If desired, the masking can be disabled and the internal state width can be reduced to 25, 50, or 100 bits at compile time.
+
+## Hardware Interface
+
+* [Interface Tables](../data/kmac.hjson#interfaces)
+
+## Design Details
+
+### Keccak Round
+
+A Keccak round implements the Keccak_f function described in the SHA3 specification.
+Keccak round logic in KMAC/SHA3 HWIP not only supports 1600 bit internal states but also all possible values {50, 100, 200, 400, 800, 1600} based on a parameter `Width`.
+If masking is disabled via compile-time Verilog parameter `EnMasking`, also 25 can be selected as state width.
+Keccak permutations in the specification allow arbitrary number of rounds.
+This module, however, supports Keccak_f which always runs `12 + 2*L` rounds, where $$ L = log_2 {( {Width \over 25} )} $$ .
+For instance, 200 bits of internal state run 18 rounds.
+KMAC/SHA3 instantiates the Keccak round module with 1600 bit.
+
+![](../doc/keccak-round.svg)
+
+Keccak round logic has two phases inside.
+Theta, Rho, Pi functions are executed at the 1st phase.
+Chi and Iota functions run at the 2nd phase.
+If the compile-time Verilog parameter `EnMasking` is not set, i.e., if masking is not enabled, the first phase and the second phase run at the same cycle.
+
+If masking is enabled, the Keccak round logic stores the intermediate state after processing the 1st phase.
+The stored values are then fed into the 2nd phase computing the Chi and Iota functions.
+The Chi function leverages first-order [Domain-Oriented Masking (DOM)][] to impede SCA attacks.
+
+To balance circuit area and SCA hardening, the Chi function uses 800 instead of 1600 DOM multipliers but the multipliers are fully pipelined.
+The Chi and Iota functions are thus separately applied to the two halves of the state and the 2nd phase takes in total three clock cycles to complete.
+In the first clock cycle of the 2nd phase, the first stage of Chi is computed for the first lane halves of the state.
+In the second clock cycle, the new first lane halves are output and written to state register.
+At the same time, the first stage of Chi is computed for the second lane halves.
+In the third clock cycle, the new second lane halves are output and written to the state register.
+
+The 800 DOM multipliers need 800 bits of fresh entropy for remasking.
+If fresh entropy is not available, the DOM multipliers do not move forward and the 2nd phase will take more than three clock cycles.
+Processing a Keccak_f (1600 bit state) takes a total of 96 cycles (24 rounds X 4 cycles/round) including the 1st and 2nd phases.
+
+If the masking compile time option is enabled, Keccak round logic requires an additional 3200 flip flops to store the intermediate half state inside the 800 DOM multipliers.
+In addition to that Keccak round logic needs two sets of the same Theta, Rho, and Pi functions.
+As a result, the masked Keccak round logic takes more than twice as much area as the unmasked version.
+
+### Padding for Keccak
+
+Padding logic supports SHA3/SHAKE/cSHAKE algorithms.
+cSHAKE needs the extra inputs for the Function-name `N` and the Customization string `S`.
+Other than that, SHA3, SHAKE, and cSHAKE share similar datapath inside the padding module except the last part added next to the end of the message.
+SHA3 adds `2'b 10`, SHAKE adds `4'b 1111`, cSHAKE adds `2'b00` then `pad10*1()` follows.
+All are little-endian values.
+
+Interface between this padding logic and the MSG_FIFO follows the conventional FIFO interface.
+So `prim_fifo_*` can talk to the padding logic directly.
+This module talks to Keccak round logic with a more memory-like interface.
+The interface has an additional address signal on top of the valid, ready, and data signals.
+
+![](../doc/sha3-padding.svg)
+
+The hashing process begins when the software issues the start command to [`CMD`](../data/kmac.hjson#cmd) .
+If cSHAKE is enabled, the padding logic expands the prefix value (`N || S` above) into a block size.
+The block size is determined by the [`CFG.kstrength`](../data/kmac.hjson#cfg) .
+If the value is 128, the block size will be 168 bytes.
+If it is 256, the block size will be 136 bytes.
+The expanded prefix value is transmitted to the Keccak round logic.
+After sending the block size, the padding logic triggers the Keccak round logic to run a full 24 rounds.
+
+If the mode is not cSHAKE, or cSHAKE mode and the prefix block has been processed, the padding logic accepts the incoming message bitstream and forwards the data to the Keccak round logic in a block granularity.
+The padding logic controls the data flow and makes the Keccak logic run after sending a block size.
+
+After the software writes the message bitstream, it should issue the Process command into [`CMD`](../data/kmac.hjson#cmd) register.
+The padding logic, after receiving the Process command, appends proper ending bits with respect to the [`CFG.mode`](../data/kmac.hjson#cfg) value.
+The logic writes 0 up to the block size to the Keccak round logic then ends with 1 at the end of the block.
+
+![](../doc/sha3-padding-fsm.svg)
+
+After the Keccak round completes the last block, the padding logic asserts an `absorbed` signal to notify the software.
+The signal generates the `kmac_done` interrupt.
+At this point, the software is able to read the digest in [`STATE`](../data/kmac.hjson#state) memory region.
+If the output length is greater than the Keccak block rate in SHAKE and cSHAKE mode, the software may run the Keccak round manually by issuing Run command to [`CMD`](../data/kmac.hjson#cmd) register.
+
+The software completes the operation by issuing Done command after reading the digest.
+The padding logic clears internal variables and goes back to Idle state.
+
+### Padding for KMAC
+
+![](../doc/kmac-padding.svg)
+
+KMAC core prepends and appends additional bitstream on top of Keccak padding logic in SHA3 core.
+The [NIST SP 800-185][] defines `KMAC[128,256](K, X, L, S)` as a cSHAKE function.
+See the section 4.3 in NIST SP 800-185 for details.
+If KMAC is enabled, the software should configure [`CFG.mode`](../data/kmac.hjson#cfg) to cSHAKE and the first six bytes of [`PREFIX`](../data/kmac.hjson#prefix) to `0x01204B4D4143` (big-endian).
+The first six bytes of [`PREFIX`](../data/kmac.hjson#prefix) represents the value of `encode_string("KMAC")`.
+
+The KMAC padding logic prepends a block containing the encoded secret key to the output message.
+The KMAC first sends the block of secret key then accepts the incoming message bitstream.
+At the end of the message, the software writes `right_encode(output_length)` to MSG_FIFO prior to issuing the Process command.
+
+### Message FIFO
+
+The KMAC HWIP has a compile-time configurable depth message FIFO inside.
+The message FIFO receives incoming message bitstream regardless of its byte position in a word.
+Then it packs the partial message bytes into the internal 64 bit data width.
+After packing the data, the logic stores the data into the FIFO until the internal KMAC/SHA3 engine consumes the data.
+
+#### FIFO Depth calculation
+
+The depth of the message FIFO is chosen to cover the throughput of the software or other producers such as DMA engine.
+The size of the message FIFO is enough to hold the incoming data while the SHA3 engine is processing the previous block.
+Details are in `kmac_pkg::MsgFifoDepth` parameter.
+Default design parameters assume the system characteristics as below:
+
+- `kmac_pkg::RegLatency`: The register write takes 5 cycles.
+- `kmac_pkg::Sha3Latency`: Keccak round latency takes 96 cycles, which is the masked version of the Keccak round.
+
+#### FIFO Depth and Empty status
+
+If the SW is slow and the SHA3 engine pops the data fast enough, the Message FIFO's depth may remain **0**.
+The Message FIFO's `fifo_empty` signal, however, is lowered for a cycle.
+This enables the HW to fire the interrupt even if the FIFO remains empty.
+
+However, the recommended approach to write messages is:
+
+1. Check the FIFO depth [`STATUS.fifo_depth`](../data/kmac.hjson#status). This represents the number of entry slots currently occupied in the FIFO.
+2. Calculate the remaining size as `(<max number of fifo entries> - <STATUS.fifo_depth>) * <entry size>`.
+3. Write data to fill the remaining size.
+4. Repeat until all data is written.
+
+In code, this looks something like:
+```c
+/**
+ * Absorb input data into the Keccak computation.
+ *
+ * Assumes that the KMAC block is in the "absorb" state; it is the caller's
+ * responsibility to check before calling.
+ *
+ * @param in Input buffer.
+ * @param in_len Length of input buffer (bytes).
+ * @return Number of bytes written.
+ */
+size_t kmac_absorb(const uint8_t *in, size_t in_len) {
+    // Read FIFO depth from the status register.
+    uint32_t status = abs_mmio_read32(kBase + KMAC_STATUS_REG_OFFSET);
+    uint32_t fifo_depth =
+        bitfield_field32_read(status, KMAC_STATUS_FIFO_DEPTH_FIELD);
+
+    // Calculate the remaining space in the FIFO using auto-generated KMAC
+    // parameters and take the minimum of that space and the input length.
+    size_t free_entries = (KMAC_PARAM_NUM_ENTRIES_MSG_FIFO - fifo_depth);
+    size_t max_len = free_entries * KMAC_PARAM_NUM_BYTES_MSG_FIFO_ENTRY;
+    size_t write_len = (in_len < max_len) ? in_len : max_len;
+
+    // Note: this example uses byte-writes for simplicity, but in practice it
+    // would be more efficient to use word-writes for aligned full words and
+    // byte-writes only as needed at the beginning and end of the input.
+    for (size_t i = 0; i < write_len; i++) {
+      abs_mmio_write8(kBase + KMAC_MSG_FIFO_REG_OFFSET, in[i]);
+    }
+
+    return write_len;
+}
+```
+
+The method recommended above is always safe.
+However, in specific contexts, it may be okay to skip polling `STATUS.fifo_depth`.
+Normally, KMAC will process data faster than software can write it, and back pressure on the FIFO interface should ensure that writes from software will simply block until KMAC can process messages.
+The only reason for polling, then, is to prevent a specific deadlock scenario:
+1. Software has configured KMAC to wait forever for entropy.
+2. There is a problem with the EDN, so entropy is never coming.
+3. The FIFO is full and KMAC is waiting for entropy to process it.
+
+If either the entropy wait timer is nonzero or `kmac_en` is false (so KMAC will not be refreshing entropy), it is safe to write to the FIFO without polling `STATUS.fifo_depth`.
+However, this should be done carefully, and tests should always cover the scenario in which EDN is locked up.
+
+#### Masking
+
+The message FIFO does not generate the masked message data.
+Incoming message bitstream is not sensitive to the leakage.
+If the `EnMasking` parameter is set and [`CFG_SHADOWED.msg_mask`](../data/kmac.hjson#cfg_shadowed) is enabled, the message is masked upon loading into the Keccak core using the internal entropy generator.
+The secret key, however, is stored as masked form always.
+
+If the `EnMasking` parameter is not set, the masking is disabled.
+Then, the software has to provide the key in unmasked form by default.
+Any write operations to [`KEY_SHARE1_0`](../data/kmac.hjson#key_share1_0) - [`KEY_SHARE1_15`](../data/kmac.hjson#key_share1_15) are ignored.
+
+If the `EnMasking` parameter is not set and the `SwKeyMasked` parameter is set, software has to provide the key in masked form.
+Internally, the design then unmasks the key by XORing the two key shares together when loading the key into the engine.
+This is useful when software interface compatibility between the masked and unmasked configuration is desirable.
+
+If the `EnMasking` parameter is set, the `SwKeyMasked` parameter has no effect: Software always provides the key in two shares.
+
+### Keccak State Access
+
+After the Keccak round completes the KMAC/SHA3 operation, the contents of the Keccak state contain the digest value.
+The software can access the 1600 bit of the Keccak state directly through the window of the KMAC/SHA3 register.
+
+If the compile-time parameter masking feature is enabled, the upper 256B of the window is the second share of the Keccak state.
+If not, the upper address space is zero value.
+If the masking feature is enabled, the software reads both of the Keccak state shares and XORs them together to get the unmasked digest value.
+
+The Keccak state is valid after the sponge absorbing process is completed.
+While in an idle state or in the sponge absorbing stage, the value is zero.
+This ensures that the logic does not expose the secret key XORed with the keccak_f results of the prefix to the software.
+In addition to that, the KMAC/SHA3 blocks the software access to the Keccak state when it processes the request from KeyMgr for Key Derivation Function (KDF).
+
+### Application Interface
+
+![](../doc/application-interface.svg)
+
+KMAC/SHA3 HWIP has an option to receive the secret key from the KeyMgr via sideload key interface.
+The software should set [`CFG.sideload`](../data/kmac.hjson#cfg) to use the KeyMgr sideloaded key for the SW-initiated KMAC operation.
+`keymgr_pkg::hw_key_t` defines the structure of the sideloaded key.
+KeyMgr provides the sideloaded key in two-share masked form regardless of the compile-time parameter `EnMasking`.
+If `EnMasking` is not defined, the KMAC merges the shared key to the unmasked form before using the key.
+
+The IP has N number of the application interface. The apps connected to the KMAC IP may initiate the SHA3/cSHAKE/KMAC hashing operation via the application interface `kmac_pkg::app_{req|rsp}_t`.
+The type of the hashing operation is determined in the compile-time parameter `kmac_pkg::AppCfg`.
+
+| Index | App      | Algorithm | Prefix
+|:-----:|:--------:|:---------:|------------
+| 0     | KeyMgr   | KMAC      | CSR prefix
+| 1     | LC_CTRL  | cSHAKE128 | "LC_CTRL"
+| 2     | ROM_CTRL | cSHAKE256 | "ROM_CTRL"
+
+In the current version of IP, the IP has three application interfaces, which are KeyMgr, LC_CTRL, and ROM_CTRL.
+KeyMgr uses the KMAC operation with CSR prefix value.
+LC_CTRL and ROM_CTRL use the cSHAKE operation with the compile-time parameter prefixes.
+
+The app sends 64-bit data (`MsgWidth`) in a beat with the message strobe signal.
+The state machine inside the AppIntf logic starts when it receives the first valid data from any of the AppIntf.
+The AppIntf module chooses the winner based on the fixed priority.
+Then it forwards the selected App to the next stage.
+Because this logic sees the first valid data as an initiator, the Apps cannot run the hashing operation with an empty message.
+After the logic switches to accept the message bitstream from the selected App, if the hashing operation is KMAC, the logic forces the sideloaded key to be used as a secret.
+Also it ignores the command issued from the software.
+Instead it generates the commands and sends them to the KMAC core.
+
+The last beat of the App data moves the state machine to append the encoded output length if the hashing operation is KMAC.
+The output length is the digest width, which is 256 bit always.
+It means that the logic appends `0x020100` (little-endian) to the end of the message.
+The output data from this logic goes to MSG_FIFO.
+Because the MSG_FIFO handles un-aligned data inside, KeyMgr interface logic sends the encoded output length value in a separate beat.
+
+After the encoded output length is pushed to the KMAC core, the interface logic issues a Process command to run the hashing logic.
+
+After hashing operation is completed, KMAC does not raise a `kmac_done` interrupt; rather it triggers the `done` status in the App response channel.
+The result digest always comes in two shares.
+If the `EnMasking` parameter is not set, the second share is always zero.
+
+### Entropy Generator
+
+This section explains the entropy generator inside the KMAC HWIP.
+
+KMAC has an entropy generator to provide the design with pseudo-random numbers while processing the secret key block.
+The entropy is used for both remasking the DOM multipliers inside the Chi function of the Keccak core as well as for masking the message if [`CFG_SHADOWED.msg_mask`](../data/kmac.hjson#cfg_shadowed) is enabled.
+
+![Entropy block](../doc/kmac-entropy.svg)
+
+The entropy generator is made up of 25 32-bit linear feedback shift registers (LFSRs).
+This allows the module to generate 800 bits of fresh, pseudo-random numbers required by the 800 DOM multipliers for remasking in every clock cycle.
+To break linear shift patterns, each LFSR features a non-linear layer.
+In addition an 800-bit wide permutation spanning across all LFSRs is used.
+
+Depending on [`CFG_SHADOWED.entropy_mode`](../data/kmac.hjson#cfg_shadowed), the entropy generator fetches initial entropy from the [Entropy Distribution Network (EDN)][edn] module or software has to provide a seed by writing the [`ENTROPY_SEED_0`](data/kmac.hjson#entropy_seed_0) - [`ENTROPY_SEED_4`](data/kmac.hjson#entropy_seed_4) registers in ascending order.
+The module periodically refreshes the LFSR seeds with the new entropy from EDN.
+
+To limit the entropy consumption for reseeding, a cascaded reseeding mechanism is used.
+Per reseeding operation, the entropy generator consumes five times 32 bits of entropy from [EDN][edn], one 32-bit word at a time.
+These five 32-bit words are directly fed into LFSRs 0/5/10/15/20 for reseeding.
+At the same time, the previous states of LFSRs 0/5/10/15/20 from before the reseeding operation are permuted and then forwarded to reseed LFSRs 1/6/11/16/21.
+Similarly, the previous states of LFSRs 1/6/11/16/21 from before the reseeding operation are permuted and then forwarded to reseed LFSRs 2/7/12/17/22.
+Software can still request a complete reseed of all 25 LFSRs from EDN by subsequently triggering five reseeding operations through [`CMD.entropy_req`](data/kmac.hjson#cmd).
+
+[edn]: ../../edn/README.md
+
+### Error Report
+
+This section explains the errors KMAC HWIP raises during the hashing operations, their meanings, and the error handling process.
+
+KMAC HWIP has the error checkers in its internal datapath.
+If the checkers detect errors, whether they are triggered by SW misconfiguration or HW malfunctions, they report the error to [`ERR_CODE`](../data/kmac.hjson#err_code) and raise a `kmac_error` interrupt.
+Each error code gives debugging information at the lower 24 bits of [`ERR_CODE`](../data/kmac.hjson#err_code).
+
+Value | Error Code | Description
+------|------------|-------------
+0x01  | KeyNotValid | In KMAC mode with the sideloaded key, the IP raises an error if the sideloaded secret key is not ready.
+0x02  | SwPushedMsgFifo | MsgFifo is updated while not being in the Message Feed state.
+0x03  | SwIssuedCmdInAppActive | SW issued a command while the application interface is being used
+0x04  | WaitTimerExpired | EDN has not responded within the wait timer limit.
+0x05  | IncorrectEntropyMode | When SW sets `entropy_ready`, the `entropy_mode` is neither SW nor EDN.
+0x06  | UnexpectedModeStrength | SHA3 mode and Keccak Strength combination is not expected.
+0x07  | IncorrectFunctionName | In KMAC mode, the PREFIX has the value other than `encode_string("KMAC")`
+0x08  | SwCmdSequence | SW does not follow the guided sequence, `start` -> `process` -> {`run` ->} `done`
+0x09  | SwHashingWithoutEntropyReady | SW requests KMAC op without proper config of Entropy in KMAC. This error occurs if KMAC IP masking feature is enabled.
+0x80  | Sha3Control | SW may receive Sha3Control error along with `SwCmdSequence` error. Can be ignored.
+
+#### KeyNotValid (0x01)
+
+The `KeyNotValid` error is raised in the application interface module.
+When a KMAC application requests a hashing operation, the module checks if the sideloaded key is ready.
+If the key is not ready, the module reports `KeyNotValid` error and moves to dead-end state and waits for the IP reset.
+
+This error does not provide any additional information.
+
+#### SwPushedMsgFifo (0x02)
+
+The `SwPushedMsgFifo` error happens when the Message FIFO receives TL-UL transactions while the application interface is busy.
+The Message FIFO drops the request.
+
+The IP reports the error with an info field.
+
+Bits    | Name        | Description
+--------|-------------|-------------
+[23:16] | reserved    | all zero
+[15:8]  | kmac_app_st | KMAC_APP FSM state.
+[7:0]   | mux_sel     | Current APP Mux selection. 0: None, 1: SW, 2: App
+
+#### SwIssuedCmdInAppActive (0x03)
+
+If the SW issues any commands while the application interface is being used, the module reports `SwIssuedCmdInAppActive` error.
+The received command does not affect the Application process.
+The request is dropped by the KMAC_APP module.
+
+The lower 3 bits of [`ERR_CODE`](../data/kmac.hjson#err_code) contain the received command from the SW.
+#### WaitTimerExpired (0x04)
+
+The SW may set the EDN wait timer to exit from EDN request state if the response from EDN takes long.
+If the timer expires, the module cancels the transaction and reports the `WaitTimerExpired` error.
+
+When this error happens, the state machine in KMAC_ENTROPY module moves to Wait state.
+In that state, it keeps using the pre-generated entropy and asserting the entropy valid signal.
+It asserts the entropy valid signal to complete the current hashing operation.
+If the module does not complete, or flush the pending operation, it creates the back pressure to the message FIFO.
+Then, the SW may not be able to access the KMAC IP at all, as the crossbar is stuck.
+
+The SW may move the state machine to the reset state by issuing [`CFG.err_processed`](../data/kmac.hjson#cfg).
+
+#### IncorrectEntropyMode (0x05)
+
+If SW misconfigures the entropy mode and lets the entropy module prepare the random data, the module reports the `IncorrectEntropyMode` error.
+The state machine moves to Wait state after reporting the error.
+
+The SW may move the state machine to the reset state by issuing [`CFG.err_processed`](../data/kmac.hjson#cfg).
+
+#### UnexpectedModeStrength (0x06)
+
+When the SW issues `Start` command, the KMAC_ERRCHK module checks the [`CFG.mode`](../data/kmac.hjson#cfg) and [`CFG.kstrength`](../data/kmac.hjson#cfg).
+The KMAC HWIP assumes the combinations of two to be **SHA3-224**, **SHA3-256**, **SHA3-384**, **SHA3-512**, **SHAKE-128**, **SHAKE-256**, **cSHAKE-128**, and **cSHAKE-256**.
+If the combination of the `mode` and `kstrength` does not fall into above, the module reports the `UnexpectedModeStrength` error.
+
+However, the KMAC HWIP proceeds with the hashing operation, as other combinations do not cause any malfunctions inside the IP.
+The SW may get the incorrect digest value.
+
+#### IncorrectFunctionName (0x07)
+
+If [`CFG.kmac_en`](../data/kmac.hjson#cfg) is set and the SW issues the `Start` command, the KMAC_ERRCHK checks if the [`PREFIX`](../data/kmac.hjson#prefix) has correct function name, `encode_string("KMAC")`.
+If the value does not match to the byte form of `encode_string("KMAC")` (`0x4341_4D4B_2001`), it reports the `IncorrectFunctionName` error.
+
+As with the `UnexpectedModeStrength` error, this error does not block the hashing operation.
+The SW may get the incorrect signature value.
+
+#### SwCmdSequence (0x08)
+
+The KMAC_ERRCHK module checks whether the SW-issued commands follow the guideline.
+If the SW issues a command that is not relevant to the current context, the module reports the `SwCmdSequence` error.
+The lower 3 bits of the [`ERR_CODE`](../data/kmac.hjson#err_code) contain the received command.
+
+This error, however, does not stop the KMAC HWIP.
+The incorrect command is dropped at the following datapath, SHA3 core.
+
+# Programmers Guide
+
+## Initialization
+
+The software can update the KMAC/SHA3 configurations only when the IP is in the idle state.
+The software should check [`STATUS.sha3_idle`](../data/kmac.hjson#status) before updating the configurations.
+The software must first program [`CFG.msg_endianness`](../data/kmac.hjson#cfg) and [`CFG.state_endianness`](../data/kmac.hjson#cfg) at the initialization stage.
+These determine the byte order of incoming messages (msg_endianness) and the Keccak state output (state_endianness).
+
+## Software Initiated KMAC/SHA3 process
+
+This section describes the expected software process to run the KMAC/SHA3 HWIP.
+At first, the software configures [`CFG.kmac_en`](../data/kmac.hjson#cfg) for KMAC operation.
+If KMAC is enabled, the software should configure [`CFG.mode`](../data/kmac.hjson#cfg) to cSHAKE and [`CFG.kstrength`](../data/kmac.hjson#cfg) to 128 or 256 bit security strength.
+The software also updates [`PREFIX`](../data/kmac.hjson#prefix) registers if cSHAKE mode is used.
+Current design does not convert cSHAKE mode to SHAKE even if [`PREFIX`](../data/kmac.hjson#prefix) is empty string.
+It is the software's responsibility to change the [`CFG.mode`](../data/kmac.hjson#cfg) to SHAKE in case of empty [`PREFIX`](../data/kmac.hjson#prefix).
+The KMAC/SHA3 HWIP uses [`PREFIX`](../data/kmac.hjson#prefix) registers as it is.
+It means that the software should update [`PREFIX`](../data/kmac.hjson#prefix) with encoded values.
+
+If [`CFG.kmac_en`](../data/kmac.hjson#cfg) is set, the software should update the secret key.
+The software prepares two shares of the secret key and selects its length in [`KEY_LEN`](../data/kmac.hjson#key_len) then writes the shares of the secret key to [`KEY_SHARE0`](../data/kmac.hjson#key_share0) and [`KEY_SHARE1`](../data/kmac.hjson#key_share1) .
+The two shares of the secret key are the values that represent the secret key value when they are XORed together.
+The software can XOR the unmasked secret key with entropy.
+The XORed value is a share and the entropy used is the other share.
+
+After configuring, the software notifies the KMAC/SHA3 engine to accept incoming messages by issuing Start command into [`CMD`](../data/kmac.hjson#cmd) .
+If Start command is not issued, the incoming message is discarded.
+If KMAC is enabled, the software pushes the `right_encode(output_length)` value at the end of the message.
+For example, if the desired output length is 256 bit, the software writes `0x00020100` to MSG_FIFO.
+
+After the software pushes all messages, it issues Process command to [`CMD`](../data/kmac.hjson#cmd) for SHA3 engine to complete the sponge absorbing process.
+SHA3 hashing engine pads the incoming message as defined in the SHA3 specification.
+
+After the SHA3 engine completes the sponge absorbing step, it generates `kmac_done` interrupt.
+Or the software can poll the [`STATUS.squeeze`](../data/kmac.hjson#status) bit until it becomes 1.
+In this stage, the software may run the Keccak round manually.
+
+If the desired digest length is greater than the Keccak rate, the software issues Run command for the Keccak round logic to run one full round after the software reads the current available Keccak state.
+At this stage, KMAC/SHA3 does not raise an interrupt when the Keccak round completes the software initiated manual run.
+The software should check [`STATUS.squeeze`](../data/kmac.hjson#status) register field for the readiness of [`STATE`](../data/kmac.hjson#state) value.
+
+After the software reads all the digest values, it issues Done command to [`CMD`](../data/kmac.hjson#cmd) register to clear the internal states.
+Done command clears the Keccak state, FSM in SHA3 and KMAC, and a few internal variables.
+Secret key and other software programmed values won't be reset.
+
+
+## Endianness
+
+This KMAC HWIP operates in little-endian.
+Internal SHA3 hashing engine receives in 64-bit granularity.
+The data written to SHA3 is assumed to be little endian.
+
+The software may write/read the data in big-endian order if [`CFG.msg_endianness`](../data/kmac.hjson#cfg) or [`CFG.state_endianness`](../data/kmac.hjson#cfg) is set.
+If the endianness bit is 1, the data is assumed to be big-endian.
+So, the internal logic byte-swaps the data.
+For example, when the software writes `0xDEADBEEF` with endianness as 1, the logic converts it to `0xEFBEADDE` then writes into MSG_FIFO.
+
+The software managed secret key, and the prefix are always little-endian values.
+For example, if the software configures the function name `N` in KMAC operation, it writes `encode_string("KMAC")`.
+The `encode_string("KMAC")` represents `0x01 0x20 0x4b 0x4d 0x41 0x43` in byte order.
+The software writes `0x4d4b2001` into [`PREFIX0`](../data/kmac.hjson#prefix0) and `0x????4341` into [`PREFIX1`](../data/kmac.hjson#prefix1) .
+Upper 2 bytes can vary depending on the customization input string `S`.
+
+## KMAC/SHA3 context switching
+
+This version of KMAC/SHA3 HWIP _does not_ support the software context switching.
+A context switching scheme would allow software to save the current hashing engine state and initiate a new high priority hashing operation.
+It could restore the previous hashing state later and continue the operation.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_kmac.h)
+
+## Registers
+
+* [Register Table](../data/kmac.hjson#registers)
+
+[SHA3 specification, FIPS 202]: https://csrc.nist.gov/publications/detail/fips/202/final
+[NIST SP 800-185]: https://csrc.nist.gov/publications/detail/sp/800-185/final
+[Domain-Oriented Masking (DOM)]: https://eprint.iacr.org/2017/395.pdf
diff --git a/hw/ip/kmac/doc/theory_of_operation.md b/hw/ip/kmac/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..8f418334bd3d7
--- /dev/null
+++ b/hw/ip/kmac/doc/theory_of_operation.md
@@ -0,0 +1,403 @@
+# Theory of Operation
+
+## Block Diagram
+
+![](../doc/kmac-block-diagram.svg)
+
+The above figure shows the KMAC/SHA3 HWIP block diagram.
+The KMAC has register interfaces for SW to configure the module, initiate the hashing process, and acquire the result digest from the STATE memory region.
+It also has an interface to the KeyMgr to get the secret key (masked).
+The IP has N x [application interfaces](#application-interface), which allows other HWIPs to request any pre-defined hashing operations.
+
+Similar to HMAC, the KMAC HWIP also has a message FIFO (MSG_FIFO) whose depth was determined based on a few criteria such as the register interface width and its latency, and the latency of the hashing algorithm (Keccak).
+Based on the given criteria, the MSG_FIFO depth was determined to store the incoming message while the SHA3 core is in computation.
+
+The MSG_FIFO has a packer in front.
+It packs any partial writes into the size of internal datapath (64bit) and stores in MSG_FIFO.
+It frees the software from having to align the messages.
+It also doesn't need the message length information.
+
+The fed messages go into the KMAC core regardless of KMAC enabled or not.
+The KMAC core forwards the messages to SHA3 core in case KMAC hash functionality is disabled.
+KMAC core prepends the encoded secret key as described in the SHA3 Derived Functions specification.
+It is expected that the software writes the encoded output length at the end of the message.
+For hashing operations triggered by an IP through the application interface, the encoded output length is appended inside the AppIntf module in the KMAC HWIP.
+
+The SHA3 core is the main Keccak processing module.
+It supports SHA3 hashing functions, SHAKE128, SHAKE256 extended output functions, and also cSHAKE128, cSHAKE256 functions in order to support KMAC operation.
+To support multiple hashing functions, it has the padding logic inside.
+The padding logic mainly pads the predefined bits at the end of the message and also performs `pad10*1()` function.
+If cSHAKE mode is set, the padding logic also prepends the encoded function name `N` and the customization string `S` prior to the incoming messages according to the spec requirements.
+
+Both the internal state width and the masking of the Keccak core are configurable via compile-time Verilog parameters.
+By default, 1600 bits of internal state are used and stored in two shares (1st order masking).
+The masked Keccak core takes 4 clock cycles per round if sufficient entropy is available.
+If desired, the masking can be disabled and the internal state width can be reduced to 25, 50, or 100 bits at compile time.
+
+## Hardware Interface
+
+* [Interface Tables](../data/kmac.hjson#interfaces)
+
+## Design Details
+
+### Keccak Round
+
+A Keccak round implements the Keccak_f function described in the SHA3 specification.
+Keccak round logic in KMAC/SHA3 HWIP not only supports 1600 bit internal states but also all possible values {50, 100, 200, 400, 800, 1600} based on a parameter `Width`.
+If masking is disabled via compile-time Verilog parameter `EnMasking`, also 25 can be selected as state width.
+Keccak permutations in the specification allow arbitrary number of rounds.
+This module, however, supports Keccak_f which always runs `12 + 2*L` rounds, where $$ L = log_2 {( {Width \over 25} )} $$ .
+For instance, 200 bits of internal state run 18 rounds.
+KMAC/SHA3 instantiates the Keccak round module with 1600 bit.
+
+![](../doc/keccak-round.svg)
+
+Keccak round logic has two phases inside.
+Theta, Rho, Pi functions are executed at the 1st phase.
+Chi and Iota functions run at the 2nd phase.
+If the compile-time Verilog parameter `EnMasking` is not set, i.e., if masking is not enabled, the first phase and the second phase run at the same cycle.
+
+If masking is enabled, the Keccak round logic stores the intermediate state after processing the 1st phase.
+The stored values are then fed into the 2nd phase computing the Chi and Iota functions.
+The Chi function leverages first-order [Domain-Oriented Masking (DOM)][] to make SCA attacks more difficult.
+
+To balance circuit area and SCA hardening, the Chi function uses 800 instead of 1600 DOM multipliers but the multipliers are fully pipelined.
+The Chi and Iota functions are thus separately applied to the two halves of the state and the 2nd phase takes in total three clock cycles to complete.
+In the first clock cycle of the 2nd phase, the first stage of Chi is computed for the first lane halves of the state.
+In the second clock cycle, the new first lane halves are output and written to state register.
+At the same time, the first stage of Chi is computed for the second lane halves.
+In the third clock cycle, the new second lane halves are output and written to the state register.
+
+The 800 DOM multipliers need 800 bits of fresh entropy for remasking.
+If fresh entropy is not available, the DOM multipliers do not move forward and the 2nd phase will take more than three clock cycles.
+Processing a Keccak_f (1600 bit state) takes a total of 96 cycles (24 rounds X 4 cycles/round) including the 1st and 2nd phases.
+
+If the masking compile time option is enabled, Keccak round logic requires an additional 3200 flip flops to store the intermediate half state inside the 800 DOM multipliers.
+In addition to that Keccak round logic needs two sets of the same Theta, Rho, and Pi functions.
+As a result, the masked Keccak round logic takes more than twice as much area as the unmasked version of it.
+
+### Padding for Keccak
+
+Padding logic supports SHA3/SHAKE/cSHAKE algorithms.
+cSHAKE needs the extra inputs for the Function-name `N` and the Customization string `S`.
+Other than that, SHA3, SHAKE, and cSHAKE share similar datapath inside the padding module except the last part added next to the end of the message.
+SHA3 adds `2'b 10`, SHAKE adds `4'b 1111`, cSHAKE adds `2'b00` then `pad10*1()` follows.
+All are little-endian values.
+
+Interface between this padding logic and the MSG_FIFO follows the conventional FIFO interface.
+So `prim_fifo_*` can talk to the padding logic directly.
+This module talks to Keccak round logic with a more memory-like interface.
+The interface has an additional address signal on top of the valid, ready, and data signals.
+
+![](../doc/sha3-padding.svg)
+
+The hashing process begins when the software issues the start command to [`CMD`](../data/kmac.hjson#cmd) .
+If cSHAKE is enabled, the padding logic expands the prefix value (`N || S` above) into a block size.
+The block size is determined by the [`CFG.kstrength`](../data/kmac.hjson#cfg) .
+If the value is 128, the block size will be 168 bytes.
+If it is 256, the block size will be 136 bytes.
+The expanded prefix value is transmitted to the Keccak round logic.
+After sending the block size, the padding logic triggers the Keccak round logic to run a full 24 rounds.
+
+If the mode is not cSHAKE, or cSHAKE mode and the prefix block has been processed, the padding logic accepts the incoming message bitstream and forwards the data to the Keccak round logic in a block granularity.
+The padding logic controls the data flow and makes the Keccak logic run after sending a block size.
+
+After the software writes the message bitstream, it should issue the Process command into [`CMD`](../data/kmac.hjson#cmd) register.
+The padding logic, after receiving the Process command, appends proper ending bits with respect to the [`CFG.mode`](../data/kmac.hjson#cfg) value.
+The logic writes 0 up to the block size to the Keccak round logic then ends with 1 at the end of the block.
+
+![](../doc/sha3-padding-fsm.svg)
+
+After the Keccak round completes the last block, the padding logic asserts an `absorbed` signal to notify the software.
+The signal generates the `kmac_done` interrupt.
+At this point, the software is able to read the digest in [`STATE`](../data/kmac.hjson#state) memory region.
+If the output length is greater than the Keccak block rate in SHAKE and cSHAKE mode, the software may run the Keccak round manually by issuing Run command to [`CMD`](../data/kmac.hjson#cmd) register.
+
+The software completes the operation by issuing Done command after reading the digest.
+The padding logic clears internal variables and goes back to Idle state.
+
+### Padding for KMAC
+
+![](../doc/kmac-padding.svg)
+
+KMAC core prepends and appends additional bitstream on top of Keccak padding logic in SHA3 core.
+The [NIST SP 800-185][] defines `KMAC[128,256](K, X, L, S)` as a cSHAKE function.
+See the section 4.3 in NIST SP 800-185 for details.
+If KMAC is enabled, the software should configure [`CFG.mode`](../data/kmac.hjson#cfg) to cSHAKE and the first six bytes of [`PREFIX`](../data/kmac.hjson#prefix) to `0x01204B4D4143` (big-endian).
+The first six bytes of [`PREFIX`](../data/kmac.hjson#prefix) represent the value of `encode_string("KMAC")`.
+
+The KMAC padding logic prepends a block containing the encoded secret key to the output message.
+The KMAC first sends the block of secret key then accepts the incoming message bitstream.
+At the end of the message, the software writes `right_encode(output_length)` to MSG_FIFO prior to issuing the Process command.
+
+### Message FIFO
+
+The KMAC HWIP has a compile-time configurable depth message FIFO inside.
+The message FIFO receives incoming message bitstream regardless of its byte position in a word.
+Then it packs the partial message bytes into the internal 64 bit data width.
+After packing the data, the logic stores the data into the FIFO until the internal KMAC/SHA3 engine consumes the data.
+
+#### FIFO Depth calculation
+
+The depth of the message FIFO is chosen to cover the throughput of the software or other producers such as DMA engine.
+The size of the message FIFO is enough to hold the incoming data while the SHA3 engine is processing the previous block.
+Details are in `kmac_pkg::MsgFifoDepth` parameter.
+Default design parameters assume the system characteristics as below:
+
+- `kmac_pkg::RegLatency`: The register write takes 5 cycles.
+- `kmac_pkg::Sha3Latency`: Keccak round latency takes 96 cycles, which is the masked version of the Keccak round.
+
+#### FIFO Depth and Empty status
+
+If the SW is slow and the SHA3 engine pops the data fast enough, the Message FIFO's depth may remain **0**.
+The Message FIFO's `fifo_empty` signal, however, is lowered for a cycle.
+This enables the HW to fire the interrupt even if the FIFO remains empty.
+
+However, the recommended approach to write messages is:
+
+1. Check the FIFO depth [`STATUS.fifo_depth`](../data/kmac.hjson#status). This represents the number of entry slots currently occupied in the FIFO.
+2. Calculate the remaining size as `(<max number of fifo entries> - <STATUS.fifo_depth>) * <entry size>`.
+3. Write data to fill the remaining size.
+4. Repeat until all data is written.
+
+In code, this looks something like:
+```c
+/**
+ * Absorb input data into the Keccak computation.
+ *
+ * Assumes that the KMAC block is in the "absorb" state; it is the caller's
+ * responsibility to check before calling.
+ *
+ * @param in Input buffer.
+ * @param in_len Length of input buffer (bytes).
+ * @return Number of bytes written.
+ */
+size_t kmac_absorb(const uint8_t *in, size_t in_len) {
+    // Read FIFO depth from the status register.
+    uint32_t status = abs_mmio_read32(kBase + KMAC_STATUS_REG_OFFSET);
+    uint32_t fifo_depth =
+        bitfield_field32_read(status, KMAC_STATUS_FIFO_DEPTH_FIELD);
+
+    // Calculate the remaining space in the FIFO using auto-generated KMAC
+    // parameters and take the minimum of that space and the input length.
+    size_t free_entries = (KMAC_PARAM_NUM_ENTRIES_MSG_FIFO - fifo_depth);
+    size_t max_len = free_entries * KMAC_PARAM_NUM_BYTES_MSG_FIFO_ENTRY;
+    size_t write_len = (in_len < max_len) ? in_len : max_len;
+
+    // Note: this example uses byte-writes for simplicity, but in practice it
+    // would be more efficient to use word-writes for aligned full words and
+    // byte-writes only as needed at the beginning and end of the input.
+    for (size_t i = 0; i < write_len; i++) {
+      abs_mmio_write8(kBase + KMAC_MSG_FIFO_REG_OFFSET, in[i]);
+    }
+
+    return write_len;
+}
+```
+
+The method recommended above is always safe.
+However, in specific contexts, it may be okay to skip polling `STATUS.fifo_depth`.
+Normally, KMAC will process data faster than software can write it, and back pressure on the FIFO interface should ensure that writes from software will simply block until KMAC can process messages.
+The only reason for polling, then, is to prevent a specific deadlock scenario:
+1. Software has configured KMAC to wait forever for entropy.
+2. There is a problem with the EDN, so entropy is never coming.
+3. The FIFO is full and KMAC is waiting for entropy to process it.
+
+If either the entropy wait timer is nonzero or `kmac_en` is false (so KMAC will not be refreshing entropy), it is safe to write to the FIFO without polling `STATUS.fifo_depth`.
+However, this should be done carefully, and tests should always cover the scenario in which EDN is locked up.
+
+#### Masking
+
+The message FIFO does not generate the masked message data.
+Incoming message bitstream is not sensitive to the leakage.
+If the `EnMasking` parameter is set and [`CFG_SHADOWED.msg_mask`](../data/kmac.hjson#cfg_shadowed) is enabled, the message is masked upon loading into the Keccak core using the internal entropy generator.
+The secret key, however, is stored as masked form always.
+
+If the `EnMasking` parameter is not set, the masking is disabled.
+Then, the software has to provide the key in unmasked form by default.
+Any write operations to [`KEY_SHARE1_0`](../data/kmac.hjson#key_share1_0) - [`KEY_SHARE1_15`](../data/kmac.hjson#key_share1_15) are ignored.
+
+If the `EnMasking` parameter is not set and the `SwKeyMasked` parameter is set, software has to provide the key in masked form.
+Internally, the design then unmasks the key by XORing the two key shares together when loading the key into the engine.
+This is useful when software interface compatibility between the masked and unmasked configuration is desirable.
+
+If the `EnMasking` parameter is set, the `SwKeyMasked` parameter has no effect: Software always provides the key in two shares.
+
+### Keccak State Access
+
+After the Keccak round completes the KMAC/SHA3 operation, the contents of the Keccak state contain the digest value.
+The software can access the 1600 bit of the Keccak state directly through the window of the KMAC/SHA3 register.
+
+If the compile-time parameter masking feature is enabled, the upper 256B of the window is the second share of the Keccak state.
+If not, the upper address space is zero value.
+If the masking feature is enabled, the software reads both of the Keccak state shares and XORs them together to get the unmasked digest value.
+
+The Keccak state is valid after the sponge absorbing process is completed.
+While in an idle state or in the sponge absorbing stage, the value is zero.
+This ensures that the logic does not expose the secret key XORed with the keccak_f results of the prefix to the software.
+In addition to that, the KMAC/SHA3 blocks the software access to the Keccak state when it processes the request from KeyMgr for Key Derivation Function (KDF).
+
+### Application Interface
+
+![](../doc/application-interface.svg)
+
+KMAC/SHA3 HWIP has an option to receive the secret key from the KeyMgr via sideload key interface.
+The software should set [`CFG.sideload`](../data/kmac.hjson#cfg) to use the KeyMgr sideloaded key for the SW-initiated KMAC operation.
+`keymgr_pkg::hw_key_t` defines the structure of the sideloaded key.
+KeyMgr provides the sideloaded key in two-share masked form regardless of the compile-time parameter `EnMasking`.
+If `EnMasking` is not defined, the KMAC merges the shared key to the unmasked form before using the key.
+
+The IP has N number of the application interface. The apps connected to the KMAC IP may initiate the SHA3/cSHAKE/KMAC hashing operation via the application interface `kmac_pkg::app_{req|rsp}_t`.
+The type of the hashing operation is determined in the compile-time parameter `kmac_pkg::AppCfg`.
+
+| Index | App      | Algorithm | Prefix
+|:-----:|:--------:|:---------:|------------
+| 0     | KeyMgr   | KMAC      | CSR prefix
+| 1     | LC_CTRL  | cSHAKE128 | "LC_CTRL"
+| 2     | ROM_CTRL | cSHAKE256 | "ROM_CTRL"
+
+In the current version of IP, the IP has three application interfaces, which are KeyMgr, LC_CTRL, and ROM_CTRL.
+KeyMgr uses the KMAC operation with CSR prefix value.
+LC_CTRL and ROM_CTRL use the cSHAKE operation with the compile-time parameter prefixes.
+
+The app sends 64-bit data (`MsgWidth`) in a beat with the message strobe signal.
+The state machine inside the AppIntf logic starts when it receives the first valid data from any of the AppIntf.
+The AppIntf module chooses the winner based on the fixed priority.
+Then it forwards the selected App to the next stage.
+Because this logic sees the first valid data as an initiator, the Apps cannot run the hashing operation with an empty message.
+After the logic switches to accept the message bitstream from the selected App, if the hashing operation is KMAC, the logic forces the sideloaded key to be used as a secret.
+Also it ignores the command issued from the software.
+Instead it generates the commands and sends them to the KMAC core.
+
+The last beat of the App data moves the state machine to append the encoded output length if the hashing operation is KMAC.
+The output length is the digest width, which is 256 bit always.
+It means that the logic appends `0x020100` (little-endian) to the end of the message.
+The output data from this logic goes to MSG_FIFO.
+Because the MSG_FIFO handles un-aligned data inside, KeyMgr interface logic sends the encoded output length value in a separate beat.
+
+After the encoded output length is pushed to the KMAC core, the interface logic issues a Process command to run the hashing logic.
+
+After hashing operation is completed, KMAC does not raise a `kmac_done` interrupt; rather it triggers the `done` status in the App response channel.
+The result digest always comes in two shares.
+If the `EnMasking` parameter is not set, the second share is always zero.
+
+### Entropy Generator
+
+This section explains the entropy generator inside the KMAC HWIP.
+
+KMAC has an entropy generator to provide the design with pseudo-random numbers while processing the secret key block.
+The entropy is used for both remasking the DOM multipliers inside the Chi function of the Keccak core as well as for masking the message if [`CFG_SHADOWED.msg_mask`](../data/kmac.hjson#cfg_shadowed) is enabled.
+
+![Entropy block](../doc/kmac-entropy.svg)
+
+The entropy generator is made up of 25 32-bit linear feedback shift registers (LFSRs).
+This allows the module to generate 800 bits of fresh, pseudo-random numbers required by the 800 DOM multipliers for remasking in every clock cycle.
+To break linear shift patterns, each LFSR features a non-linear layer.
+In addition an 800-bit wide permutation spanning across all LFSRs is used.
+
+Depending on [`CFG_SHADOWED.entropy_mode`](../data/kmac.hjson#cfg_shadowed), the entropy generator fetches initial entropy from the [Entropy Distribution Network (EDN)][edn] module or software has to provide a seed by writing the [`ENTROPY_SEED_0`](../data/kmac.hjson#entropy_seed_0) - [`ENTROPY_SEED_4`](../data/kmac.hjson#entropy_seed_4) registers in ascending order.
+The module periodically refreshes the LFSR seeds with the new entropy from EDN.
+
+To limit the entropy consumption for reseeding, a cascaded reseeding mechanism is used.
+Per reseeding operation, the entropy generator consumes five times 32 bits of entropy from [EDN][edn], one 32-bit word at a time.
+These five 32-bit words are directly fed into LFSRs 0/5/10/15/20 for reseeding.
+At the same time, the previous states of LFSRs 0/5/10/15/20 from before the reseeding operation are permuted and then forwarded to reseed LFSRs 1/6/11/16/21.
+Similarly, the previous states of LFSRs 1/6/11/16/21 from before the reseeding operation are permuted and then forwarded to reseed LFSRs 2/7/12/17/22.
+Software can still request a complete reseed of all 25 LFSRs from EDN by subsequently triggering five reseeding operations through [`CMD.entropy_req`](../data/kmac.hjson#cmd).
+
+[edn]: ../../../edn/README.md
+
+### Error Report
+
+This section explains the errors KMAC HWIP raises during the hashing operations, their meanings, and the error handling process.
+
+KMAC HWIP has the error checkers in its internal datapath.
+If the checkers detect errors, whether they are triggered by SW misconfiguration or HW malfunctions, they report the error to [`ERR_CODE`](../data/kmac.hjson#err_code) and raise a `kmac_error` interrupt.
+Each error code gives debugging information at the lower 24 bits of [`ERR_CODE`](../data/kmac.hjson#err_code).
+
+Value | Error Code | Description
+------|------------|-------------
+0x01  | KeyNotValid | In KMAC mode with the sideloaded key, the IP raises an error if the sideloaded secret key is not ready.
+0x02  | SwPushedMsgFifo | MsgFifo is updated while not being in the Message Feed state.
+0x03  | SwIssuedCmdInAppActive | SW issued a command while the application interface is being used
+0x04  | WaitTimerExpired | EDN has not responded within the wait timer limit.
+0x05  | IncorrectEntropyMode | When SW sets `entropy_ready`, the `entropy_mode` is neither SW nor EDN.
+0x06  | UnexpectedModeStrength | SHA3 mode and Keccak Strength combination is not expected.
+0x07  | IncorrectFunctionName | In KMAC mode, the PREFIX has the value other than `encode_string("KMAC")`
+0x08  | SwCmdSequence | SW does not follow the guided sequence, `start` -> `process` -> {`run` ->} `done`
+0x09  | SwHashingWithoutEntropyReady | SW requests KMAC op without proper config of Entropy in KMAC. This error occurs if KMAC IP masking feature is enabled.
+0x80  | Sha3Control | SW may receive Sha3Control error along with `SwCmdSequence` error. Can be ignored.
+
+#### KeyNotValid (0x01)
+
+The `KeyNotValid` error is raised in the application interface module.
+When a KMAC application requests a hashing operation, the module checks if the sideloaded key is ready.
+If the key is not ready, the module reports the `KeyNotValid` error, moves to a dead-end state, and waits for the IP reset.
+
+This error does not provide any additional information.
+
+#### SwPushedMsgFifo (0x02)
+
+The `SwPushedMsgFifo` error happens when the Message FIFO receives TL-UL transactions while the application interface is busy.
+The Message FIFO drops the request.
+
+The IP reports the error with an info field.
+
+Bits    | Name        | Description
+--------|-------------|-------------
+[23:16] | reserved    | all zero
+[15:8]  | kmac_app_st | KMAC_APP FSM state.
+[7:0]   | mux_sel     | Current APP Mux selection. 0: None, 1: SW, 2: App
+
+#### SwIssuedCmdInAppActive (0x03)
+
+If the SW issues any commands while the application interface is being used, the module reports `SwIssuedCmdInAppActive` error.
+The received command does not affect the Application process.
+The request is dropped by the KMAC_APP module.
+
+The lower 3 bits of [`ERR_CODE`](../data/kmac.hjson#err_code) contain the command received from the SW.
+#### WaitTimerExpired (0x04)
+
+The SW may set the EDN wait timer to exit from EDN request state if the response from EDN takes long.
+If the timer expires, the module cancels the transaction and reports the `WaitTimerExpired` error.
+
+When this error happens, the state machine in KMAC_ENTROPY module moves to Wait state.
+In that state, it keeps using the pre-generated entropy and asserting the entropy valid signal.
+It asserts the entropy valid signal to complete the current hashing operation.
+If the module does not complete, or flush the pending operation, it creates the back pressure to the message FIFO.
+Then, the SW may not be able to access the KMAC IP at all, as the crossbar is stuck.
+
+The SW may move the state machine to the reset state by issuing [`CFG.err_processed`](../data/kmac.hjson#cfg).
+
+#### IncorrectEntropyMode (0x05)
+
+If SW misconfigures the entropy mode and lets the entropy module prepare the random data, the module reports the `IncorrectEntropyMode` error.
+The state machine moves to Wait state after reporting the error.
+
+The SW may move the state machine to the reset state by issuing [`CFG.err_processed`](../data/kmac.hjson#cfg).
+
+#### UnexpectedModeStrength (0x06)
+
+When the SW issues `Start` command, the KMAC_ERRCHK module checks the [`CFG.mode`](../data/kmac.hjson#cfg) and [`CFG.kstrength`](../data/kmac.hjson#cfg).
+The KMAC HWIP assumes the combinations of two to be **SHA3-224**, **SHA3-256**, **SHA3-384**, **SHA3-512**, **SHAKE-128**, **SHAKE-256**, **cSHAKE-128**, and **cSHAKE-256**.
+If the combination of the `mode` and `kstrength` does not fall into above, the module reports the `UnexpectedModeStrength` error.
+
+However, the KMAC HWIP proceeds with the hashing operation, as other combinations do not cause any malfunctions inside the IP.
+The SW may get the incorrect digest value.
+
+#### IncorrectFunctionName (0x07)
+
+If [`CFG.kmac_en`](../data/kmac.hjson#cfg) is set and the SW issues the `Start` command, the KMAC_ERRCHK checks if the [`PREFIX`](../data/kmac.hjson#prefix) has correct function name, `encode_string("KMAC")`.
+If the value does not match to the byte form of `encode_string("KMAC")` (`0x4341_4D4B_2001`), it reports the `IncorrectFunctionName` error.
+
+As with the `UnexpectedModeStrength` error, this error does not block the hashing operation.
+The SW may get the incorrect signature value.
+
+#### SwCmdSequence (0x08)
+
+The KMAC_ERRCHK module checks whether the SW-issued commands follow the guideline.
+If the SW issues a command that is not relevant to the current context, the module reports the `SwCmdSequence` error.
+The lower 3 bits of the [`ERR_CODE`](../data/kmac.hjson#err_code) contain the received command.
+
+This error, however, does not stop the KMAC HWIP.
+The incorrect command is dropped at the following datapath, SHA3 core.
diff --git a/hw/ip/lc_ctrl/README.md b/hw/ip/lc_ctrl/README.md
index 804538dcef954..53f94b3dac638 100644
--- a/hw/ip/lc_ctrl/README.md
+++ b/hw/ip/lc_ctrl/README.md
@@ -43,644 +43,3 @@ It is difficult to manage this kind of fast escalation in software in our workin
 It is more suitable in hardware.
 - Advancing life cycle state sometimes must be done in the absence of software for a variety of reasons; thus having a small piece of hardware that understands what to do is simpler than placing restrictions on the entire CPU / memory complex.
 - As can be seen from this document, the hardware additions are small and non-complicated.
-
-
-# Theory of Operations
-
-The following sections give an overview of the life cycle function.
-It begins with life cycle sensing at power up, progresses through how life cycle transitions are made, and then focuses specifically on how life cycle impacts various functionality of the design.
-
-## Power Up Sequence
-
-Upon power up, the life cycle controller will default to "RAW" state and wait for the OTP controller to initialize and sense the contents of the [life cycle partition](../otp_ctrl/README.md#logical-partitions).
-When the OTP is ready, the life cycle controller reads the contents of the life cycle partition, decodes the life cycle state and updates its internal state to match.
-This implies that unlike the life cycle definition diagram, there is a one-time "RAW to any state" logical transition that is implicit within the implementation.
-Note during OTP sensing, the life cycle controller does not perform any redundant checks upon the value it reads; instead that responsibility is allocated to the OTP controller.
-
-Once the state values are correctly sensed, the life cycle controller performs checks on state consistency and dependencies, and if correct, broadcasts both the raw state value as well as the decoded functional outputs to the rest of the device.
-
-Once the broadcast is complete and signals stable, modules held under reset by ["sys_rst_n"](../rstmgr/README.md) are then released from reset to begin mission mode operations (this includes the processor).
-Note this point is also when it is safe for DFT to commence operations, as DFT functions may be blocked until life cycle completes its broadcast.
-
-The following diagram illustrates this power-up sequence.
-Note the sequence is not designed into one specific module, but rather a result of coordination between the OTP controller, life cycle controller and the reset / power controllers.
-
-![LC Power Up Sequence](./doc/lc_ctrl_power_up.svg)
-
-## Normal Operation
-
-Once the life cycle system is powered up and stable, its outputs remain static unless specifically requested to change or affected by security escalation.
-The life cycle controller can accept [change requests](#life-cycle-requests) from software as well as external entities.
-
-### Unconditional Transitions
-
-For unconditional transitions, the life cycle controller advances the state by requesting an OTP update to the OTP controller.
-Once the programming is confirmed, the life cycle controller reports a success to the requesting agent and waits for the device to reboot.
-
-### Conditional Transitions
-
-For conditional transitions, such as those that require a token (RAW_UNLOCK, TEST_UNLOCK, TEST_EXIT, RMA_UNLOCK), the life cycle controller advances the state via OTP programming only after it is supplied with the valid token.
-[Some tokens](../../../doc/security/specs/device_life_cycle/README.md#manufacturing-states) are hardcoded design constants, while others are stored in OTP.
-Note that conditional transitions will only be allowed if the OTP partition holding the corresponding token has been provisioned and locked.
-
-Since unlock tokens are considered secret, they are not stored in their raw form.
-Instead, the tokens are wrapped and unwrapped based on a global constant using a [PRESENT-based scrambling mechanism](../otp_ctrl/README.md#secret-vs-non-secret-partitions).
-This ensures that a breach of fuse physical security does not automatically expose all the relevant information without also breaking the constant key.
-
-RAW_UNLOCK is not exposed in the open source design, rather it is something provisioned by the silicon creators prior to tapeout.
-It is the only token among those listed that is a global constant and stored in gates.
-
-All others CAN be device unique and are stored in OTP.
-
-### Transition Counter Limits
-
-For conditional transitions, there is a limit to how many times they can be attempted.
-This is to prevent an attacker from brute-forcing any specific token, as this also helps to reduce the overall required token size.
-
-For OpenTitan, the total amount of state transitions and transition attempts is limited to 24.
-Once this number is reached, the life cycle controller rejects further attempts, effectively locking the device into its current state.
-
-The token counters are maintained in the OTP.
-To ensure the security of token limits cannot be bypassed, each request for a conditional transition **FIRST** increments the token count, and **THEN** checks for the validity of the token.
-
-### Token Hashing Mechanism
-
-All 128bit lock and unlock tokens are passed through a cryptographic one way function in hardware before the life cycle controller compares them to the provisioned values in OTP or to the netlist constant in case of RAW_UNLOCK.
-
-This mechanism is used to guard against reverse engineering and brute-forcing attempts.
-An attacker able to extract the hashed token values from the scrambled OTP partitions or from the netlist would first have to find a hash collision in order to perform a life cycle transition, since the values supplied to the life cycle controller must be valid hash pre-images.
-
-The employed one way function is a 128bit cSHAKE hash with the function name "" and customization string "LC_CTRL", see also [kmac documentation](../kmac/README.md) and [`kmac_pkg.sv`](https://github.com/lowRISC/opentitan/blob/master/hw/ip/kmac/rtl/kmac_pkg.sv#L148-L155).
-
-### Post Transition Handling
-
-After a transition request, whether it was unconditional or conditional, the life cycle controller always disables all of its decoded outputs and puts the system in an inert state.
-The device is then expected to reboot before returning to a functional state.
-
-Note this happens for either successful or unsuccessful transitions.
-This general policy places a time-bound on how quickly life cycle states can change and also forces the device to behave more predictably.
-
-## Security Escalation
-
-The life cycle controller contains two escalation paths that are connected to escalation severities 1 and 2 of the alert handler.
-
-The two escalation paths are redundant, and both trigger the same mechanism.
-Upon assertion of any of the two escalation actions, the life cycle state is **TEMPORARILY** altered.
-I.e. when this escalation path is triggered, the life cycle state is transitioned into "ESCALATE", which behaves like a virtual "SCRAP" state (i.e. this state is not programmed into OTP).
-This causes [all decoded outputs](#life-cycle-decoded-outputs-and-controls) to be disabled until the next power cycle.
-In addition to that, the life cycle controller asserts the ESCALATE_EN life cycle signal which is distributed to all IPs in the design that expose an escalation action (like moving FSMs into terminal error states or clearing sensitive registers).
-
-Whether to escalate to the life cycle controller or not is a software decision, please see the alert handler for more details.
-
-## Life Cycle Decoded Outputs and Controls
-
-The core function of life cycle is how various functions of the design are modulated by what state the design is in.
-[This section](../../../doc/security/specs/device_life_cycle/README.md#manufacturing-states) in the life cycle architecture documentation summarizes the overall behavior.
-
-The signals have been split into two summary tables in the sections below.
-The first table contains all control signals that enable certain functionality in the system, whereas the second table contains all signals that change access to certain elements in the flash and OTP memories.
-
-All life cycle control signals are 4-bits, with only `4'b1010` as a valid enable value, and all others meaning "disable".
-A `"Y"` mark means the function is directly enabled by hardware during that
-state.
-A `"grey"` box means a particular function is not available during that
-state.
-The states in <span style="color:red">RED</span> are volatile, temporary states.
-They exist only after specific events, and are restored to normal once the device is power cycled.
-
-### Life Cycle Function Control Signals
-
-The individual signals summarized in the table below are described in the following subsections.
-
-{{#include doc/lc_ctrl_function_signals_table.md}}
-
-Signals marked with an asterisk (Y\*) are only asserted under certain conditions as explained in detail below.
-
-#### DFT_EN
-
-As its name implies, this signal enables DFT functions.
-This is accomplished primarily by providing functional isolation on the SOC inserted DFT TAP module and any other memory macros that are built natively with a DFT function (for example flash and OTP).
-
-The isolation ensures three things:
-- The TAP controller is unable to issue instructions that would put the design into scan mode.
-This ensures that secrets cannot be scanned out, and specific values cannot be scanned into the design to emulate a particular functional mode
-- The TAP controller is unable to issue any kind of self test that would disrupt and scramble live logic which could lead to unpredictable behavior
-- The TAP controller or test function is unable to alter the non-volatile contents of flash or OTP
-
-See [TAP isolation](#tap-and-isolation) for more implementation details.
-
-#### NVM_DEBUG_EN
-
-NVM modules like flash implement debug access that bypasses memory protection or lock-down.
-This feature may be there for a variety of reasons, but primarily it can be used to debug the normal behavior of the controller.
-
-This type of functionality, if it exists, must be disabled during specific life cycle states.
-Since these back-door functions may bypass memory protection, they could be used to read out provisioned secrets that are not meant to be visible to software or a debug host.
-
-Note that NVM_DEBUG_EN is disabled in the last test unlocked state (TEST_UNLOCKED7) such that the isolated flash partition can be be securely populated, without exposing its contents via the NVM backdoor interface.
-See also accessibility description of the [isolated flash partition](#iso_part_sw_rd_en-and-iso_part_sw_wr_en).
-
-#### HW_DEBUG_EN
-
-HW_DEBUG_EN refers to the general ungating of both invasive (JTAG control of the processor, bidirectional analog test points) and non-invasive debug (debug bus observation, and register access error returns).
-
-This signal thus needs to be routed to all security-aware and debug capable peripherals.
-This signal is used to determine whether OpenTitan peripheral register interfaces should [silently error](../../../util/reggen/README.md#error_responses" >}}).
-If HW_DEBUG_EN is set to ON, normal errors should be returned.
-If HW_DEBUG_EN is set to OFF, errors should return silently.
-
-Similar to DFT_EN, HW_DEBUG_EN is also used to isolate the processor TAP.
-When HW_DEBUG_EN is OFF, the TAP should not be able to perform its normal debug access, thus preventing an external entity from hijacking the processor.
-
-#### CPU_EN
-
-CPU_EN controls whether code execution is allowed.
-This is implemented as part of the processor's reset controls.
-In OpenTitan's [reset topology](../rstmgr/README.md), it is not possible to reset only the processor by itself, so this reset control extends to a large population of the OpenTitan peripherals.
-
-This ensures that during specific states (RAW, TEST_LOCKED, SCRAP, INVALID) it is not possible for the processor to execute code that breaks the device out of a non-functional state.
-
-In conjunction with DFT_EN / HW_DEBUG_EN, this acts as the final layer in life cycle defense in depth.
-
-#### KEY_MANAGER_EN {#key-manager-en}
-
-The KEY_MANAGER_EN signal allows the key manager to function normally.
-When this signal is logically disabled, any existing key manager collateral is uninstantiated and wiped; further instantiation and generation calls for the key manager are also made unavailable.
-
-The KEY_MANAGER_EN signal is active only during DEV / PROD / PROD_END / RMA.
-
-#### ESCALATE_EN
-
-The ESCALATE_EN signal is available in all life cycle states and is asserted if for any reason the alert subsystem decides to move the life cycle state into the ESCALATION state.
-This signal is also unconditionally asserted in all INVALID and SCRAP states (including virtual SCRAP states).
-
-#### CHECK_BYP_EN
-
-The CHECK_BYP_EN signal is used to disable the [background consistency checks](../otp_ctrl/README.md#partition-checks) of the life cycle OTP partition during life cycle transitions to prevent spurious consistency check failures (the OTP contents and the buffer registers can get out of sync during state transitions).
-The CHECK_BYP_EN signal is only asserted when a transition command is issued.
-
-#### CLK_BYP_REQ
-
-If the life cycle state is in RAW, TEST* or RMA, and if [`TRANSITION_CTRL.EXT_CLOCK_EN`](data/lc_ctrl.hjson#transition_ctrl) is set to one, the CLK_BYP_REQ signal is asserted in order to switch the main system clock to an external clock signal.
-This functionality is needed in certain life cycle states where the internal clock source may not be fully calibrated yet, since the OTP macro requires a stable clock frequency in order to reliably program the fuse array.
-Note that the [`TRANSITION_CTRL.EXT_CLOCK_EN`](data/lc_ctrl.hjson#transition_ctrl) register can only be set to one if the transition interface has been claimed via the [`CLAIM_TRANSITION_IF`](data/lc_ctrl.hjson#claim_transition_if) mutex.
-This function is not available in production life cycle states.
-
-For details on the clock switch, please see [clkmgr](../clkmgr/README.md#life-cycle-requested-external-clock).
-
-
-### Life Cycle Access Control Signals
-
-The individual signals summarized in the table below are described in the following subsections.
-
-{{#include doc/lc_ctrl_access_signals_table.md}}
-
-Signals marked with an asterisk (Y\*) are only asserted under certain conditions as explained in detail below.
-
-#### CREATOR_SEED_SW_RW_EN and OWNER_SEED_SW_RW_EN
-
-These signals control whether the non-volatile provisioning of life cycle related collateral can be accessed.
-The signals can only be active during DEV / PROD / PROD_END / RMA.
-During other states, it is not possible to either read or modify the collateral.
-This specifically limits the danger of rogue software images during any TEST_UNLOCKED state.
-However, as these signals only gate functional access and not DFT access, it is still possible for a malicious agent to bypass this protection by abusing scan shift/capture mechanics.
-
-While the OWNER_SEED_SW_RW_EN is statically enabled in the states shown above, the CREATOR_SEED_SW_RW_EN is only enabled if the device has not yet been personalized (i.e., the OTP partition holding the root key has not been locked down yet).
-
-For more a list of the collateral in Flash and OTP and an explanation of how that collateral is affected by these signals, see the [OTP collateral](#otp-collateral) and [flash collateral](#flash-collateral) sections.
-
-#### SEED_HW_RD_EN
-
-The SEED_HW_RD_EN signal controls whether the owner and creator root keys can be accessed by hardware.
-This signal is dependent on the personalization state of the device and will only be enabled if the device has been personalized (i.e., when the OTP partition holding the root key has been locked down).
-
-#### ISO_PART_SW_RD_EN and ISO_PART_SW_WR_EN
-
-These signals control whether the isolated flash partition holding additional manufacturing details can be accessed.
-The isolated partition is both read and writable during the PROD / PROD_END / RMA states.
-In all other states it is inaccessible, except during the TEST_UNLOCKED* states where the partition is write-only.
-This construction allows to write a value to that partition and keep it secret before advancing into any of the production states.
-
-
-## OTP Collateral
-
-The following is a list of all life cycle related collateral stored in OTP.
-Most collateral also contain associated metadata to indicate when the collateral is restricted from further software access, see [accessibility summary](#otp-accessibility-summary-and-impact-of-provision_en) for more details.
-Since not all collateral is consumed by the life cycle controller, the consuming agent is also shown.
-
-{{#include doc/lc_ctrl_otp_collateral.md}}
-
-The TOKENs and KEYS are considered secret data and are stored in [wrapped format](#conditional-transitions).
-Before use, the secrets are unwrapped.
-
-The SECRET0_DIGEST and SECRET2_DIGEST are the digest values computed over the secret partitions in OTP holding the tokens and root keys.
-As described in more detail in the [OTP controller specification](../otp_ctrl/README.md#direct-access-memory-map), these digests have a non-zero value once the partition has been provisioned and write/read access has been locked.
-
-### ID State of the Device
-
-If the SECRET2_DIGEST is zero, the device is considered to have "blank" ID state, in which case the CREATOR_ROOT_KEY_* (in OTP) and CREATOR_DIV_KEY (in FLASH) can be written by software.
-All consumers of these keys are supplied with an invalid value.
-
-If the SECRET2_DIGEST has a nonzero value, the device is considered "creator personalized", and the CREATOR_ROOT_KEY and CREATOR_DIV_KEY are no longer accessible to software.
-Actual values are supplied to the consumers.
-If SECRET2_DIGEST has a nonzero value, the CREATOR_SEED_SW_RW_EN signal will be disabled in PROD, PROD_END and DEV states.
-
-### Secret Collateral
-
-Among the OTP life cycle collateral, the following are considered secrets (note there may be other secrets unrelated to life cycle, please see [OTP controller specification](../otp_ctrl/README.md#partition-listing-and-description) for more details):
-
-- *_TOKEN
-- CREATOR_ROOT_KEY*
-
-Specifically this means after OTP sensing, the above entries are unwrapped to obtain the real value.
-Similarly, during programming, they are wrapped before beginning to be written to OTP.
-
-The function used for this wrapping is the lightweight PRESENT-cipher.
-The wrapping is a one time event during controlled manufacturing, and unwrapping also cannot be supplied with arbitrary ciphertexts.
-Thus the system cannot be abused to generate a large number of traces for informational leakage, and thus a fully hardened cipher (such as masked AES) is not required.
-
-Note also, a global key is used here because there is no other non-volatile location to store a secret key.
-If PUFs were available (either in memory form or fused form), it could become an appealing alternative to hold a device unique fuse key.
-
-See the [OTP controller](../otp_ctrl/README.md#secret-vs-non-secret-partitions) for more details.
-
-### OTP Accessibility Summary and Impact of Life Cycle Signals
-
-A subset of secret collateral is further access-controlled by the life cycle CREATOR_SEED_SW_RW_EN signal.
-These are
-
-- RMA_UNLOCK_TOKEN
-- CREATOR_ROOT_KEY
-
-The table below summarizes the software accessibility of all life cycle collateral.
-
-{{#include doc/lc_ctrl_otp_accessibility.md}}
-
-Note that CREATOR_SEED_SW_RW_EN is set to OFF if SECRET2_DIGEST has a nonzero value in PROD, PROD_END and DEV states.
-SEED_HW_RD_EN only becomes active if SECRET2_DIGEST has a nonzero value in DEV, PROD, PROD_END and RMA states.
-
-## Flash Collateral
-
-The flash contains both memory mapped and non-memory mapped partitions.
-As it pertains to life cycle, the flash contains two sets of important collateral.
-They are enumerated in the table below.
-Just as with OTP, the consumer and usage of each is also described.
-
-{{#include doc/lc_ctrl_flash_collateral.md}}
-
-Each collateral belongs to a separate flash partition, the table below enumerates the partition and whether the partition is memory mapped.
-
-{{#include doc/lc_ctrl_flash_partitions.md}}
-
-The general flash partition refers to any software managed storage in flash, and is not a specific carve out in the non-memory mapped area.
-
-### Flash Accessibility Summary and Impact of Life Cycle Signals
-
-The creator software is trusted to manage the owner partition (OWNER_DATA).
-As such, OWNER_DATA remains accessible during DEV / PROD / PROD_END / RMA states, irrespective of the device personalization state.
-It is expected that ROM_ext during secure boot programs the protection correctly such that downstream software has appropriate permissions.
-
-The CREATOR_DATA partitions however, are further qualified based on the personalization state of the device.
-Just as with OTP, the table below enumerates accessibility of flash collateral.
-
-{{#include doc/lc_ctrl_flash_accessibility.md}}
-
-Note that CREATOR_SEED_SW_RW_EN is set to OFF if SECRET2_DIGEST has a nonzero value in PROD, PROD_END and DEV states.
-SEED_HW_RD_EN only becomes active if SECRET2_DIGEST has a nonzero value in DEV, PROD, PROD_END and RMA states.
-OWNER_SEED_SW_RW_EN is always enabled during DEV, PROD, PROD_END and RMA states.
-
-See also [Device Life Cycle Architecture](../../../doc/security/specs/device_life_cycle/README.md) for more information on creator/owner isolation.
-
-
-## Hardware Interfaces
-
-### Parameters
-
-Note that parameters prefixed with `RndCnst` are random netlist constants that need to be regenerated via topgen before the tapeout (typically by the silicon creator).
-
-Parameter                      | Default (Max)         | Top Earlgrey   | Description
--------------------------------|-----------------------|----------------|---------------
-`AlertAsyncOn`                 | 2'b11                 | 2'b11          |
-`IdcodeValue`                  | `32'h00000001`        | `32'h00000001` | Idcode for the LC JTAG TAP.
-`RndCnstLcKeymgrDivInvalid`    | (see RTL)             | (see RTL)      | Life cycle state group diversification value for keymgr.
-`RndCnstLcKeymgrDivTestDevRma` | (see RTL)             | (see RTL)      | Life cycle state group diversification value for keymgr.
-`RndCnstLcKeymgrDivProduction` | (see RTL)             | (see RTL)      | Life cycle state group diversification value for keymgr.
-
-### Signals
-
-* [Interface Tables](data/lc_ctrl.hjson#interfaces)
-
-Signal                       | Direction        | Type                                     | Description
------------------------------|------------------|------------------------------------------|---------------
-`jtag_i`                     | `input`          | `jtag_pkg::jtag_req_t`                   | JTAG input signals for life cycle TAP.
-`jtag_o`                     | `output`         | `jtag_pkg::jtag_rsp_t`                   | JTAG output signals for life cycle TAP.
-`esc_scrap_state0_tx_i`      | `input`          | `prim_esc_pkg::esc_tx_t`                 | Escalation input from alert handler. Moves the life cycle state into an invalid state upon assertion.
-`esc_scrap_state0_rx_o`      | `output`         | `prim_esc_pkg::esc_rx_t`                 | Escalation feedback to alert handler
-`esc_scrap_state1_tx_i`      | `input`          | `prim_esc_pkg::esc_tx_t`                 | Escalation input from alert handler. Moves the life cycle state into an invalid state upon assertion.
-`esc_scrap_state1_rx_o`      | `output`         | `prim_esc_pkg::esc_rx_t`                 | Escalation feedback to alert handler
-`pwr_lc_i`                   | `input`          | `pwrmgr::pwr_lc_req_t`                   | Initialization request coming from power manager.
-`pwr_lc_o`                   | `output`         | `pwrmgr::pwr_lc_rsp_t`                   | Initialization response and programming idle state going to power manager.
-`lc_otp_program_o`           | `output`         | `otp_ctrl_pkg::lc_otp_program_req_t`     | Life cycle state transition request.
-`lc_otp_program_i`           | `input`          | `otp_ctrl_pkg::lc_otp_program_rsp_t`     | Life cycle state transition response.
-`kmac_data_o`                | `output`         | `kmac_pkg::app_req_t`                    | Life cycle RAW token hashing request.
-`kmac_data_i`                | `input`          | `kmac_pkg::app_rsp_t`                    | Life cycle RAW token hashing response.
-`otp_lc_data_i`              | `input`          | `otp_ctrl_pkg::otp_lc_data_t`            | Life cycle state output holding the current life cycle state, the value of the transition counter and the tokens needed for life cycle transitions.
-`lc_keymgr_div_o`            | `output`         | `lc_keymgr_div_t`                        | Life cycle state group diversification value.
-`lc_flash_rma_seed_o`        | `output`         | `lc_flash_rma_seed_t`                    | Seed for flash RMA.
-`otp_device_id_i`            | `input`          | `otp_device_id_t`                        | HW_CFG bits from OTP ([`DEVICE_ID_0`](data/lc_ctrl.hjson#device_id_0)).
-`otp_manuf_state_i`          | `input`          | `otp_manuf_state_t`                      | HW_CFG bits from OTP ([`MANUF_STATE_0`](data/lc_ctrl.hjson#manuf_state_0)).
-`lc_otp_vendor_test_o`       | `output`         | `otp_ctrl_pkg::lc_otp_vendor_test_req_t` | Vendor-specific test bits to OTP ([`OTP_VENDOR_TEST_CTRL`](data/lc_ctrl.hjson#otp_vendor_test_ctrl)).
-`lc_otp_vendor_test_i`       | `input`          | `otp_ctrl_pkg::lc_otp_vendor_test_rsp_t` | Vendor-specific test bits to OTP ([`OTP_VENDOR_TEST_STATUS`](data/lc_ctrl.hjson#otp_vendor_test_status)).
-`lc_dft_en_o`                | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_nvm_debug_en_o`          | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_hw_debug_en_o`           | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_cpu_en_o`                | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_creator_seed_sw_rw_en_o` | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_owner_seed_sw_rw_en_o`   | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_iso_part_sw_rd_en_o`     | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_iso_part_sw_wr_en_o`     | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_seed_hw_rd_en_o`         | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_keymgr_en_o`             | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_escalate_en_o`           | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_check_byp_en_o`          | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_clk_byp_req_o`           | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_clk_byp_ack_i`           | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_flash_rma_req_o`         | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-`lc_flash_rma_ack_i`         | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
-
-#### Power Manager Interface
-
-The power manager interface is comprised of three signals overall: an initialization request (`pwr_lc_i.lc_init`), an initialization done response (`pwr_lc_o.lc_done`) and an idle indicator (`pwr_lc_o.lc_idle`).
-
-The power manager asserts `pwr_lc_i.lc_init` in order to signal to the life cycle controller that it can start initialization, and the life cycle controller signals completion of the initialization sequence by asserting `pwr_lc_o.lc_done` (the signal will remain high until reset).
-
-The idle indication signal `pwr_lc_o.lc_idle` indicates that the life cycle controller is idle.
-If this bit is 0, the life cycle controller is either not initialized or in the middle of carrying out a life cycle state transition.
-The power manager uses that indication to determine whether a power down request needs to be aborted.
-
-Since the power manager may run in a different clock domain, the `pwr_lc_i.lc_init` signal is synchronized within the life cycle controller.
-The power manager is responsible for synchronizing the `pwr_lc_o.lc_done` and `pwr_lc_o.lc_idle` signals.
-
-See also [power manager documentation](../pwrmgr/README.md).
-
-#### OTP Interfaces
-
-All interfaces to and from OTP are explained in detail in the [OTP Specification Document](../otp_ctrl/README.md#life-cycle-interfaces).
-
-#### KMAC Interface
-
-The life cycle controller interfaces with KMAC through a [side load interface](../kmac/README.md#keymgr-interface) in the same way as the key manager.
-Since the KMAC and life cycle controller are in different clock domains, the KMAC interface signals are synchronized to the life cycle clock inside the life cycle controller.
-
-#### Control Signal Propagation
-
-For better security, all the [life cycle control signals](#life-cycle-decoded-outputs-and-controls) are broadcast in multi-bit form.
-The active ON state for every signal is broadcast as `4'b1010`, while the inactive OFF state is encoded as `4'b0101`.
-For all life cycle signals except the escalation signal ESCALATE_EN, all values different from ON must be interpreted as OFF in RTL.
-In case of ESCALATE_EN, all values different from OFF must be interpreted as ON in RTL.
-
-Since many signals cross clock boundaries, their synchronization needs to be taken into account.
-However, since the ON / OFF encoding above has been chosen such that **all bits toggle exactly once** for a transition from OFF to ON (and vice-versa), all that needs to be done is guard against metastability using a two-stage synchronizer, as illustrated below.
-
-![Multibit Sync](./doc/lc_ctrl_multibit_sync.svg)
-
-In other words, since each bit in the encoding flips exactly once upon an OFF -> ON or ON -> OFF transition, we can guarantee that there are no transient patterns toggling back and forth between enabling and disabling a function.
-Note that even though synchronization can be achieved with a simple two-stage synchronizer, designs **must** use the `prim_lc_sync` primitive.
-This primitive has additional LC-specific assertions and provides a parametric amount of separately buffered copies of the life cycle signal to prevent logic optimization by the synthesis tool (buffers have a 'size_only' constraint in synthesis).
-For all signals except ESCALATE_EN, it is recommended to structure the design such that at least two separately buffered copies of the life cycle signals have to be consumed in order to unlock a certain function.
-
-#### Key Manager Interface
-
-The `lc_keymgr_div_o` signal is a 128bit diversification constant that is output to the key manager once the life cycle controller has initialized, and is asserted at the same time as `lc_keymgr_en_o`.
-Depending on which group the life cycle state is in, this signal is assigned a different random netlist constant as defined in the table below.
-
-Life Cycle State Group     | Assigned Diversification Constant
----------------------------|----------------------------------
-TEST_UNLOCKED\*, DEV, RMA  | `LcKeymgrDivTestDevRma`
-PROD, PROD_END             | `LcKeymgrDivProduction`
-All Other States           | `LcKeymgrDivInvalid`
-
-Note that this signal is quasistatic.
-It is hence recommended to place a max-delay constraint on it and leverage the synchronized version of `lc_keymgr_en_o` to enable any downstream register in different clock domains than the life cycle controller.
-
-
-## Design Details
-
-
-### Block Diagram
-
-Conceptually speaking, the life cycle controller consists of a large  FSM that is further subdivided into logical modules for maintainability, as illustrated below. All blue blocks in the block diagram are purely combinational and do not contain any registers.
-
-![LC Controller Block Diagram](./doc/lc_ctrl_blockdiag.svg)
-
-The main FSM implements a linear state sequence that always moves in one direction for increased glitch resistance.
-I.e., it never returns to the initialization and broadcast states as described in the [life cycle state controller section](#main-fsm).
-
-The main FSM state is redundantly encoded, and augmented with the life cycle state.
-That augmented state vector is consumed by three combinational submodules:
-- `lc_ctrl_state_decode`: This submodule decodes the redundantly encoded life cycle state, checks that there are no encoding errors and enforces state dependencies as required by the definition. The decoded state is forwarded to the CSRs for SW consumption.
-- `lc_ctrl_transition`: This submodule checks whether the transition target state specified via the CSRs is valid, and computes the redundantly encoded state vector of the transition target state.
-- `lc_ctrl_signal_decode`: This submodule is an output function only and derives the life cycle control signals (colored in blue) from the augmented state vector.
-
-Note that the two additional life cycle control signals `lc_flash_rma_req_o` and `lc_clk_byp_req_o` are output by the main FSM, since they cannot be derived from the life cycle state alone and are reactive in nature in the sense that there is a corresponding acknowledgement signal.
-
-The life cycle controller contains a JTAG TAP that can be used to access the same CSR space that is accessible via TL-UL.
-In order to write to the CSRs, a [hardware mutex](#hardware-mutex) has to be claimed.
-
-The life cycle controller also contains two escalation receivers that are connected to escalation severity 1 and 2 of the alert handler module.
-The actions that are triggered by these escalation receivers are explained in the [escalation handling section](#escalation-handling) below.
-
-### System Integration and TAP Isolation
-
-The figure below provides more context about how the life cycle controller is integrated into the system, and how its control signals interact with various components.
-
-![LC Controller Block Diagram](./doc/lc_ctrl_system_view.svg)
-
-Although technically a life cycle feature, the sampling of the strap pins and JTAG / TAP isolation is performed in the pinmux after the life cycle controller has initialized.
-See [pinmux documentation](../pinmux/README.md#strap-sampling-and-tap-isolation) and the detailed selection listed in [Life Cycle Definition Table](../../../doc/security/specs/device_life_cycle/README.md#manufacturing-states).
-
-### Life Cycle Manufacturing State Encodings
-
-The encoding of the life-cycle state is used both for OTP storage and as part of the FSM state in the life cycle controller.
-In other words the state stored within OTP is not re-encoded before it is consumed as part of the life cycle controller FSM state.
-
-{{#include doc/lc_ctrl_encoding_table.md}}
-
-Any decoding that does not fall into the table above is considered **INVALID**.
-
-Each word in the table above maps to an ECC protected 16bit OTP word (i.e., 16bit + 6bit ECC).
-Further, each Ax/Bx word used in the LC state is a unique, random netlist constant generated by the silicon creator prior to tapeout based on a custom seed and the employed ECC polynomial.
-The values Bx are constructed such that {Bx,ECC(Bx)} can be incrementally written over {Ax,ECC(Ax)} without producing any ECC errors.
-
-The purpose of this encoding is to ensure the following
-
-- It is difficult to jump from PROD / PROD_END / SCRAP into DEV
-- It is difficult to jump from DEV / PROD / PROD_END / SCRAP into TEST*
-- It is difficult to jump from DEV / PROD / PROD_END / SCRAP into RMA
-
-Further, the encoding has been chosen to minimize the probability of successful glitch attacks attempting to alter the value of bits in the life cycle state.
-In particular, this encoding guards against attacks that manipulate the OTP to output all-zeros, or attacks that manipulate the OTP to read from other address locations within OTP to inject specific values.
-
-Note that the RAW state is guarded by the RAW_UNLOCK process, which involves supplying a 128bit UNLOCK_TOKEN and performing a full system reset in case the token was correct. Hence moving the state into RAW does not provide any advantage to an attacker.
-
-The encoded life cycle state is not readable by SW in any way through the OTP or life cycle interfaces.
-However a decoded version of the manufacturing life cycle is exposed in the [`LC_STATE`](data/lc_ctrl.hjson#lc_state) register.
-
-### Life Cycle Readout Consistency Checks in OTP
-
-In order to guard against glitch attacks during OTP sense and readout, the OTP controller makes sure to read out the life cycle partition before releasing the state to the life cycle controller.
-I.e., the OTP controller senses and buffers the life cycle in registers in a first readout pass.
-Then, as part of the [consistency check mechanism](../otp_ctrl/README.md#storage-consistency), the OTP controller performs a second and third readout pass to verify whether the buffered life cycle state indeed corresponds to the values stored in OTP.
-The second readout pass uses a linearly increasing address sequence, whereas the third readout pass uses a linearly decreasing address sequence (i.e., reads in reverse order).
-
-### Transition Counter Encoding
-
-The life cycle transition counter has 24 strokes where each stroke maps to one 16bit OTP word.
-The strokes are similarly encoded as the life cycle state in the sense that upon the first transition attempt, all words are initialized with unique Cx values that can later be overwritten with unique Dx values without producing an ECC error.
-
-{{#include doc/lc_ctrl_counter_table.md}}
-
-Upon each life cycle transition attempt, the life cycle controller **FIRST** increments the transition counter before initiating any token hashing and comparison operations.
-
-A decoded version of this counter is exposed in the [`LC_TRANSITION_CNT`](data/lc_ctrl.hjson#lc_transition_cnt) register.
-
-### Life Cycle State Controller
-
-The life cycle state controller is the main entity that handles life cycle requests, escalation events and transactions with the OTP and flash controllers.
-The state diagram for the controller FSM is shown below.
-
-![LC Controller FSM](./doc/lc_ctrl_fsm.svg)
-
-Once the FSM has initialized upon request from the power manager, it moves into `IdleSt`, which is the state where all life cycle control signals are broadcast.
-The life cycle controller stays in `IdleSt` unless a life cycle state request is initiated via the CSRs.
-
-In that case, the life cycle controller first increments the redundantly encoded life cycle transition counter in `CntIncrSt` and `CntProgSt` in order to fend against brute force attacks.
-Then, the transition is checked for validity in `TransCheckSt` and the token hashing operation is initiated in `TokenHashSt`.
-A first token comparison is performed when the hashed token returns in `TokenHashSt`, followed by two more comparisons in `TokenCheck0St` and `TokenCheck1St`.
-The difference among these three comparisons is that the first comparison is done using the hashed token input directly, whereas the second and the third comparison use a registered version of the hashed token.
-If all token checks are successful, the next life cycle state vector is computed and programmed in `TransProgSt`.
-
-Note that an initiated life cycle transition request always ends in `PostTransSt`, no matter whether the transition is successful or not.
-
-#### Escalation Handling
-
-The life cycle controller contains two escalation channels that are connected to the alert handler.
-
-When the first channel `esc_wipe_secrets` is asserted, the life cycle controller permanently asserts the `lc_escalate_en` life cycle signal.
-That signal is routed to various security modules in OpenTitan and triggers local wiping and invalidation features.
-Note that this first escalation action does not affect the life cycle state.
-
-When the second channel `esc_scrap_state` is asserted, the life cycle controller moves the life cycle state into `EscalateSt`, which behaves like a "virtual" SCRAP life cycle state.
-This transition is not permanent, and will clear upon the next power cycle.
-Note that any scrap state (virtual or encoded in the life cycle state vector) will also cause the `lc_escalate_en` life cycle signal to be asserted.
-
-#### FSM Glitch Countermeasures
-
-The FSM has been designed to have a linear control flow that always moves in the same direction, and that always ends in a terminal state after initiating a transition request in order to make glitch attacks harder.
-A sparse FSM state encoding is employed, where each state is encoded as a 16bit word with a minimum Hamming distance of 5 w.r.t. any other state.
-The FSM state and the life cycle state vector are concurrently monitored, and if an erroneous encoding is detected, the life cycle FSM is immediately moved into the terminal `InvalidSt`, and a `fatal_state_error` alert is asserted.
-
-#### Life Cycle Request Interface
-
-Life cycle requests are the explicit requests made to change life cycle states.
-The controller allows requests to come from either the TAP or the software interface.
-The interface is common between the two and is maintained as a CSR interface.
-To arbitrate between the two, a hardware mutex needs to be obtained before either side can proceed.
-The hardware mutex internally acts as a mux to block off the unselected path and all accesses to the request interface are blocked until it is claimed.
-If two requests arrive simultaneously, the TAP interface is given priority.
-
-The request interface consists of 7 registers:
-
-1. [`TRANSITION_CTRL`](data/lc_ctrl.hjson#transition_ctrl): Control register for the transition, can be used to switch to an external clock.
-2. [`TRANSITION_TARGET`](data/lc_ctrl.hjson#transition_target): Specifies the target state to which the agent wants to transition.
-3. [`TRANSITION_TOKEN_*`](data/lc_ctrl.hjson#transition_token_): Any necessary token for conditional transitions.
-4. [`TRANSITION_CMD`](data/lc_ctrl.hjson#transition_cmd): Start the life cycle transition.
-5. [`STATUS`](data/lc_ctrl.hjson#status): Indicates whether the requested transition succeeded.
-6. [`OTP_VENDOR_TEST_CTRL`](data/lc_ctrl.hjson#otp_vendor_test_ctrl): See [Macro-specific test control bits](#vendor-specific-test-control-register).
-7. [`OTP_VENDOR_TEST_STATUS`](data/lc_ctrl.hjson#otp_vendor_test_status): See [Macro-specific test control bits](#vendor-specific-test-control-register).
-
-If the transition fails, the cause will be reported in this register as well.
-
-See diagram below.
-
-![LC Request Interface](./doc/lc_ctrl_request_interface.svg)
-
-In order to claim the hardware mutex, the value kMuBi8True must be written to the claim register ([`CLAIM_TRANSITION_IF`](data/lc_ctrl.hjson#claim_transition_if)).
-If the register reads back as kMuBi8True, then the mutex is claimed, and the interface that won arbitration can continue operations.
-If the value is not read back, then the requesting interface should wait and try again later.
-Note that all transition registers (with the exception of the [`STATUS`](data/lc_ctrl.hjson#status) register) read back all-zero if the mutex is not claimed.
-
-When an agent is done with the mutex, it releases the mutex by explicitly writing a 0 to the claim register.
-This resets the mux to select no one and also holds the request interface in reset.
-
-#### Vendor-specific Test Control Register
-
-Certain OTP macros require special configuration bits to be set during the test phases.
-Likewise, it is necessary to expose macro-specific status bits during the test phases.
-To this end, the life cycle CSRs contain the [`OTP_VENDOR_TEST_CTRL`](data/lc_ctrl.hjson#otp_vendor_test_ctrl) and [`OTP_VENDOR_TEST_STATUS`](data/lc_ctrl.hjson#otp_vendor_test_status) registers, which are reserved for vendor-specific test control and status bits.
-These registers are only active during RAW, TEST_* and RMA life cycle states.
-In all other life cycle states, the status register reads back all-zero, and the control register value will be tied to 0 before forwarding it to the OTP macro.
-
-Similarly to the [Life Cycle Request Interface](#life-cycle-request-interface), the hardware mutex must be claimed in order to access both of these registers.
-Note that these registers read back all-zero if the mutex is not claimed.
-
-### TAP Construction and Isolation
-
-#### Life Cycle TAP Controller
-
-The life cycle TAP controller is functionally very similar to the [RISC-V debug module](https://github.com/lowRISC/opentitan/blob/master/hw/ip/rv_dm/rtl/rv_dm.sv) for the Ibex processor and reuses the same debug transport module (DTM) and the associated debug module interface (DMI).
-The DTM and DMI are specified as part of the [RISC-V external debug specification, v0.13](https://github.com/riscv/riscv-debug-spec/blob/release/riscv-debug-release.pdf) and essentially provide a simple mechanism to read and write to a register space.
-In the case of the life cycle TAP controller this register space is essentially the life cycle CSR space.
-Hence, the [register table](#register-table) is identical for both the SW view and the view through the DMI, with the only difference that the byte offsets have to be converted to word offsets for the DMI.
-
-The RISC-V external debug specification defines the two custom JTAG registers 0x10 (DTM control/status) and 0x11 (DMI).
-The former provides status info such as idle state, number of address bits and RISC-V specification version plus reset control.
-The latter exposes an address, data and operation field for accessing a CSR space.
-
-In order to interact with the LC controller through JTAG, the debugging agent should read out the `abits` field from 0x10 in order to determine the address width in the DMI, and verify that the `version` field is indeed set to 1 to confirm that the DTM implements v0.13 of the spec.
-Then, the debugger can issue a CSR read or write operation via the 0x11 register as explained in more detail in [the RISC-V external specification, Chapter 6.1.5](https://github.com/riscv/riscv-debug-spec/blob/release/riscv-debug-release.pdf).
-
-### TAP and Isolation
-
-As currently defined, the life cycle controller TAP is a separate entity from the main SOC DFT TAP and the processor TAP.
-This physical separation aids in logical isolation, as the SOC DFT tap can be disabled by DFT_EN, while the processor TAP can be disabled by DEBUG_EN.
-The TAP isolation and multiplexing is implemented in the pinmux IP as [described here](../pinmux/README.md#strap-sampling-and-tap-isolation).
-
-# Programmer's Guide
-
-The register layout and offsets shown in the [register table](data/lc_ctrl.hjson#registers) below are identical for both the CSR and JTAG TAP interfaces.
-Hence the following programming sequence applies to both SW running on the device and SW running on the test appliance that accesses life cycle through the TAP.
-
-1. In order to perform a life cycle transition, SW should first check whether the life cycle controller has successfully initialized and is ready to accept a transition command by making sure that the [`STATUS.READY`](data/lc_ctrl.hjson#status) bit is set to 1, and that all other status and error bits in [`STATUS`](data/lc_ctrl.hjson#status) are set to 0.
-
-2. Read the [`LC_STATE`](data/lc_ctrl.hjson#lc_state) and [`LC_TRANSITION_CNT`](data/lc_ctrl.hjson#lc_transition_cnt) registers to determine which life cycle state the device currently is in, and how many transition attempts are still available.
-
-3. Claim exclusive access to the transition interface by writing kMuBi8True to the [`CLAIM_TRANSITION_IF`](data/lc_ctrl.hjson#claim_transition_if) register, and reading it back. If the value read back equals to kMuBi8True, the hardware mutex has successfully been claimed and SW can proceed to step 4. If the value read back equals to 0, the mutex has already been claimed by the other interface (either CSR or TAP), and SW should try claiming the mutex again.
-Note that all transition interface registers are protected by the hardware-governed [`TRANSITION_REGWEN`](data/lc_ctrl.hjson#transition_regwen) register, which will only be set to 1 if the mutex has been claimed successfully.
-
-4. If required, enable the external clock and other vendor-specific OTP settings in the [`OTP_VENDOR_TEST_CTRL`](data/lc_ctrl.hjson#otp_vendor_test_ctrl) register.
-Note that these settings only take effect in RAW, TEST* and RMA life cycle states.
-They are ignored in the PROD* and DEV states.
-
-5. Write the desired target state to [`TRANSITION_TARGET`](data/lc_ctrl.hjson#transition_target). For conditional transitions, the corresponding token has to be written to [`TRANSITION_TOKEN_0`](data/lc_ctrl.hjson#transition_token_0). For all unconditional transitions, the token registers have to be set to zero.
-
-6. An optional, but recommended step is to read back and verify the values written in steps 4. and 5. before proceeding with step 7.
-
-7. Write 1 to the [`TRANSITION_CMD.START`](data/lc_ctrl.hjson#transition_cmd) register to initiate the life cycle transition.
-
-8. Poll the [`STATUS`](data/lc_ctrl.hjson#status) register and wait until either [`STATUS.TRANSITION_SUCCESSFUL`](data/lc_ctrl.hjson#status) or any of the error bits is asserted.
-The [`TRANSITION_REGWEN`](data/lc_ctrl.hjson#transition_regwen) register will be set to 0 while a transition is in progress in order to prevent any accidental modifications of the transition interface registers during this phase.
-
-Note that any life cycle state transition - no matter whether successful or not - increments the LC_TRANSITION_CNT and moves the life cycle state into the temporary POST_TRANSITION state.
-Hence, step 8. cannot be carried out in case device SW is used to implement the programming sequence above, since the processor is disabled in the POST_TRANSITION life cycle state.
-
-This behavior is however not of concern, since access to the transition interface via the CSRs is considered a convenience feature for bringup in the lab.
-It is expected that the JTAG TAP interface is used to access the life cycle transition interface in production settings.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_lc_ctrl.h)
-
-## Register Table
-
-* [Register Table](data/lc_ctrl.hjson#registers)
diff --git a/hw/ip/lc_ctrl/doc/programmers_guide.md b/hw/ip/lc_ctrl/doc/programmers_guide.md
new file mode 100644
index 0000000000000..80c1d37f1e43a
--- /dev/null
+++ b/hw/ip/lc_ctrl/doc/programmers_guide.md
@@ -0,0 +1,38 @@
+# Programmer's Guide
+
+The register layout and offsets shown in the [register table](../data/lc_ctrl.hjson#registers) below are identical for both the CSR and JTAG TAP interfaces.
+Hence the following programming sequence applies to both SW running on the device and SW running on the test appliance that accesses life cycle through the TAP.
+
+1. In order to perform a life cycle transition, SW should first check whether the life cycle controller has successfully initialized and is ready to accept a transition command by making sure that the [`STATUS.READY`](../data/lc_ctrl.hjson#status) bit is set to 1, and that all other status and error bits in [`STATUS`](../data/lc_ctrl.hjson#status) are set to 0.
+
+2. Read the [`LC_STATE`](../data/lc_ctrl.hjson#lc_state) and [`LC_TRANSITION_CNT`](../data/lc_ctrl.hjson#lc_transition_cnt) registers to determine which life cycle state the device currently is in, and how many transition attempts are still available.
+
+3. Claim exclusive access to the transition interface by writing kMuBi8True to the [`CLAIM_TRANSITION_IF`](../data/lc_ctrl.hjson#claim_transition_if) register, and reading it back. If the value read back equals kMuBi8True, the hardware mutex has successfully been claimed and SW can proceed to step 4. If the value read back equals 0, the mutex has already been claimed by the other interface (either CSR or TAP), and SW should try claiming the mutex again.
+Note that all transition interface registers are protected by the hardware-governed [`TRANSITION_REGWEN`](../data/lc_ctrl.hjson#transition_regwen) register, which will only be set to 1 if the mutex has been claimed successfully.
+
+4. If required, enable the external clock and other vendor-specific OTP settings in the [`OTP_VENDOR_TEST_CTRL`](../data/lc_ctrl.hjson#otp_vendor_test_ctrl) register.
+Note that these settings only take effect in RAW, TEST* and RMA life cycle states.
+They are ignored in the PROD* and DEV states.
+
+5. Write the desired target state to [`TRANSITION_TARGET`](../data/lc_ctrl.hjson#transition_target). For conditional transitions, the corresponding token has to be written to [`TRANSITION_TOKEN_0`](../data/lc_ctrl.hjson#transition_token_0). For all unconditional transitions, the token registers have to be set to zero.
+
+6. An optional, but recommended step is to read back and verify the values written in steps 4. and 5. before proceeding with step 7.
+
+7. Write 1 to the [`TRANSITION_CMD.START`](../data/lc_ctrl.hjson#transition_cmd) register to initiate the life cycle transition.
+
+8. Poll the [`STATUS`](../data/lc_ctrl.hjson#status) register and wait until either [`STATUS.TRANSITION_SUCCESSFUL`](../data/lc_ctrl.hjson#status) or any of the error bits is asserted.
+The [`TRANSITION_REGWEN`](../data/lc_ctrl.hjson#transition_regwen) register will be set to 0 while a transition is in progress in order to prevent any accidental modifications of the transition interface registers during this phase.
+
+Note that any life cycle state transition - no matter whether successful or not - increments the LC_TRANSITION_CNT and moves the life cycle state into the temporary POST_TRANSITION state.
+Hence, step 8. cannot be carried out in case device SW is used to implement the programming sequence above, since the processor is disabled in the POST_TRANSITION life cycle state.
+
+This behavior is however not of concern, since access to the transition interface via the CSRs is considered a convenience feature for bringup in the lab.
+It is expected that the JTAG TAP interface is used to access the life cycle transition interface in production settings.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_lc_ctrl.h)
+
+## Register Table
+
+* [Register Table](../data/lc_ctrl.hjson#registers)
diff --git a/hw/ip/lc_ctrl/doc/theory_of_operation.md b/hw/ip/lc_ctrl/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..6c8dbf40a5fe4
--- /dev/null
+++ b/hw/ip/lc_ctrl/doc/theory_of_operation.md
@@ -0,0 +1,600 @@
+# Theory of Operation
+
+The following sections give an overview of the life cycle function.
+It begins with life cycle sensing at power up, progresses through how life cycle transitions are made, and then focuses specifically on how life cycle impacts various functionality of the design.
+
+## Power Up Sequence
+
+Upon power up, the life cycle controller will default to "RAW" state and wait for the OTP controller to initialize and sense the contents of the [life cycle partition](../../otp_ctrl/README.md#logical-partitions).
+When the OTP is ready, the life cycle controller reads the contents of the life cycle partition, decodes the life cycle state and updates its internal state to match.
+This implies that unlike the life cycle definition diagram, there is a one-time "RAW to any state" logical transition that is implicit within the implementation.
+Note that during OTP sensing, the life cycle controller does not perform any redundant checks upon the value it reads; instead, that responsibility is allocated to the OTP controller.
+
+Once the state values are correctly sensed, the life cycle controller performs checks on state consistency and dependencies, and if correct, broadcasts both the raw state value as well as the decoded functional outputs to the rest of the device.
+
+Once the broadcast is complete and signals stable, modules held under reset by ["sys_rst_n"](../../rstmgr/README.md) are then released from reset to begin mission mode operations (this includes the processor).
+Note this point is also when it is safe for DFT to commence operations, as DFT functions may be blocked until life cycle completes its broadcast.
+
+The following diagram illustrates this power-up sequence.
+Note the sequence is not designed into one specific module, but rather a result of coordination between the OTP controller, life cycle controller and the reset / power controllers.
+
+![LC Power Up Sequence](../doc/lc_ctrl_power_up.svg)
+
+## Normal Operation
+
+Once the life cycle system is powered up and stable, its outputs remain static unless specifically requested to change or affected by security escalation.
+The life cycle controller can accept [change requests](#life-cycle-requests) from software as well as external entities.
+
+### Unconditional Transitions
+
+For unconditional transitions, the life cycle controller advances the state by requesting an OTP update to the OTP controller.
+Once the programming is confirmed, the life cycle controller reports a success to the requesting agent and waits for the device to reboot.
+
+### Conditional Transitions
+
+For conditional transitions, such as those that require a token (RAW_UNLOCK, TEST_UNLOCK, TEST_EXIT, RMA_UNLOCK), the life cycle controller advances the state via OTP programming only after it is supplied with the valid token.
+[Some tokens](../../../../doc/security/specs/device_life_cycle/README.md#manufacturing-states) are hardcoded design constants, while others are stored in OTP.
+Note that conditional transitions will only be allowed if the OTP partition holding the corresponding token has been provisioned and locked.
+
+Since unlock tokens are considered secret, they are not stored in their raw form.
+Instead, the tokens are wrapped and unwrapped based on a global constant using a [PRESENT-based scrambling mechanism](../../otp_ctrl/README.md#secret-vs-non-secret-partitions).
+This ensures that a breach of fuse physical security does not automatically expose all the relevant information without also breaking the constant key.
+
+RAW_UNLOCK is not exposed in the open source design, rather it is something provisioned by the silicon creators prior to tapeout.
+It is the only token among those listed that is a global constant and stored in gates.
+
+All others CAN be device unique and are stored in OTP.
+
+### Transition Counter Limits
+
+For conditional transitions, there is a limit to how many times they can be attempted.
+This prevents an attacker from brute-forcing any specific token, and it also helps to reduce the overall required token size.
+
+For OpenTitan, the total amount of state transitions and transition attempts is limited to 24.
+Once this number is reached, the life cycle controller rejects further attempts, effectively locking the device into its current state.
+
+The token counters are maintained in the OTP.
+To ensure that the token limits cannot be bypassed, each request for a conditional transition **FIRST** increments the token count, and **THEN** checks for the validity of the token.
+
+### Token Hashing Mechanism
+
+All 128bit lock and unlock tokens are passed through a cryptographic one way function in hardware before the life cycle controller compares them to the provisioned values in OTP or to the netlist constant in case of RAW_UNLOCK.
+
+This mechanism is used to guard against reverse engineering and brute-forcing attempts.
+An attacker able to extract the hashed token values from the scrambled OTP partitions or from the netlist would first have to find a hash collision in order to perform a life cycle transition, since the values supplied to the life cycle controller must be valid hash pre-images.
+
+The employed one way function is a 128bit cSHAKE hash with the function name "" and customization string "LC_CTRL", see also [kmac documentation](../../kmac/README.md) and [`kmac_pkg.sv`](https://github.com/lowRISC/opentitan/blob/master/hw/ip/kmac/rtl/kmac_pkg.sv#L148-L155).
+
+### Post Transition Handling
+
+After a transition request, whether it was unconditional or conditional, the life cycle controller always disables all of its decoded outputs and puts the system in an inert state.
+The device is then expected to reboot before returning to a functional state.
+
+Note this happens for either successful or unsuccessful transitions.
+This general policy places a time-bound on how quickly life cycle states can change and also forces the device to behave more predictably.
+
+## Security Escalation
+
+The life cycle controller contains two escalation paths that are connected to escalation severities 1 and 2 of the alert handler.
+
+The two escalation paths are redundant, and both trigger the same mechanism.
+Upon assertion of either of the two escalation actions, the life cycle state is **TEMPORARILY** altered.
+I.e. when this escalation path is triggered, the life cycle state is transitioned into "ESCALATE", which behaves like a virtual "SCRAP" state (i.e. this state is not programmed into OTP).
+This causes [all decoded outputs](#life-cycle-decoded-outputs-and-controls) to be disabled until the next power cycle.
+In addition to that, the life cycle controller asserts the ESCALATE_EN life cycle signal which is distributed to all IPs in the design that expose an escalation action (like moving FSMs into terminal error states or clearing sensitive registers).
+
+Whether to escalate to the life cycle controller or not is a software decision; please see the alert handler documentation for more details.
+
+## Life Cycle Decoded Outputs and Controls
+
+The core function of life cycle is how various functions of the design are modulated by what state the design is in.
+[This section](../../../../doc/security/specs/device_life_cycle/README.md#manufacturing-states) in the life cycle architecture documentation summarizes the overall behavior.
+
+The signals have been split into two summary tables in the sections below.
+The first table contains all control signals that enable certain functionality in the system, whereas the second table contains all signals that change access to certain elements in the flash and OTP memories.
+
+All life cycle control signals are 4 bits wide, with only `4'b1010` as a valid enable value, and all others meaning "disable".
+A `"Y"` mark means the function is directly enabled by hardware during that
+state.
+A `"grey"` box means a particular function is not available during that
+state.
+The states in <span style="color:red">RED</span> are volatile, temporary states.
+They exist only after specific events, and are restored to normal once the device is power cycled.
+
+### Life Cycle Function Control Signals
+
+The individual signals summarized in the table below are described in the following subsections.
+
+{{#include doc/lc_ctrl_function_signals_table.md}}
+
+Signals marked with an asterisk (Y\*) are only asserted under certain conditions as explained in detail below.
+
+#### DFT_EN
+
+As its name implies, this signal enables DFT functions.
+This is accomplished primarily by providing functional isolation on the SOC inserted DFT TAP module and any other memory macros that are built natively with a DFT function (for example flash and OTP).
+
+The isolation ensures three things:
+- The TAP controller is unable to issue instructions that would put the design into scan mode.
+This ensures that secrets cannot be scanned out, and specific values cannot be scanned into the design to emulate a particular functional mode
+- The TAP controller is unable to issue any kind of self test that would disrupt and scramble live logic which could lead to unpredictable behavior
+- The TAP controller or test function is unable to alter the non-volatile contents of flash or OTP
+
+See [TAP isolation](#system-integration-and-tap-isolation) for more implementation details.
+
+#### NVM_DEBUG_EN
+
+NVM modules like flash implement debug access that bypasses memory protection or lock-down.
+This feature may be there for a variety of reasons, but primarily it can be used to debug the normal behavior of the controller.
+
+This type of functionality, if it exists, must be disabled during specific life cycle states.
+Since these back-door functions may bypass memory protection, they could be used to read out provisioned secrets that are not meant to be visible to software or a debug host.
+
+Note that NVM_DEBUG_EN is disabled in the last test unlocked state (TEST_UNLOCKED7) such that the isolated flash partition can be securely populated, without exposing its contents via the NVM backdoor interface.
+See also accessibility description of the [isolated flash partition](#iso_part_sw_rd_en-and-iso_part_sw_wr_en).
+
+#### HW_DEBUG_EN
+
+HW_DEBUG_EN refers to the general ungating of both invasive (JTAG control of the processor, bidirectional analog test points) and non-invasive debug (debug bus observation, and register access error returns).
+
+This signal thus needs to be routed to all security-aware and debug capable peripherals.
+This signal is used to determine whether OpenTitan peripheral register interfaces should [silently error](../../../../util/reggen/README.md#error_responses).
+If HW_DEBUG_EN is set to ON, normal errors should be returned.
+If HW_DEBUG_EN is set to OFF, errors should return silently.
+
+Similar to DFT_EN, HW_DEBUG_EN is also used to isolate the processor TAP.
+When HW_DEBUG_EN is OFF, the TAP should not be able to perform its normal debug access, thus preventing an external entity from hijacking the processor.
+
+#### CPU_EN
+
+CPU_EN controls whether code execution is allowed.
+This is implemented as part of the processor's reset controls.
+In OpenTitan's [reset topology](../../rstmgr/README.md), it is not possible to reset only the processor by itself, so this reset control extends to a large population of the OpenTitan peripherals.
+
+This ensures that during specific states (RAW, TEST_LOCKED, SCRAP, INVALID) it is not possible for the processor to execute code that breaks the device out of a non-functional state.
+
+In conjunction with DFT_EN / HW_DEBUG_EN, this acts as the final layer in life cycle defense in depth.
+
+#### KEY_MANAGER_EN {#key-manager-en}
+
+The KEY_MANAGER_EN signal allows the key manager to function normally.
+When this signal is logically disabled, any existing key manager collateral is uninstantiated and wiped; further instantiation and generation calls for the key manager are also made unavailable.
+
+The KEY_MANAGER_EN signal is active only during DEV / PROD / PROD_END / RMA.
+
+#### ESCALATE_EN
+
+The ESCALATE_EN signal is available in all life cycle states and is asserted if for any reason the alert subsystem decides to move the life cycle state into the ESCALATE state.
+This signal is also unconditionally asserted in all INVALID and SCRAP states (including virtual SCRAP states).
+
+#### CHECK_BYP_EN
+
+The CHECK_BYP_EN signal is used to disable the [background consistency checks](../../otp_ctrl/README.md#partition-checks) of the life cycle OTP partition during life cycle transitions to prevent spurious consistency check failures (the OTP contents and the buffer registers can get out of sync during state transitions).
+The CHECK_BYP_EN signal is only asserted when a transition command is issued.
+
+#### CLK_BYP_REQ
+
+If the life cycle state is in RAW, TEST* or RMA, and if [`TRANSITION_CTRL.EXT_CLOCK_EN`](../data/lc_ctrl.hjson#transition_ctrl) is set to one, the CLK_BYP_REQ signal is asserted in order to switch the main system clock to an external clock signal.
+This functionality is needed in certain life cycle states where the internal clock source may not be fully calibrated yet, since the OTP macro requires a stable clock frequency in order to reliably program the fuse array.
+Note that the [`TRANSITION_CTRL.EXT_CLOCK_EN`](../data/lc_ctrl.hjson#transition_ctrl) register can only be set to one if the transition interface has been claimed via the [`CLAIM_TRANSITION_IF`](../data/lc_ctrl.hjson#claim_transition_if) mutex.
+This function is not available in production life cycle states.
+
+For details on the clock switch, please see [clkmgr](../../clkmgr/README.md#life-cycle-requested-external-clock).
+
+
+### Life Cycle Access Control Signals
+
+The individual signals summarized in the table below are described in the following subsections.
+
+{{#include doc/lc_ctrl_access_signals_table.md}}
+
+Signals marked with an asterisk (Y\*) are only asserted under certain conditions as explained in detail below.
+
+#### CREATOR_SEED_SW_RW_EN and OWNER_SEED_SW_RW_EN
+
+These signals control whether the non-volatile provisioning of life cycle related collateral can be accessed.
+The signals can only be active during DEV / PROD / PROD_END / RMA.
+During other states, it is not possible to either read or modify the collateral.
+This specifically limits the danger of rogue software images during any TEST_UNLOCKED state.
+However, as these signals only gate functional access and not DFT access, it is still possible for a malicious agent to bypass this protection by abusing scan shift/capture mechanics.
+
+While the OWNER_SEED_SW_RW_EN is statically enabled in the states shown above, the CREATOR_SEED_SW_RW_EN is only enabled if the device has not yet been personalized (i.e., the OTP partition holding the root key has not been locked down yet).
+
+For a list of the collateral in Flash and OTP and an explanation of how that collateral is affected by these signals, see the [OTP collateral](#otp-collateral) and [flash collateral](#flash-collateral) sections.
+
+#### SEED_HW_RD_EN
+
+The SEED_HW_RD_EN signal controls whether the owner and creator root keys can be accessed by hardware.
+This signal is dependent on the personalization state of the device and will only be enabled if the device has been personalized (i.e., when the OTP partition holding the root key has been locked down).
+
+#### ISO_PART_SW_RD_EN and ISO_PART_SW_WR_EN
+
+These signals control whether the isolated flash partition holding additional manufacturing details can be accessed.
+The isolated partition is both readable and writable during the PROD / PROD_END / RMA states.
+In all other states it is inaccessible, except during the TEST_UNLOCKED* states where the partition is write-only.
+This construction allows a value to be written to that partition and kept secret before advancing into any of the production states.
+
+
+## OTP Collateral
+
+The following is a list of all life cycle related collateral stored in OTP.
+Most collateral also contain associated metadata to indicate when the collateral is restricted from further software access, see [accessibility summary](#otp-accessibility-summary-and-impact-of-life-cycle-signals) for more details.
+Since not all collateral is consumed by the life cycle controller, the consuming agent is also shown.
+
+{{#include doc/lc_ctrl_otp_collateral.md}}
+
+The TOKENs and KEYS are considered secret data and are stored in [wrapped format](#conditional-transitions).
+Before use, the secrets are unwrapped.
+
+The SECRET0_DIGEST and SECRET2_DIGEST are the digest values computed over the secret partitions in OTP holding the tokens and root keys.
+As described in more detail in the [OTP controller specification](../../otp_ctrl/README.md#direct-access-memory-map), these digests have a non-zero value once the partition has been provisioned and write/read access has been locked.
+
+### ID State of the Device
+
+If the SECRET2_DIGEST is zero, the device is considered to have "blank" ID state, in which case the CREATOR_ROOT_KEY_* (in OTP) and CREATOR_DIV_KEY (in FLASH) can be written by software.
+All consumers of these keys are supplied with an invalid value.
+
+If the SECRET2_DIGEST has a nonzero value, the device is considered "creator personalized", and the CREATOR_ROOT_KEY and CREATOR_DIV_KEY are no longer accessible to software.
+Actual values are supplied to the consumers.
+If SECRET2_DIGEST has a nonzero value, the CREATOR_SEED_SW_RW_EN signal will be disabled in PROD, PROD_END and DEV states.
+
+### Secret Collateral
+
+Among the OTP life cycle collateral, the following are considered secrets (note there may be other secrets unrelated to life cycle, please see [OTP controller specification](../../otp_ctrl/README.md#partition-listing-and-description) for more details):
+
+- *_TOKEN
+- CREATOR_ROOT_KEY*
+
+Specifically this means after OTP sensing, the above entries are unwrapped to obtain the real value.
+Similarly, during programming, they are wrapped before beginning to be written to OTP.
+
+The function used for this wrapping is the lightweight PRESENT-cipher.
+The wrapping is a one time event during controlled manufacturing, and unwrapping also cannot be supplied with arbitrary ciphertexts.
+Thus the system cannot be abused to generate a large number of traces for informational leakage, and thus a fully hardened cipher (such as masked AES) is not required.
+
+Note also, a global key is used here because there is no other non-volatile location to store a secret key.
+If PUFs were available (either in memory form or fused form), they could become an appealing alternative to hold a device unique fuse key.
+
+See the [OTP controller](../../otp_ctrl/README.md#secret-vs-non-secret-partitions) for more details.
+
+### OTP Accessibility Summary and Impact of Life Cycle Signals
+
+A subset of secret collateral is further access-controlled by the life cycle CREATOR_SEED_SW_RW_EN signal.
+These are
+
+- RMA_UNLOCK_TOKEN
+- CREATOR_ROOT_KEY
+
+The table below summarizes the software accessibility of all life cycle collateral.
+
+{{#include doc/lc_ctrl_otp_accessibility.md}}
+
+Note that CREATOR_SEED_SW_RW_EN is set to OFF if SECRET2_DIGEST has a nonzero value in PROD, PROD_END and DEV states.
+SEED_HW_RD_EN only becomes active if SECRET2_DIGEST has a nonzero value in DEV, PROD, PROD_END and RMA states.
+
+## Flash Collateral
+
+The flash contains both memory mapped and non-memory mapped partitions.
+As it pertains to life cycle, the flash contains two sets of important collateral.
+They are enumerated in the table below.
+Just as with OTP, the consumer and usage of each is also described.
+
+{{#include doc/lc_ctrl_flash_collateral.md}}
+
+Each collateral belongs to a separate flash partition; the table below enumerates the partition and whether the partition is memory mapped.
+
+{{#include doc/lc_ctrl_flash_partitions.md}}
+
+The general flash partition refers to any software managed storage in flash, and is not a specific carve out in the non-memory mapped area.
+
+### Flash Accessibility Summary and Impact of Life Cycle Signals
+
+The creator software is trusted to manage the owner partition (OWNER_DATA).
+As such, OWNER_DATA remains accessible during DEV / PROD / PROD_END / RMA states, irrespective of the device personalization state.
+It is expected that ROM_ext during secure boot programs the protection correctly such that downstream software has appropriate permissions.
+
+The CREATOR_DATA partitions however, are further qualified based on the personalization state of the device.
+Just as with OTP, the table below enumerates accessibility of flash collateral.
+
+{{#include doc/lc_ctrl_flash_accessibility.md}}
+
+Note that CREATOR_SEED_SW_RW_EN is set to OFF if SECRET2_DIGEST has a nonzero value in PROD, PROD_END and DEV states.
+SEED_HW_RD_EN only becomes active if SECRET2_DIGEST has a nonzero value in DEV, PROD, PROD_END and RMA states.
+OWNER_SEED_SW_RW_EN is always enabled during DEV, PROD, PROD_END and RMA states.
+
+See also [Device Life Cycle Architecture](../../../../doc/security/specs/device_life_cycle/README.md) for more information on creator/owner isolation.
+
+
+## Hardware Interfaces
+
+### Parameters
+
+Note that parameters prefixed with `RndCnst` are random netlist constants that need to be regenerated via topgen before the tapeout (typically by the silicon creator).
+
+Parameter                      | Default (Max)         | Top Earlgrey   | Description
+-------------------------------|-----------------------|----------------|---------------
+`AlertAsyncOn`                 | 2'b11                 | 2'b11          |
+`IdcodeValue`                  | `32'h00000001`        | `32'h00000001` | Idcode for the LC JTAG TAP.
+`RndCnstLcKeymgrDivInvalid`    | (see RTL)             | (see RTL)      | Life cycle state group diversification value for keymgr.
+`RndCnstLcKeymgrDivTestDevRma` | (see RTL)             | (see RTL)      | Life cycle state group diversification value for keymgr.
+`RndCnstLcKeymgrDivProduction` | (see RTL)             | (see RTL)      | Life cycle state group diversification value for keymgr.
+
+### Signals
+
+* [Interface Tables](../data/lc_ctrl.hjson#interfaces)
+
+Signal                       | Direction        | Type                                     | Description
+-----------------------------|------------------|------------------------------------------|---------------
+`jtag_i`                     | `input`          | `jtag_pkg::jtag_req_t`                   | JTAG input signals for life cycle TAP.
+`jtag_o`                     | `output`         | `jtag_pkg::jtag_rsp_t`                   | JTAG output signals for life cycle TAP.
+`esc_scrap_state0_tx_i`      | `input`          | `prim_esc_pkg::esc_tx_t`                 | Escalation input from alert handler. Moves the life cycle state into an invalid state upon assertion.
+`esc_scrap_state0_rx_o`      | `output`         | `prim_esc_pkg::esc_rx_t`                 | Escalation feedback to alert handler
+`esc_scrap_state1_tx_i`      | `input`          | `prim_esc_pkg::esc_tx_t`                 | Escalation input from alert handler. Moves the life cycle state into an invalid state upon assertion.
+`esc_scrap_state1_rx_o`      | `output`         | `prim_esc_pkg::esc_rx_t`                 | Escalation feedback to alert handler
+`pwr_lc_i`                   | `input`          | `pwrmgr::pwr_lc_req_t`                   | Initialization request coming from power manager.
+`pwr_lc_o`                   | `output`         | `pwrmgr::pwr_lc_rsp_t`                   | Initialization response and programming idle state going to power manager.
+`lc_otp_program_o`           | `output`         | `otp_ctrl_pkg::lc_otp_program_req_t`     | Life cycle state transition request.
+`lc_otp_program_i`           | `input`          | `otp_ctrl_pkg::lc_otp_program_rsp_t`     | Life cycle state transition response.
+`kmac_data_o`                | `output`         | `kmac_pkg::app_req_t`                    | Life cycle RAW token hashing request.
+`kmac_data_i`                | `input`          | `kmac_pkg::app_rsp_t`                    | Life cycle RAW token hashing response.
+`otp_lc_data_i`              | `input`          | `otp_ctrl_pkg::otp_lc_data_t`            | Life cycle state output holding the current life cycle state, the value of the transition counter and the tokens needed for life cycle transitions.
+`lc_keymgr_div_o`            | `output`         | `lc_keymgr_div_t`                        | Life cycle state group diversification value.
+`lc_flash_rma_seed_o`        | `output`         | `lc_flash_rma_seed_t`                    | Seed for flash RMA.
+`otp_device_id_i`            | `input`          | `otp_device_id_t`                        | HW_CFG bits from OTP ([`DEVICE_ID_0`](../data/lc_ctrl.hjson#device_id_0)).
+`otp_manuf_state_i`          | `input`          | `otp_manuf_state_t`                      | HW_CFG bits from OTP ([`MANUF_STATE_0`](../data/lc_ctrl.hjson#manuf_state_0)).
+`lc_otp_vendor_test_o`       | `output`         | `otp_ctrl_pkg::lc_otp_vendor_test_req_t` | Vendor-specific test bits to OTP ([`OTP_VENDOR_TEST_CTRL`](../data/lc_ctrl.hjson#otp_vendor_test_ctrl)).
+`lc_otp_vendor_test_i`       | `input`          | `otp_ctrl_pkg::lc_otp_vendor_test_rsp_t` | Vendor-specific test bits from OTP ([`OTP_VENDOR_TEST_STATUS`](../data/lc_ctrl.hjson#otp_vendor_test_status)).
+`lc_dft_en_o`                | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_nvm_debug_en_o`          | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_hw_debug_en_o`           | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_cpu_en_o`                | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_creator_seed_sw_rw_en_o` | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_owner_seed_sw_rw_en_o`   | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_iso_part_sw_rd_en_o`     | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_iso_part_sw_wr_en_o`     | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_seed_hw_rd_en_o`         | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_keymgr_en_o`             | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_escalate_en_o`           | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_check_byp_en_o`          | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_clk_byp_req_o`           | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_clk_byp_ack_i`           | `input`          | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_flash_rma_req_o`         | `output`         | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+`lc_flash_rma_ack_i`         | `input`          | `lc_tx_t`                                | [Multibit control signal](#life-cycle-decoded-outputs-and-controls).
+
+#### Power Manager Interface
+
+The power manager interface is comprised of three signals overall: an initialization request (`pwr_lc_i.lc_init`), an initialization done response (`pwr_lc_o.lc_done`) and an idle indicator (`pwr_lc_o.lc_idle`).
+
+The power manager asserts `pwr_lc_i.lc_init` in order to signal to the life cycle controller that it can start initialization, and the life cycle controller signals completion of the initialization sequence by asserting `pwr_lc_o.lc_done` (the signal will remain high until reset).
+
+The idle indication signal `pwr_lc_o.lc_idle` indicates that the life cycle controller is idle.
+If this bit is 0, the life cycle controller is either not initialized or in the middle of carrying out a life cycle state transition.
+The power manager uses that indication to determine whether a power down request needs to be aborted.
+
+Since the power manager may run in a different clock domain, the `pwr_lc_i.lc_init` signal is synchronized within the life cycle controller.
+The power manager is responsible for synchronizing the `pwr_lc_o.lc_done` and `pwr_lc_o.lc_idle` signals.
+
+See also [power manager documentation](../../pwrmgr/README.md).
+
+#### OTP Interfaces
+
+All interfaces to and from OTP are explained in detail in the [OTP Specification Document](../../otp_ctrl/README.md#life-cycle-interfaces).
+
+#### KMAC Interface
+
+The life cycle controller interfaces with KMAC through a [side load interface](../../kmac/README.md#keymgr-interface) in the same way as the key manager.
+Since the KMAC and life cycle controller are in different clock domains, the KMAC interface signals are synchronized to the life cycle clock inside the life cycle controller.
+
+#### Control Signal Propagation
+
+For better security, all the [life cycle control signals](#life-cycle-decoded-outputs-and-controls) are broadcast in multi-bit form.
+The active ON state for every signal is broadcast as `4'b1010`, while the inactive OFF state is encoded as `4'b0101`.
+For all life cycle signals except the escalation signal ESCALATE_EN, all values different from ON must be interpreted as OFF in RTL.
+In case of ESCALATE_EN, all values different from OFF must be interpreted as ON in RTL.
+
+Since many signals cross clock boundaries, their synchronization needs to be taken into account.
+However, since the ON / OFF encoding above has been chosen such that **all bits toggle exactly once** for a transition from OFF to ON (and vice-versa), all that needs to be done is guard against metastability using a two-stage synchronizer, as illustrated below.
+
+![Multibit Sync](../doc/lc_ctrl_multibit_sync.svg)
+
+In other words, since each bit in the encoding flips exactly once upon an OFF -> ON or ON -> OFF transition, we can guarantee that there are no transient patterns toggling back and forth between enabling and disabling a function.
+Note that even though synchronization can be achieved with a simple two-stage synchronizer, designs **must** use the `prim_lc_sync` primitive.
+This primitive has additional LC-specific assertions and provides a parametric number of separately buffered copies of the life cycle signal to prevent logic optimization by the synthesis tool (buffers have a 'size_only' constraint in synthesis).
+For all signals except ESCALATE_EN, it is recommended to structure the design such that at least two separately buffered copies of the life cycle signals have to be consumed in order to unlock a certain function.
+
+#### Key Manager Interface
+
+The `lc_keymgr_div_o` signal is a 128bit diversification constant that is output to the key manager once the life cycle controller has initialized, and is asserted at the same time as `lc_keymgr_en_o`.
+Depending on which group the life cycle state is in, this signal is assigned a different random netlist constant as defined in the table below.
+
+Life Cycle State Group     | Assigned Diversification Constant
+---------------------------|----------------------------------
+TEST_UNLOCKED\*, DEV, RMA  | `LcKeymgrDivTestDevRma`
+PROD, PROD_END             | `LcKeymgrDivProduction`
+All Other States           | `LcKeymgrDivInvalid`
+
+Note that this signal is quasistatic.
+It is hence recommended to place a max-delay constraint on it and leverage the synchronized version of `lc_keymgr_en_o` to enable any downstream register in different clock domains than the life cycle controller.
+
+
+## Design Details
+
+
+### Block Diagram
+
+Conceptually speaking, the life cycle controller consists of a large FSM that is further subdivided into logical modules for maintainability, as illustrated below. All blue blocks in the block diagram are purely combinational and do not contain any registers.
+
+![LC Controller Block Diagram](../doc/lc_ctrl_blockdiag.svg)
+
+The main FSM implements a linear state sequence that always moves in one direction for increased glitch resistance.
+I.e., it never returns to the initialization and broadcast states as described in the [life cycle state controller section](#life-cycle-state-controller).
+
+The main FSM state is redundantly encoded, and augmented with the life cycle state.
+That augmented state vector is consumed by three combinational submodules:
+- `lc_ctrl_state_decode`: This submodule decodes the redundantly encoded life cycle state, checks that there are no encoding errors and enforces state dependencies as required by the definition. The decoded state is forwarded to the CSRs for SW consumption.
+- `lc_ctrl_transition`: This submodule checks whether the transition target state specified via the CSRs is valid, and computes the redundantly encoded state vector of the transition target state.
+- `lc_ctrl_signal_decode`: This submodule is an output function only and derives the life cycle control signals (colored in blue) from the augmented state vector.
+
+Note that the two additional life cycle control signals `lc_flash_rma_req_o` and `lc_clk_byp_req_o` are output by the main FSM, since they cannot be derived from the life cycle state alone and are reactive in nature in the sense that there is a corresponding acknowledgement signal.
+
+The life cycle controller contains a JTAG TAP that can be used to access the same CSR space that is accessible via TL-UL.
+In order to write to the CSRs, a [hardware mutex](#hardware-mutex) has to be claimed.
+
+The life cycle controller also contains two escalation receivers that are connected to escalation severity 1 and 2 of the alert handler module.
+The actions that are triggered by these escalation receivers are explained in the [escalation handling section](#escalation-handling) below.
+
+### System Integration and TAP Isolation
+
+The figure below provides more context about how the life cycle controller is integrated into the system, and how its control signals interact with various components.
+
+![LC Controller Block Diagram](../doc/lc_ctrl_system_view.svg)
+
+Although technically a life cycle feature, the sampling of the strap pins and JTAG / TAP isolation is performed in the pinmux after the life cycle controller has initialized.
+See [pinmux documentation](../../pinmux/README.md#strap-sampling-and-tap-isolation) and the detailed selection listed in [Life Cycle Definition Table](../../../../doc/security/specs/device_life_cycle/README.md#manufacturing-states).
+
+### Life Cycle Manufacturing State Encodings
+
+The encoding of the life-cycle state is used both for OTP storage and as part of the FSM state in the life cycle controller.
+In other words the state stored within OTP is not re-encoded before it is consumed as part of the life cycle controller FSM state.
+
+{{#include doc/lc_ctrl_encoding_table.md}}
+
+Any decoding that does not fall into the table above is considered **INVALID**.
+
+Each word in the table above maps to an ECC protected 16bit OTP word (i.e., 16bit + 6bit ECC).
+Further, each Ax/Bx word used in the LC state is a unique, random netlist constant generated by the silicon creator prior to tapeout based on a custom seed and the employed ECC polynomial.
+The values Bx are constructed such that {Bx,ECC(Bx)} can be incrementally written over {Ax,ECC(Ax)} without producing any ECC errors.
+
+The purpose of this encoding is to ensure the following
+
+- It is difficult to jump from PROD / PROD_END / SCRAP into DEV
+- It is difficult to jump from DEV / PROD / PROD_END / SCRAP into TEST*
+- It is difficult to jump from DEV / PROD / PROD_END / SCRAP into RMA
+
+Further, the encoding has been chosen to minimize the probability of successful glitch attacks attempting to alter the value of bits in the life cycle state.
+In particular, this encoding guards against attacks that manipulate the OTP to output all-zeros, or attacks that manipulate the OTP to read from other address locations within OTP to inject specific values.
+
+Note that the RAW state is guarded by the RAW_UNLOCK process, which involves supplying a 128bit UNLOCK_TOKEN and performing a full system reset in case the token was correct. Hence moving the state into RAW does not provide any advantage to an attacker.
+
+The encoded life cycle state is not readable by SW in any way through the OTP or life cycle interfaces.
+However a decoded version of the manufacturing life cycle is exposed in the [`LC_STATE`](../data/lc_ctrl.hjson#lc_state) register.
+
+### Life Cycle Readout Consistency Checks in OTP
+
+In order to guard against glitch attacks during OTP sense and readout, the OTP controller makes sure to read out the life cycle partition before releasing the state to the life cycle controller.
+I.e., the OTP controller senses and buffers the life cycle in registers in a first readout pass.
+Then, as part of the [consistency check mechanism](../../otp_ctrl/README.md#storage-consistency), the OTP controller performs a second and third readout pass to verify whether the buffered life cycle state indeed corresponds to the values stored in OTP.
+The second readout pass uses a linearly increasing address sequence, whereas the third readout pass uses a linearly decreasing address sequence (i.e., reads in reverse order).
+
+### Transition Counter Encoding
+
+The life cycle transition counter has 24 strokes where each stroke maps to one 16bit OTP word.
+The strokes are similarly encoded as the life cycle state in the sense that upon the first transition attempt, all words are initialized with unique Cx values that can later be overwritten with unique Dx values without producing an ECC error.
+
+{{#include doc/lc_ctrl_counter_table.md}}
+
+Upon each life cycle transition attempt, the life cycle controller **FIRST** increments the transition counter before initiating any token hashing and comparison operations.
+
+A decoded version of this counter is exposed in the [`LC_TRANSITION_CNT`](../data/lc_ctrl.hjson#lc_transition_cnt) register.
+
+### Life Cycle State Controller
+
+The life cycle state controller is the main entity that handles life cycle requests, escalation events and transactions with the OTP and flash controllers.
+The state diagram for the controller FSM is shown below.
+
+![LC Controller FSM](../doc/lc_ctrl_fsm.svg)
+
+Once the FSM has initialized upon request from the power manager, it moves into `IdleSt`, which is the state where all life cycle control signals are broadcast.
+The life cycle controller stays in `IdleSt` unless a life cycle state request is initiated via the CSRs.
+
+In that case, the life cycle controller first increments the redundantly encoded life cycle transition counter in `CntIncrSt` and `CntProgSt` in order to defend against brute force attacks.
+Then, the transition is checked for validity in `TransCheckSt` and the token hashing operation is initiated in `TokenHashSt`.
+A first token comparison is performed when the hashed token returns in `TokenHashSt`, followed by two more comparisons in `TokenCheck0St` and `TokenCheck1St`.
+The difference among these three comparisons is that the first comparison is done using the hashed token input directly, whereas the second and the third comparison use a registered version of the hashed token.
+If all token checks are successful, the next life cycle state vector is computed and programmed in `TransProgSt`.
+
+Note that an initiated life cycle transition request always ends in `PostTransSt`, no matter whether the transition is successful or not.
+
+#### Escalation Handling
+
+The life cycle controller contains two escalation channels that are connected to the alert handler.
+
+When the first channel `esc_wipe_secrets` is asserted, the life cycle controller permanently asserts the `lc_escalate_en` life cycle signal.
+That signal is routed to various security modules in OpenTitan and triggers local wiping and invalidation features.
+Note that this first escalation action does not affect the life cycle state.
+
+When the second channel `esc_scrap_state` is asserted, the life cycle controller moves the life cycle state into `EscalateSt`, which behaves like a "virtual" SCRAP life cycle state.
+This transition is not permanent, and will clear upon the next power cycle.
+Note that any scrap state (virtual or encoded in the life cycle state vector) will also cause the `lc_escalate_en` life cycle signal to be asserted.
+
+#### FSM Glitch Countermeasures
+
+The FSM has been designed to have a linear control flow that always moves in the same direction, and that always ends in a terminal state after initiating a transition request in order to make glitch attacks harder.
+A sparse FSM state encoding is employed, where each state is encoded as a 16bit word with a minimum Hamming distance of 5 w.r.t. any other state.
+The FSM state and the life cycle state vector are concurrently monitored, and if an erroneous encoding is detected, the life cycle FSM is immediately moved into the terminal `InvalidSt`, and a `fatal_state_error` alert is asserted.
+
+#### Life Cycle Request Interface
+
+Life cycle requests are the explicit requests made to change life cycle states.
+The controller allows requests to come from either the TAP or the software interface.
+The interface is common between the two and is maintained as a CSR interface.
+To arbitrate between the two, a hardware mutex needs to be obtained before either side can proceed.
+The hardware mutex internally acts as a mux to block off the unselected path and all accesses to the request interface are blocked until it is claimed.
+If two requests arrive simultaneously, the TAP interface is given priority.
+
+The request interface consists of 7 registers:
+
+1. [`TRANSITION_CTRL`](../data/lc_ctrl.hjson#transition_ctrl): Control register for the transition, can be used to switch to an external clock.
+2. [`TRANSITION_TARGET`](../data/lc_ctrl.hjson#transition_target): Specifies the target state to which the agent wants to transition.
+3. [`TRANSITION_TOKEN_*`](../data/lc_ctrl.hjson#transition_token_): Any necessary token for conditional transitions.
+4. [`TRANSITION_CMD`](../data/lc_ctrl.hjson#transition_cmd): Start the life cycle transition.
+5. [`STATUS`](../data/lc_ctrl.hjson#status): Indicates whether the requested transition succeeded.
+6. [`OTP_VENDOR_TEST_CTRL`](../data/lc_ctrl.hjson#otp_vendor_test_ctrl): See [Macro-specific test control bits](#vendor-specific-test-control-register).
+7. [`OTP_VENDOR_TEST_STATUS`](../data/lc_ctrl.hjson#otp_vendor_test_status): See [Macro-specific test control bits](#vendor-specific-test-control-register).
+
+If the transition fails, the cause will be reported in the [`STATUS`](../data/lc_ctrl.hjson#status) register as well.
+
+See diagram below.
+
+![LC Request Interface](../doc/lc_ctrl_request_interface.svg)
+
+In order to claim the hardware mutex, the value `kMuBi8True` must be written to the claim register ([`CLAIM_TRANSITION_IF`](../data/lc_ctrl.hjson#claim_transition_if)).
+If the register reads back as `kMuBi8True`, then the mutex is claimed, and the interface that won arbitration can continue operations.
+If the value is not read back, then the requesting interface should wait and try again later.
+Note that all transition registers (with the exception of the [`STATUS`](../data/lc_ctrl.hjson#status) register) read back all-zero if the mutex is not claimed.
+
+When an agent is done with the mutex, it releases the mutex by explicitly writing a 0 to the claim register.
+This resets the mux to select no one and also holds the request interface in reset.
+
+#### Vendor-specific Test Control Register
+
+Certain OTP macros require special configuration bits to be set during the test phases.
+Likewise, it is necessary to expose macro-specific status bits during the test phases.
+To this end, the life cycle CSRs contain the [`OTP_VENDOR_TEST_CTRL`](../data/lc_ctrl.hjson#otp_vendor_test_ctrl) and [`OTP_VENDOR_TEST_STATUS`](../data/lc_ctrl.hjson#otp_vendor_test_status) registers, which are reserved for vendor-specific test control and status bits.
+These registers are only active during RAW, TEST_* and RMA life cycle states.
+In all other life cycle states, the status register reads back all-zero, and the control register value will be tied to 0 before forwarding it to the OTP macro.
+
+Similarly to the [Life Cycle Request Interface](#life-cycle-request-interface), the hardware mutex must be claimed in order to access both of these registers.
+Note that these registers read back all-zero if the mutex is not claimed.
+
+### TAP Construction and Isolation
+
+#### Life Cycle TAP Controller
+
+The life cycle TAP controller is functionally very similar to the [RISC-V debug module](https://github.com/lowRISC/opentitan/blob/master/hw/ip/rv_dm/rtl/rv_dm.sv) for the Ibex processor and reuses the same debug transport module (DTM) and the associated debug module interface (DMI).
+The DTM and DMI are specified as part of the [RISC-V external debug specification, v0.13](https://github.com/riscv/riscv-debug-spec/blob/release/riscv-debug-release.pdf) and essentially provide a simple mechanism to read and write to a register space.
+In the case of the life cycle TAP controller this register space is essentially the life cycle CSR space.
+Hence, the [register table](#register-table) is identical for both the SW view and the view through the DMI, with the only difference that the byte offsets have to be converted to word offsets for the DMI.
+
+The RISC-V external debug specification defines the two custom JTAG registers 0x10 (DTM control/status) and 0x11 (DMI).
+The former provides status info such as idle state, number of address bits and RISC-V specification version plus reset control.
+The latter exposes an address, data and operation field for accessing a CSR space.
+
+In order to interact with the LC controller through JTAG, the debugging agent should read out the `abits` field from 0x10 in order to determine the address width in the DMI, and verify that the `version` field is indeed set to 1 to confirm that the DTM implements v0.13 of the spec.
+Then, the debugger can issue a CSR read or write operation via the 0x11 register as explained in more detail in [the RISC-V external debug specification, Chapter 6.1.5](https://github.com/riscv/riscv-debug-spec/blob/release/riscv-debug-release.pdf).
+
+#### TAP and Isolation
+
+As currently defined, the life cycle controller TAP is a separate entity from the main SOC DFT TAP and the processor TAP.
+This physical separation aids in logical isolation, as the SOC DFT TAP can be disabled by DFT_EN, while the processor TAP can be disabled by DEBUG_EN.
+The TAP isolation and multiplexing is implemented in the pinmux IP as [described here](../../pinmux/README.md#strap-sampling-and-tap-isolation).
diff --git a/hw/ip/otp_ctrl/README.md b/hw/ip/otp_ctrl/README.md
index 4d83e66808c49..195ce18dd618a 100644
--- a/hw/ip/otp_ctrl/README.md
+++ b/hw/ip/otp_ctrl/README.md
@@ -57,969 +57,3 @@ It also implies that no matter how the OTP storage or word size may change under
 This standardized interface is defined further below, and the wrapper leverages the same [technology primitive mechanism](../prim/README.md) that is employed in other parts of OpenTitan in order to wrap and abstract technology-specific macros (such as memories and clocking cells) that are potentially closed-source.
 
 In order to enable simulation and FPGA emulation of the OTP controller even without access to the proprietary OTP IP, a generalized and synthesizable model of the OTP IP is provided in the form of a [generic technology primitive](https://github.com/lowRISC/opentitan/blob/master/hw/ip/prim_generic/rtl/prim_generic_otp.sv).
-
-
-# Theory of Operations
-
-Conceptually speaking, the OTP functionality is at a high level split into "front-end" and "back-end".
-The "front-end" contains the logical partitions that feed the hardware and software consumer interfaces of the system.
-The "back-end" represents the programming interface used by hardware and software components to stage the upcoming values.
-The diagram below illustrates this behavioral model.
-
-![OTP Controller Block Diagram](./doc/otp_ctrl_behavioral_model.svg)
-
-Note that the front-end contains both buffered and unbuffered partitions.
-Buffered partitions are sensed once per power cycle and their contents are stored in registers, whereas unbuffered partitions are read on-demand.
-The former are typically partitions that contain data like hardware configuration bits, key material and the life cycle state that need to be always available to the hardware, whereas the latter are large partitions that are accessed infrequently, such as the software configurations.
-Values that are programmed into a buffered partition via the programming interface (coupled with read verification) are merely "staged", and do not take effect until the next power cycle.
-
-The sections below describe the operation of various pieces of the OTP controller and how it supports the described functionality.
-
-## Logical Partitions
-
-The OTP is logically separated into partitions that represent different functions.
-This means the isolation is virtual and maintained by the OTP controller instead of the underlying OTP IP.
-
-Within each logical partition, there are specific enforceable properties
-
-- Confidentiality via secret partitions
-  - This controls whether a particular partition contains secret data.
-  - If secret, a partition is not readable by software once locked, and is scrambled in storage.
-- Read lockability
-  - This controls whether a particular partition disables software readability for later stage software.
-  - Some partitions can be locked statically (by computing and storing an associated digest in OTP), others can be read locked at runtime via CSRs.
-- Write lockability
-  - This controls whether a partition is locked and prevented from future updates.
-  - A locked partition is stored alongside a digest to be used later for integrity verification.
-- Integrity Verification
-  - Once a partition is write-locked by calculating and writing a non-zero [digest](#locking-a-partition) to it, it can undergo periodic verification (time-scale configurable by software).
-This verification takes two forms, partition integrity checks, and storage consistency checks.
-
-Since the OTP is memory-like in nature (it only outputs a certain number of bits per address location), some of the logical partitions are buffered in registers for instantaneous and parallel access by hardware.
-This is a critical point, since after power-up, these particular OTP contents are stored in flip flops and sourced to the system.
-I.e., buffered partitions are **NOT** directly sourced from the OTP macro itself.
-Thus the security of both volatile (OTP controller) and non-volatile (OTP IP) storage becomes important.
-
-### Partition Listing and Description
-
-The OTP controller for OpenTitan contains the seven logical partitions shown below.
-
-{{#include doc/otp_ctrl_partitions.md}}
-
-Generally speaking, the production life cycle of a device is split into 5 stages "Manufacturing" -> "Calibration and Testing" -> "Provisioning" -> "Mission" -> "RMA".
-OTP values are usually programmed during "Calibration and Testing", "Provisioning" and "RMA" stages, as explained below.
-A detailed listing of all the items and the corresponding memory map can be found in the [Programmer's Guide](#programmers-guide)) further below.
-
-### Calibration and Test
-
-During this stage, the device is tested for functionality and calibrated to ensure uniformity.
-The calibration can focus on a number of things, but usually is centered around adjusting clock, voltage and timing sources to remove process variation.
-These calibration values are programmed into the CREATOR_SW_CFG partition, as they are non-secret values meant to be read out by software and programmed into respective peripherals.
-
-Early on during this stage, the various tokens are also programmed into the secret partitions and harvested by the silicon creator.
-
-### Provisioning
-
-During this stage, the device is provisioned with the final firmware and a "unique" seed or identity.
-The secret partitions are populated with root secrets and keys that are critical to establishing the device identity.
-
-As part of injecting the final firmware, the stock-keeping-unit-specific hardware and software configurations are also programmed.
-
-### Life Cycle Partition
-
-The life cycle partition is active throughout all stages and hence it is the **ONLY** partition that cannot be locked.
-After the device finishes provisioning and goes into production, it must retain the ability to transition back to RMA in case of unexpected failures.
-
-In order to support this transition, the [life cycle state](../lc_ctrl/README.md) and counters must always be update-able.
-
-## Locking a Partition
-
-Write access to a partition can be permanently locked when software determines it will no longer make any updates to that partition.
-To lock, an integrity constant is calculated and programmed alongside the other data of that partition.
-The size of that integrity constant depends on the partition size granule, and is either 32bit or 64bit (see also [Direct Access Memory Map](#direct-access-memory-map)).
-
-Once the "integrity digest" is non-zero, no further updates are allowed.
-If the partition is secret, software is in addition no longer able to read its contents (see [Secret Partition description](#secret-vs-nonsecret-partitions)).
-
-Note however, in all partitions, the digest itself is **ALWAYS** readable.
-This gives software an opportunity to confirm that the locking operation has proceeded correctly, and if not, scrap the part immediately.
-
-Calculation of the integrity digest depends on whether the partition requires periodic background verification.
-
-### Vendor Test Partition
-
-The vendor test partition is intended to be used for OTP programming smoke checks during the manufacturing flow.
-The silicon creator may implement these checks inside the proprietary version of the `prim_otp` wrapper.
-This partition behaves like any other SW partition, with the exception that ECC uncorrectable errors will not lead to fatal errors / alerts as they do in all other partitions.
-This is due to the nature of the OTP programming smoke checks, which may leave certain OTP words in a state inconsistent with the ECC polynomial employed upon OTP readout.
-
-### Software Configuration Partitions
-
-The software configuration partitions are used as non-volatile storage for flags, configuration and calibration data.
-As such, the contents of this partition are usually consumed once as part of code execution, or moved to another storage compartment somewhere in the design.
-For example, the clock calibration values and the LDO calibration values are programmed to the analog sensor top (AST) at startup.
-
-As such, it is not necessary to check periodically at the OTP source.
-Instead, software can simply check as part of secure boot and take other measures when these values are programmed into peripherals.
-
-For this partition it is thus the responsibility of software to calculate the integrity digest and program it into the OTP.
-It is also reasonable to shadow (parts of) this partition in main memory, and there is not an immediate impact from OTP contents to hardware.
-
-### Hardware Configuration and Secret Partitions
-
-The hardware and secret partitions directly affect downstream hardware.
-The contents must go through periodic integrity checks and therefore the stored digest is calculated by hardware when software provides the intent to lock (as opposed to the software partitions where the digest has to be calculated by software).
-
-### Life Cycle Partition
-
-The life cycle partition cannot be locked and will therefore not contain a stored digest.
-Note however that only the life cycle controller has access to this partition, i.e., the Direct Access Interface (DAI) cannot read nor write from/to the life cycle partition.
-
-## Secret vs Non-Secret Partitions
-
-Non-secret OTP partitions hold data that can be public; or data that has no impact on security.
-For example, the current value of lock bits or clock calibration values.
-These values are stored in OTP as plaintext.
-
-Secret partitions contain data that are critical to security, for example FLASH scrambling keys, device root secret and unlock tokens.
-These values are stored scrambled in OTP, and are descrambled upon read.
-The currently employed cipher is PRESENT, as it lends itself well to iterative decomposition, and it is a proven lightweight block cipher (see also [PRESENT Scrambling Primitive](../prim/doc/prim_present.md).
-The usage of a block cipher however implies that the secret partitions can only be written in 64bit chunks.
-
-Further, the contents of a particular secret partition are not readable by software once locked (other than the digest which must be always readable); while non-secret partitions are always readable unless read accessibility is explicitly removed by software.
-
-Unfortunately, secret partitions must utilize a global netlist key for the scrambling operation, as there is no other non-volatile storage to store a unique key.
-
-
-## Partition Checks
-
-### Integrity
-
-Once the appropriate partitions have been locked, the hardware integrity checker employs two integrity checks to verify the content of the volatile buffer registers:
-
-1. All buffered partitions have additional ECC protection (8bit ECC for each 64bit block) that is concurrently monitored.
-2. The digest of the partition is recomputed at semi-random intervals and compared to the digest stored alongside the partition.
-
-The purpose of this check is NOT to check between the storage flops and the OTP, but whether the buffer register contents remain consistent with the calculated digest.
-This verification is primarily concerned with whether the storage flops have experienced fault attacks.
-This check applies to only the HW_CFG and SECRET* partitions.
-If a failure is encountered, the OTP controller will send out a `fatal_check_error` alert and reset all of its hardware outputs to their defaults.
-
-### Storage Consistency
-
-This verification ensures the value stored in the buffer registers remain consistent with those in the OTP.
-This process re-reads the OTP at semi-random intervals and confirms the value read is the same as the value stored.
-Note, given there are integrity checks in parallel, it is not necessary for some partitions to check ALL read contents for consistency.
-If there is an integrity digest, only the digest needs to be read; otherwise, all values must be read.
-
-
-This check applies to LIFE_CYCLE, HW_CFG and SECRET* partitions.
-If a failure is encountered, the OTP controller will send out a `fatal_check_error` alert and reset all of its hardware outputs to their defaults.
-
-Note that checks applied to life cycle could cause a failure if life cycle is updated, because life cycle is the only partition that may contain live updates.
-The controller hence detects this condition based on the `lc_check_byp_en_i` signal coming from the life cycle controller, and pauses background checks on this partition in order to prevent false positives.
-
-### Secret Partition Integrity Checks
-
-Since the secret partitions are stored scrambled, this also implies the integrity digest is calculated over the scrambled form.
-In order to balance the amount of buffer registers needed, only the decrypted form of the secret partitions is held in buffer registers.
-Hardware calculates the digest by re-scrambling the data before passing it through the digest.
-
-
-## Power-up and Sense
-
-The OTP controller partition storage must output a specified safe default (it is not always 0 like a blank OTP) upon reset release.
-This default output must remain until the OTP controller completes all checks.
-
-The OTP controller reads from the OTP IP.
-If the reads pass OTP IP internal checks (for example ECC or redundancy), the partition storage is updated; however the output is still held at the default state via an output mux.
-After all read is complete, the OTP controller performs integrity checks on the HW_CFG and SECRET* partitions.
-If a partition fails the integrity checks at this point it would signal an initialization error in the status CSR and abort further initialization.
-
-After all integrity checks are complete, the OTP controller releases the output gating and marks outputs as valid.
-However, any partition marked with "error" continues to hold its output in the default state.
-
-Once the above steps are complete, the partition storage in buffered registers is not updated again (except for updates to the life cycle partition through the life cycle interface).
-I.e., values programmed to OTP via the programming interface will not be visible in buffered registers until after the next power cycle.
-
-At this point, outputs of the partition storage are NOT expected to change unless a periodic check suddenly fails.
-When this failure occurs, all outputs are reverted to their default state, and an alert is immediately triggered to the alert handler.
-For timing purposes, OTP outputs can be treated as semi-static, as this error event should be rare and exceptional.
-
-
-## Partition Defaults
-
-Partition defaults are context specific.
-For example, a hardware configuration item that locks down specific access should default to "no access".
-This ensures that a glitch attack on the OTP cannot easily revert the design to an insecure state.
-
-This hence suggests that when an OTP is all 0's and all 1's, it should, whenever possible, reflect an invalid or inert state in the encoding space of the affected item.
-This also implies the reset state of consuming agents (for example key manager and life cycle), should default to invalid / inert state as well.
-
-
-## Program and Read Ports
-
-As shown previously, the OTP is split into a front and back end.
-The back-end interface is primarily used to update OTP contents, and read back for debug and verification purposes.
-Despite being a separate functional access port from the logical partitions, the program and read ports are subjected to the same access controls.
-
-When a partition is write-locked, programming accesses are disallowed.
-If the partition is secret, read accesses by the back-end interface are also disallowed (except for the digest which must always be readable).
-Software can also disable any read accesses to the software configuration partitions via CSR settings to prevent later stage software from reading any content.
-
-The exception to the above is the life cycle partition.
-The life cycle controller interface also acts as a "back-end" interface that always has programming access to ensure life cycle state can be advanced.
-
-Note, the program and read ports can conflict with ongoing background storage checks, and the OTP controller arbitrates between these two sides.
-An in-progress operation will always be completed.
-Afterwards, or when two requests arrive at the same time, the priority is life cycle > programming interface > on-demand read accesses via CSR windows > background checks.
-
-
-## Programming the OTP
-
-The OTP controller has two programming paths:
-
-1. a functional programming path through software (the program port),
-2. Life cycle programming path through hardware.
-
-The functional interface is used to update all partitions except for life cycle.
-As mentioned previously, any updates made during the current power cycle are **NOT** reflected in the buffered partitions until the next reboot.
-
-The life cycle interface is used to update the life cycle state and transition counter only.
-The commands are issued from the [life cycle controller](../lc_ctrl/README.md), and similarly, successful or failed indications are also sent back to the life cycle controller.
-Similar to the functional interface, the life cycle controller allows only one update per power cycle, and after a requested transition reverts to an inert state until reboot.
-
-For more details on how the software programs the OTP, please refer to the [Programmer's Guide](#programmers-guide)) further below.
-
-
-## Hardware Interfaces
-
-### Parameters
-
-The following table lists the instantiation parameters of OTP.
-Note that parameters prefixed with `RndCnst` are random netlist constants that need to be regenerated via topgen before the tapeout (typically by the silicon creator).
-
-Parameter                   | Default (Max) | Top Earlgrey | Description
-----------------------------|---------------|--------------|---------------
-`AlertAsyncOn`              | 2'b11         | 2'b11        |
-`RndCnstLfsrSeed`           | (see RTL)     | (see RTL)    | Seed to be used for the internal 40bit partition check timer LFSR. This needs to be replaced by the silicon creator before the tapeout.
-`RndCnstLfsrPerm`           | (see RTL)     | (see RTL)    | Permutation to be used for the internal 40bit partition check timer LFSR. This needs to be replaced by the silicon creator before the tapeout.
-`RndCnstKey`                | (see RTL)     | (see RTL)    | Random scrambling keys for secret partitions, to be used in the [scrambling datapath](#scrambling-datapath).
-`RndCnstDigestConst`        | (see RTL)     | (see RTL)    | Random digest finalization constants, to be used in the [scrambling datapath](#scrambling-datapath).
-`RndCnstDigestIV`           | (see RTL)     | (see RTL)    | Random digest initialization vectors, to be used in the [scrambling datapath](#scrambling-datapath).
-`RndCnstRawUnlockToken`     | (see RTL)     | (see RTL)    | Global RAW unlock token to be used for the first life cycle transition. See also [conditional life cycle transitions](../lc_ctrl/README.md#conditional-transitions).
-
-### Signals
-
-* [Interface Tables](data/otp_ctrl.hjson#interfaces)
-
-The OTP controller contains various interfaces that connect to other comportable IPs within OpenTitan, and these are briefly explained further below.
-
-#### EDN Interface
-
-The entropy request interface that talks to EDN in order to fetch fresh entropy for ephemeral SRAM scrambling key derivation and the LFSR counters for background checks.
-It is comprised of the `otp_edn_o` and `otp_edn_i` signals and follows a req / ack protocol.
-
-See also [EDN documentation](../edn/README.md).
-
-#### Power Manager Interface
-
-The power manager interface is comprised of three signals overall: an initialization request (`pwr_otp_i.otp_init`), an initialization done response (`pwr_otp_o.otp_done`) and an idle indicator (`pwr_otp_o.otp_idle`).
-
-The power manager asserts `pwr_otp_i.otp_init` in order to signal to the OTP controller that it can start initialization, and the OTP controller signals completion of the initialization sequence by asserting `pwr_otp_o.otp_done` (the signal will remain high until reset).
-
-The idle indication signal `pwr_otp_o.otp_idle` indicates whether there is an ongoing write operation in the Direct Access Interface (DAI) or Life Cycle Interface (LCI), and the power manager uses that indication to determine whether a power down request needs to be aborted.
-
-Since the power manager may run in a different clock domain, the `pwr_otp_i.otp_init` signal is synchronized within the OTP controller.
-The power manager is responsible for synchronizing the `pwr_otp_o.otp_done` and `pwr_otp_o.otp_idle` signals.
-
-See also [power manager documentation](../pwrmgr/README.md).
-
-#### Life Cycle Interfaces
-
-The interface to the life cycle controller can be split into three functional sub-interfaces (vendor test, state output, state transitions), and these are explained in more detail below.
-Note that the OTP and life cycle controllers are supposed to be in the same clock domain, hence no additional signal synchronization is required.
-See also [life cycle controller documentation](../lc_ctrl/README.md) for more details.
-
-##### Vendor Test Signals
-
-The `lc_otp_vendor_test_i` and `lc_otp_vendor_test_o` signals are connected to a 32bit control and a 32bit status register in the life cycle TAP, respectively, and are directly routed to the `prim_otp` wrapper.
-These control and status signals may be used by the silicon creator to exercise the OTP programming smoke checks on the VENDOR_TEST partition.
-The signals are gated with the life cycle state inside the life cycle controller such that they do not have any effect in production life cycle states.
-
-##### State, Counter and Token Output
-
-After initialization, the life cycle partition contents, as well as the tokens and personalization status is output to the life cycle controller via the `otp_lc_data_o` struct.
-The life cycle controller uses this information to determine the life cycle state, and steer the appropriate qualifier signals.
-Some of these qualifier signals (`lc_dft_en_i`, `lc_creator_seed_sw_rw_en_i`, `lc_seed_hw_rd_en_i` and `lc_escalate_en_i`) are fed back to the OTP controller in order to ungate testing logic to the OTP macro; enable SW write access to the `SECRET2` partition; enable hardware read access to the root key in the `SECRET2` partition; or to push the OTP controller into escalation state.
-
-A possible sequence for the signals described is illustrated below.
-```wavejson
-{signal: [
-  {name: 'clk_i',                           wave: 'p.................'},
-  {name: 'otp_lc_data_o.valid',             wave: '0.|...|.1.|...|...'},
-  {name: 'otp_lc_data_o.state',             wave: '03|...|...|...|...'},
-  {name: 'otp_lc_data_o.count',             wave: '03|...|...|...|...'},
-  {},
-  {name: 'otp_lc_data_o.test_unlock_token', wave: '0.|...|.3.|...|...'},
-  {name: 'otp_lc_data_o.test_exit_token',   wave: '0.|...|.3.|...|...'},
-  {name: 'otp_lc_data_o.test_tokens_valid', wave: '0.|...|.3.|...|...'},
-  {},
-  {name: 'otp_lc_data_o.rma_token',         wave: '0.|.3.|...|...|...'},
-  {name: 'otp_lc_data_o.rma_token_valid',   wave: '0.|.3.|...|...|...'},
-  {},
-  {name: 'otp_lc_data_o.secrets_valid',     wave: '0.|.3.|...|...|...'},
-  {},
-  {name: 'lc_creator_seed_sw_rw_en_i',      wave: '0.|...|...|.4.|...'},
-  {name: 'lc_seed_hw_rd_en_i',              wave: '0.|...|...|.4.|...'},
-  {name: 'lc_dft_en_i',                     wave: '0.|...|...|.4.|...'},
-  {},
-  {name: 'lc_escalate_en_i',                wave: '0.|...|...|...|.5.'},
-]}
-```
-
-Note that the `otp_lc_data_o.valid` signal is only asserted after the `LIFE_CYCLE`, `SECRET0` and `SECRET2` partitions have successfully initialized, since the life cycle collateral contains information from all three partitions.
-The `otp_lc_data_o.test_tokens_valid` and `otp_lc_data_o.rma_token_valid` signals are multibit valid signals indicating whether the corresponding tokens are valid.
-The ``otp_lc_data_o.secrets_valid`` signal is a multibit valid signal that is set to `lc_ctrl_pkg::On` iff the `SECRET2` partition containing the root keys has been locked with a digest.
-
-
-##### State Transitions
-
-In order to perform life cycle state transitions, the life cycle controller can present the new value of the life cycle state and counter via the programming interface as shown below:
-
-```wavejson
-{signal: [
-  {name: 'clk_i',                          wave: 'p.......'},
-  {name: 'lc_otp_program_i.req',           wave: '01.|..0.'},
-  {name: 'lc_otp_program_i.state',         wave: '03.|..0.'},
-  {name: 'lc_otp_program_i.count',         wave: '03.|..0.'},
-  {name: 'lc_otp_program_o.ack',           wave: '0..|.10.'},
-  {name: 'lc_otp_program_o.err',           wave: '0..|.40.'},
-]}
-```
-
-The request must remain asserted until the life cycle controller has responded.
-An error is fatal and indicates that the OTP programming operation has failed.
-
-Note that the new state must not clear any bits that have already been programmed to OTP - i.e., the new state must be incrementally programmable on top of the previous state.
-There are hence some implications on the life cycle encoding due to the ECC employed, see [life cycle state encoding](../lc_ctrl/README.md#life-cycle-manufacturing-state-encodings) for details.
-
-Note that the behavior of the `lc_otp_program_i.otp_test_ctrl` signal is vendor-specific, and hence the signal is set to `x` in the timing diagram above.
-The purpose of this signal is to control vendor-specific test mechanisms, and its value will only be forwarded to the OTP macro in RAW, TEST_* and RMA states.
-In all other life cycle states this signal will be clamped to zero.
-
-#### Interface to Key Manager
-
-The interface to the key manager is a simple struct that outputs the CREATOR_ROOT_KEY_SHARE0 and CREATOR_ROOT_KEY_SHARE1 keys via `otp_keymgr_key_o` if these secrets have been provisioned and locked (via CREATOR_KEY_LOCK).
-Otherwise, this signal is tied to a random netlist constant.
-
-Since the key manager may run in a different clock domain, the key manager is responsible for synchronizing the `otp_keymgr_key_o` signals.
-
-#### Interface to Flash Scrambler
-
-The interface to the FLASH scrambling device is a simple req/ack interface that provides the flash controller with the two 128bit keys for data and address scrambling.
-
-The keys can be requested as illustrated below:
-
-```wavejson
-{signal: [
-  {name: 'clk_i',                      wave: 'p...........'},
-  {name: 'flash_otp_key_i.data_req',   wave: '01.|..0.|...'},
-  {name: 'flash_otp_key_i.addr_req',   wave: '01.|....|..0'},
-  {name: 'flash_otp_key_o.data_ack',   wave: '0..|.10.|...'},
-  {name: 'flash_otp_key_o.addr_ack',   wave: '0..|....|.10'},
-  {name: 'flash_otp_key_o.key',        wave: '0..|.30.|.40'},
-  {name: 'flash_otp_key_o.seed_valid', wave: '0..|.10.|.10'},
-]}
-```
-
-The keys are derived from the FLASH_DATA_KEY_SEED and FLASH_ADDR_KEY_SEED values stored in the `SECRET1` partition using the [scrambling primitive](#scrambling-datapath).
-If the key seeds have not yet been provisioned, the keys are derived from all-zero constants, and the `flash_otp_key_o.seed_valid` signal will be set to 0 in the response.
-
-Note that the req/ack protocol runs on the OTP clock.
-It is the task of the scrambling device to synchronize the handshake protocol by instantiating the `prim_sync_reqack.sv` primitive as shown below.
-
-![OTP Key Req Ack](./doc/otp_ctrl_key_req_ack.svg)
-
-Note that the key and nonce output signals on the OTP controller side are guaranteed to remain stable for at least 62 OTP clock cycles after the `ack` signal is pulsed high, because the derivation of a 64bit half-key takes at least two passes through the 31-cycle PRESENT primitive.
-Hence, if the scrambling device clock is faster or in the same order of magnitude as the OTP clock, the data can be directly sampled upon assertion of `src_ack_o`.
-If the scrambling device runs on a significantly slower clock than OTP, an additional register (as indicated with dashed grey lines in the figure) has to be added.
-
-#### Interfaces to SRAM and OTBN Scramblers
-
-The interfaces to the SRAM and OTBN scrambling devices follow a req / ack protocol, where the scrambling device first requests a new ephemeral key by asserting the request channel (`sram_otp_key_i[*]`, `otbn_otp_key_i`).
-The OTP controller then fetches entropy from EDN and derives an ephemeral key using the SRAM_DATA_KEY_SEED and the [PRESENT scrambling data path](#scrambling-datapath).
-Finally, the OTP controller returns a fresh ephemeral key via the response channels (`sram_otp_key_o[*]`, `otbn_otp_key_o`), which complete the req / ack handshake.
-The wave diagram below illustrates this process for the OTBN scrambling device.
-
-```wavejson
-{signal: [
-  {name: 'clk_i',                     wave: 'p.......'},
-  {name: 'otbn_otp_key_i.req',        wave: '01.|..0.'},
-  {name: 'otbn_otp_key_o.ack',        wave: '0..|.10.'},
-  {name: 'otbn_otp_key_o.nonce',      wave: '0..|.30.'},
-  {name: 'otbn_otp_key_o.key',        wave: '0..|.30.'},
-  {name: 'otbn_otp_key_o.seed_valid', wave: '0..|.10.'},
-]}
-```
-
-If the key seeds have not yet been provisioned, the keys are derived from all-zero constants, and the `*.seed_valid` signal will be set to 0 in the response.
-It should be noted that this mechanism requires the EDN and entropy distribution network to be operational, and a key derivation request will block if they are not.
-
-Note that the req/ack protocol runs on the OTP clock.
-It is the task of the scrambling device to perform the synchronization as described in the previous subsection on the [flash scrambler interface](#interface-to-flash-scrambler).
-
-#### Hardware Config Bits
-
-The bits of the HW_CFG partition are output via the `otp_hw_cfg_o` struct.
-IPs that consume collateral stored in this partition shall connect to this struct via the topgen feature, and break out the appropriate bits by either accessing the correct index or using the struct fields.
-These fields are autogenerated from the memory map items allocated to the HW_CFG partition, and the autogenerated struct type can be found in the `otp_ctrl_part_pkg.sv` package.
-Note that it is the task of the receiving IP to synchronize these bits accordingly to the local clock.
-For convenience, a valid bit is also available in that struct.
-The valid bit indicates that the HW_CFG partition has initialized.
-
-### Parameter and Memory Map Changes after D3/V3
-
-Note that all instantiation parameters can be changed without affecting D3/V3 status of the module.
-Similarly, it is permissible to change the contents (partition size, adding and removing items) of the `CREATOR_SW_CFG`, `OWNER_SW_CFG` and `HW_CFG` partitions without affecting D3 status.
-Note however that partition size changes may affect V3 coverage metrics, hence if the size of any of the above three partitions is changed, V3 needs to be re-assessed.
-
-## Design Details
-
-### Block Diagram
-
-The following is a high-level block diagram that illustrates everything that has been discussed.
-
-![OTP Controller Block Diagram](./doc/otp_ctrl_blockdiag.svg)
-
-Each of the partitions P0-P7 has its [own controller FSM](#partition-implementations) that interacts with the OTP wrapper and the [scrambling datapath](#scrambling-datapath) to fulfill its tasks.
-The partitions expose the address ranges and access control information to the Direct Access Interface (DAI) in order to block accesses that go to locked address ranges.
-Further, the only two blocks that have (conditional) write access to the OTP are the DAI and the Life Cycle Interface (LCI) blocks.
-The partitions can only issue read transactions to the OTP macro.
-Note that the access ranges of the DAI and the LCI are mutually exclusive.
-I.e., the DAI cannot read from nor write to the life cycle partition.
-The LCI cannot read the OTP, but is allowed to write to the life cycle partition.
-
-The CSR node on the left side of this diagram connects to the DAI, the OTP partitions (P0-P7) and the OTP wrapper through a gated TL-UL interface.
-All connections from the partitions to the CSR node are read-only, and typically only carry a subset of the information available.
-E.g., the secret partitions only expose their digest value via the CSRs.
-
-The Key Derivation Interface (KDI) on the bottom right side interacts with the scrambling datapath, the EDN and the partition holding the scrambling root keys in order to derive static and ephemeral scrambling keys for FLASH and SRAM scrambling.
-
-The test access gate shown at the top of the block diagram is governed by the life cycle qualification signal `dft_en_i`, which is only enabled during the TEST_UNLOCKED* life cycle states.
-Otherwise, test access via this TL-UL window is locked down.
-
-In addition to the blocks mentioned so far, the OTP controller also contains an LFSR timer that creates pseudo-randomly distributed partition check requests, and provides pseudo random data at high bandwidth in the event of a secure erase request due to chip-wide alert escalation.
-For security reasons, the LFSR is periodically reseeded with entropy coming from EDN.
-
-### Data Allocation and Packing
-#### Software View
-
-The effective word width of an OTP IP typically depends on a couple of factors, including the redundancy scheme employed.
-For the design at hand, it is assumed that this native OTP word-width is 16bit.
-For software convenience, however, these details are abstracted and the open-source OTP controller exposes the OTP storage as a linear address space of 32bit words, which is aligned with the machine word size of the Ibex processor.
-Since the OTP IP employs a redundancy mechanism similar to ECC, this implies however that write operations take place at a granularity of 32bit blocks for non-secret and 64bit blocks for secret partitions (due to the scrambling).
-Hence, software is responsible to appropriately pack and program items, since each 32bit location can only be programmed once.
-
-#### Life Cycle View
-
-Since the life cycle partition is the only partition that needs live updates in-field, proper care must be taken to properly encode data in this partition such that incremental updates are possible.
-The life cycle state is hence encoded such that incremental updates to the state are always carried out at the granularity of a 16bit word.
-Further, the life cycle transition counter is encoded such that each stroke consumes a full 16bit word for the same reason.
-
-See [life cycle controller documentation](../lc_ctrl/README.md) for more details on the life cycle encoding.
-
-### Partition Controllers
-
-In RTL, we distinguish between buffered and unbuffered partition modules.
-These are parameterized, such that we can assemble the array of OTP partitions with these two modules only.
-The corresponding controller FSMs are explained in more detail below.
-
-#### Unbuffered Partition
-
-![Unbuffered Partition FSM](./doc/otp_ctrl_unbuf_part_fsm.svg)
-
-As shown above, the unbuffered partition module has a relatively simple controller FSM that only reads out the digest value of the partition upon initialization, and then basically waits for TL-UL read transactions to its corresponding window in the CSR space.
-
-Write access through the DAI will be locked in case the digest is set to a non-zero value.
-Also, read access through the DAI and the CSR window can be locked at runtime via a CSR.
-Read transactions through the CSR window will error out if they are out of bounds, or if read access is locked.
-
-Note that unrecoverable [OTP errors](#generalized-open-source-interface), ECC failures in the digest register or external escalation via `lc_escalate_en` will move the partition controller into a terminal error state.
-
-#### Buffered Partition
-
-![Buffered Partition FSM](./doc/otp_ctrl_buf_part_fsm.svg)
-
-The controller FSM of the buffered partition module is more complex than the unbuffered counterpart, since it has to account for scrambling and digest calculation.
-
-Upon initialization, the controller reads out the whole partition and descrambles it on the fly if needed.
-
-Then, right after the initial readout, the partition controller jumps into the first integrity check, which behaves somewhat differently, depending on whether the partition is digest protected (or not) and/or scrambled (or not).
-If the partition is not digest protected, or if the digest has not yet been computed, the check completes right away, and the buffered values are released for hardware broadcast.
-Otherwise, the partition contents in the buffer registers are re-scrambled if needed, and a digest is computed on the fly.
-If the computed digest matches with the one that has been read out before, the buffered registers are released for hardware broadcast.
-Otherwise, the buffered values are gated to their default, and an alert is triggered through the error handling logic.
-
-After initialization, the integrity check (as described above) and the consistency check can be triggered by the LFSR timer mechanism on a periodic basis.
-
-The consistency check behaves differently, depending on whether the partition is digest protected or not.
-If it is, the consistency check will read out the digest stored in OTP and compare it with the value stored in the buffer register.
-Otherwise, if no digest is available, the controller will read out the whole partition from OTP, and compare it to the contents stored in the buffer registers.
-In case of a mismatch, the buffered values are gated to their default, and an alert is triggered through the error handling logic.
-
-Note that in case of unrecoverable OTP errors or ECC failures in the buffer registers, the partition controller FSM is moved into a terminal error state, which locks down all access through DAI and clamps the values that are broadcast in hardware to their defaults.
-
-External escalation via the `lc_escalate_en` signal will move the partition controller FSM into the terminal error state as well.
-See [life cycle controller documentation](../lc_ctrl/README.md) for more details.
-
-### Direct Access Interface Control
-
-![Direct Access Interface FSM](./doc/otp_ctrl_dai_fsm.svg)
-
-Upon reset release, the DAI controller first sends an initialization command to the OTP macro.
-Once the OTP macro becomes operational, an initialization request is sent to all partition controllers, which will read out and initialize the corresponding buffer registers.
-The DAI then becomes operational once all partitions have initialized, and supports read, write and digest calculation commands (see [here](#direct-access-interface) for more information about how to interact with the DAI through the CSRs).
-
-Read and write commands transfer either 32bit or 64bit of data from the OTP to the corresponding CSR and vice versa. The access size is determined automatically, depending on whether the partition is scrambled or not. Also, (de)scrambling is performed transparently, depending on whether the partition is scrambled or not.
-
-Digest calculation commands read out the complete contents of a particular partition, compute a digest and write that digest value to the predefined location at the end of the partition.
-
-Note that any unrecoverable OTP error will move the DAI into a terminal error state, where all access through the DAI will be locked.
-Also, the DAI consumes the read and write access information provided by the partition controller, and if a certain read or write access is not permitted, a recoverable error will be flagged in the status / error CSRs.
-
-### Life Cycle Interface Control
-
-![Life Cycle Interface FSM](./doc/otp_ctrl_lci_fsm.svg)
-
-Upon reset release the LCI FSM waits until the OTP controller has initialized and the LCI gets enabled.
-Once it is in the idle state, life cycle state updates can be initiated via the life cycle interface as [described here](#state-transitions).
-The LCI controller takes the life cycle state to be programmed and writes all 16bit words to OTP.
-In case of unrecoverable OTP errors, the FSM signals an error to the life cycle controller and moves into a terminal error state.
-
-### Key Derivation Interface
-
-![Key Derivation Interface FSM](./doc/otp_ctrl_kdi_fsm.svg)
-
-Upon reset release the KDI FSM waits until the OTP controller has initialized and the KDI gets enabled.
-Once it is in the idle state, key derivation can be requested via the [flash](#interface-to-flash-scrambler) and [sram](#interfaces-to-sram-and-otbn-scramblers) interfaces.
-Based on which interface makes the request, the KDI controller will evaluate a variant of the PRESENT digest mechanism as described in more detail below.
-
-### Scrambling Datapath
-
-![OTP Digest Mechanism](./doc/otp_ctrl_digest_mechanism.svg)
-
-The scrambling datapath is built around an iterative implementation of the [PRESENT lightweight cipher](../prim/doc/prim_present.md) that performs one round per cycle.
-The datapath contains some additional multiplexing circuitry to enable the DAI, KDI and partition controllers to evaluate different functions with the same datapath.
-The algorithmic steps of these functions are explained in more detail below.
-
-#### Scrambling
-
-As illustrated in subfigure a) in the diagram above, the standard 128bit-key PRESENT configuration with 31 rounds is used for scrambling operations.
-The key used for scrambling is a global netlist constant chosen by the silicon creator, and all secret partitions are encrypted using their own distinct netlist constant.
-Note that the amount of data that is being scrambled is small (160byte = 20 x 64bit blocks) and the scrambled data remains constant.
-Hence, no additional masking or diversification scheme is applied since only a very limited amount of information can be gathered by observing the scrambling operation via side-channels.
-
-#### Digest Calculation
-
-The integrity digests used in the [partition checks](#partition-checks) are computed using a custom [Merkle-Damgard](https://en.wikipedia.org/wiki/Merkle%E2%80%93Damg%C3%A5rd_construction) scheme, where the employed one-way compression function F is constructed by using PRESENT in a [Davies-Meyer arrangement](https://en.wikipedia.org/wiki/One-way_compression_function#Davies%E2%80%93Meyer).
-This is illustrated in subfigure b).
-
-At the beginning of the digest calculation the 64bit state is initialized with an initialization vector (IV).
-Then, the data to be digested is split into 128bit chunks, each of which is used as a 128bit key input for updating the 64bit state with the compression function F.
-Chunks that are not aligned with 128bit are padded with zero, and the finalization operation consists of another 31-round encryption pass with a finalization constant.
-Note that both the IV as well as the finalization constant are global netlist constants chosen by the silicon creator.
-
-#### Scrambling Key Derivation
-
-The key derivation functions for ephemeral SRAM and static FLASH scrambling keys employ a similar construction as the digest calculation function.
-In particular, the keys are derived by repeatedly reducing a (partially random) block of data into a 64bit block, as illustrated in subfigures c) and d).
-
-For ephemeral SRAM scrambling keys, the data block is composed of the 128bit SRAM_DATA_KEY_SEED stored in OTP, as well as 128bit of fresh entropy fetched from the EDN.
-This process is repeated twice in order to produce a 128bit key.
-
-For static FLASH scrambling keys, the data block is composed of a 128bit part of either the FLASH_DATA_KEY_SEED or the FLASH_ADDR_KEY_SEED stored in OTP.
-These key seeds are 256bit in size, allowing to use a unique chunk of 128bit of key seed data to derive a 64bit half of a particular scrambling key.
-
-Note that the IV and finalization constants are distinct for SRAM and FLASH data and FLASH address scrambling keys.
-These constants are chosen by the silicon creator prior to the tapeout.
-
-### Access Arbitration
-
-Access to the OTP wrapper and the scrambling datapath are both round-robin arbitrated, where the former arbitration occurs at cycle level (i.e., individual OTP memory accesses), and the latter occurs at the level of complete transactions (i.e., full digest or encryption).
-Arbitration at transaction level is implemented similarly to cycle-based arbitration, with the difference that the grant signals remain asserted until the requestor deasserts the request (thereby releasing the arbiter, which acts as a mutex in this case).
-This behavior is illustrated in the example below.
-
-```wavejson
-{signal: [
-  {name: 'clk_i',                  wave: 'p............'},
-  {name: 'part_scrmbl_mtx_req[0]', wave: '01....0.1....'},
-  {name: 'part_scrmbl_mtx_req[1]', wave: '0.1......0...'},
-  {name: 'part_scrmbl_mtx_req[2]', wave: '0.1........0.'},
-  {},
-  {name: 'part_scrmbl_mtx_gnt[0]', wave: '01....0....1.'},
-  {name: 'part_scrmbl_mtx_gnt[1]', wave: '0.....1..0...'},
-  {name: 'part_scrmbl_mtx_gnt[2]', wave: '0........1.0.'},
-]}
-```
-
-### Primitive Wrapper and FPGA Emulation
-
-![OTP Wrapper Block Diagram](./doc/otp_ctrl_prim_otp.svg)
-
-The OTP IP is wrapped up in a primitive wrapper that exposes a TL-UL interface for testing purposes, and a generalized open-source interface for functional operation (described below).
-Any OTP redundancy mechanism like per-word ECC is assumed to be handled inside the wrapper, which means that the word width exposed as part of the generalized interface is the effective word width.
-
-Note that the register space exposed via the TL-UL test interface, as well as DFT and power-related signals are dependent on the underlying proprietary OTP IP.
-They are therefore not further described in this document.
-
-#### Generalized Open-source Interface
-
-The generalized open-source interface uses a couple of parameters (defaults set for Earlgrey configuration).
-
-Parameter      | Default | Top Earlgrey  | Description
----------------|---------|---------------|---------------
-`Width`        | 16      | 16            | Native OTP word width.
-`Depth`        | 1024    | 1024          | Depth of OTP macro.
-`CmdWidth`     | 2       | 2             | Width of the OTP command.
-`ErrWidth`     | 3       | 3             | Width of error code output signal.
-`PwrSeqWidth`  | 2       | 2             | Width of power sequencing signals to/from AST.
-`SizeWidth`    | 2       | 2             | Width of the size field.
-`IfWidth`      | 2^`SizeWidth` * `Width` | 2^`SizeWidth` * `Width` | Data interface width.
-
-The generalized open-source interface is a simple command interface with a ready / valid handshake that makes it possible to introduce back pressure if the OTP macro is not able to accept a command due to an ongoing operation.
-
-In order to facilitate the scrambling and digest operations, the data width has been sized such that data blocks up to the PRESENT block size (64bit) can be transferred across the generalized interface. The actual size of a transfer is determined via the size_i field. Transfer sizes are specified in multiples of the native OTP block size, as listed below.
-
-Value of `size_i` | #Native OTP Words | Bit Slice
-------------------|-------------------|------------
-2'b00             |                 1 | `{word0} = data[15:0]`
-2'b01             |                 2 | `{word1, word0} = data[31:0]`
-2'b10             |                 3 | `{word2, word1, word0} = data[47:0]`
-2'b11             |                 4 | `{word3, word2, word1, word0} = data[63:0]`
-
-Responses are returned in-order via a unidirectional response interface (i.e., without back pressure capability).
-Downstream logic must be able to sink the response in any case.
-The response optionally carries read data, depending on whether the operation that took place was a read or not.
-Also, an error signal returns a non-zero error code in case an error occurred while carrying out the OTP command.
-
-The signals pertaining to the generalized open-source interface are listed below.
-
-Signal                  | Direction        | Type                        | Description
-------------------------|------------------|-----------------------------|---------------
-`fatal_alert_o`         | `output`         | `logic`                     | Fatal alert output from the primitive. This is connected to a separate alert channel in the instantiating IP. The instantiating IP latches the alert indication and continuously outputs alert events until reset.
-`recov_alert_o`         | `output`         | `logic`                     | Recoverable alert output from the primitive. This is connected to a separate alert channel in the instantiating IP. Should only be pulsed high for each alert occurrence. The instantiating IP then sends out a single alert event for each pulse.
-`ready_o`               | `output`         | `logic`                     | Ready signal for the command handshake.
-`valid_i`               | `input`          | `logic`                     | Valid signal for the command handshake.
-`size_i`                | `input`          | `logic [SizeWidth-1:0]`     | Number of native OTP words to transfer, minus one: `2'b00 = 1 native word` ... `2'b11 = 4 native words`.
-`cmd_i`                 | `input`          | `logic [CmdWidth-1:0]`      | OTP command: `2'b00 = read`, `2'b01 = write`, `2'b11 = initialize`
-`addr_i`                | `input`          | `logic [$clog2(Depth)-1:0]` | OTP word address.
-`wdata_i`               | `input`          | `logic [IfWidth-1:0]`       | Write data for write commands.
-`valid_o`               | `output`         | `logic`                     | Valid signal for command response.
-`rdata_o`               | `output`         | `logic [IfWidth-1:0]`       | Read data from read commands.
-`err_o`                 | `output`         | `logic [ErrWidth-1:0]`      | Error code.
-
-The `prim_otp` wrapper implements the `Macro*` error codes (0x0 - 0x4) defined in the [OTP error handling](#error-handling).
-
-The timing diagram below illustrates the timing of a command.
-Note that both read and write commands return a response, and each command is independent of the previously issued commands.
-The latency from accepting a command to returning a response depends on the underlying OTP IP and is typically larger than 10 cycles.
-The returned values depend on the command type and whether an error occurred or not.
-
-```wavejson
-{
-  signal: [
-    { name: 'clk_i',    wave: 'p.............' },
-    { name: 'ready_o',  wave: '0..10|.10.|...' , node: '...a...c'},
-    { name: 'valid_i',  wave: '01..0|1.0.|...' },
-    { name: 'size_i',   wave: '03..0|3.0.|...' },
-    { name: 'cmd_i',    wave: '04..0|4.0.|...' },
-    { name: 'wdata_i',  wave: '05..0|5.0.|...' },
-    { name: 'valid_o',  wave: '0....|..10|.10' , node: '........b...d'},
-    { name: 'rdata_o',  wave: '0....|..50|.50' },
-    { name: 'err_o',    wave: '0....|..40|.40' },
-  ],
-  edge: [
-   'a~>b',
-   'c~>d',
-  ],
-  head: {
-    text: 'Timing of an OTP command.',
-  },
-  foot: {
-    text: "Cmd's are accepted in cycles 3/7, and the corresponding responses return in cycles 8/12.",
-    tick: 0,
-  }
-}
-```
-
-Note that the open source OTP controller allows up to two outstanding OTP commands, meaning that it is permissible to acknowledge an incoming command and start working on it while the results of the last command are still in the process of being output (e.g., due to an output register stage).
-
-#### Generic Simulation and FPGA Emulation Model
-
-For open-source simulation and FPGA emulation, a synthesizable and generic OTP wrapper module is provided (`prim_generic_otp`).
-This is automatically selected in the OpenTitan build flow via the technology primitive mechanism if no proprietary OTP IP is available for a specific technology.
-The OTP storage in `prim_generic_otp` is emulated using a standard RAM primitive `prim_generic_ram_1p`.
-While this storage element is volatile, the primitive is constructed such that the contents are not wiped upon a system-wide reset.
-I.e., only a power-cycle wipes the RAM primitive, thereby enabling limited emulation of the OTP function and life cycle transitions also on an FPGA device.
-
-
-# Programmer's Guide
-
-During provisioning and manufacturing, SW interacts with the OTP controller mostly through the Direct Access Interface (DAI), which is described below.
-Afterwards during production, SW is expected to perform only read accesses via the exposed CSRs and CSR windows, since all write access to the partitions has been locked down.
-
-The following sections provide some general guidance, followed by an explanation of the DAI and a detailed OTP memory map.
-Typical programming sequences are explained at the end of the Programmer's guide.
-
-## General Guidance
-
-### Initialization
-
-The OTP controller initializes automatically upon power-up and is fully operational by the time the processor boots.
-The only initialization steps that SW should perform are:
-
-1. Check that the OTP controller has successfully initialized by reading [`STATUS`](data/otp_ctrl.hjson#status). I.e., make sure that none of the ERROR bits are set, and that the DAI is idle ([`STATUS.DAI_IDLE`](data/otp_ctrl.hjson#status)).
-2. Set up the periodic background checks:
-    - Choose whether to enable periodic [background checks](#partition-checks) by programming nonzero mask values to [`INTEGRITY_CHECK_PERIOD`](data/otp_ctrl.hjson#integrity_check_period) and [`CONSISTENCY_CHECK_PERIOD`](data/otp_ctrl.hjson#consistency_check_period).
-    - Choose whether such checks shall be subject to a timeout by programming a nonzero timeout cycle count to [`CHECK_TIMEOUT`](data/otp_ctrl.hjson#check_timeout).
-    - It is recommended to lock down the background check registers via [`CHECK_REGWEN`](data/otp_ctrl.hjson#check_regwen), once the background checks have been set up.
-
-If needed, one-off integrity and consistency checks can be triggered via [`CHECK_TRIGGER`](data/otp_ctrl.hjson#check_trigger).
-If this functionality is not needed, it is recommended to lock down the trigger register via [`CHECK_TRIGGER_REGWEN`](data/otp_ctrl.hjson#check_trigger_regwen).
-
-Later on during the boot process, SW may also choose to block read access to the CREATOR_SW_CFG or OWNER_SW_CFG partitions at runtime via [`CREATOR_SW_CFG_READ_LOCK`](data/otp_ctrl.hjson#creator_sw_cfg_read_lock) and [`OWNER_SW_CFG_READ_LOCK`](data/otp_ctrl.hjson#owner_sw_cfg_read_lock).
-
-
-### Reset Considerations
-
-It is important to note that values in OTP **can be corrupted** if a reset occurs during a programming operation.
-This should be of minor concern for SW, however, since all partitions except for the LIFE_CYCLE partition are being provisioned in secure and controlled environments, and not in the field.
-The LIFE_CYCLE partition is the only partition that is modified in the field - but that partition is entirely owned by the life cycle controller and not by SW.
-
-### Programming Already Programmed Regions
-
-OTP words cannot be programmed twice, and doing so may damage the memory array.
-Hence the OTP controller performs a blank check and returns an error if a write operation is issued to an already programmed location.
-
-### Potential Side-Effects on Flash via Life Cycle
-
-It should be noted that the locked status of the partition holding the creator root key (i.e., the value of the [`SECRET2_DIGEST_0`](data/otp_ctrl.hjson#secret2_digest_0)) determines the ID_STATUS of the device, which in turn determines SW accessibility of creator seed material in flash and OTP.
-That means that creator-seed-related collateral needs to be provisioned to Flash **before** the OTP digest lockdown mechanism is triggered, since otherwise accessibility to the corresponding flash region is lost.
-See the [life cycle controller documentation](../lc_ctrl/README.md#id-state-of-the-device) for more details.
-
-## Direct Access Interface
-
-OTP has to be programmed via the Direct Access Interface, which is comprised of the following CSRs:
-
-CSR Name                             | Description
--------------------------------------|------------------------------------
-[`DIRECT_ACCESS_WDATA_0`](data/otp_ctrl.hjson#direct_access_wdata_0) | Low 32bit word to be written.
-[`DIRECT_ACCESS_WDATA_1`](data/otp_ctrl.hjson#direct_access_wdata_1) | High 32bit word to be written.
-[`DIRECT_ACCESS_RDATA_0`](data/otp_ctrl.hjson#direct_access_rdata_0) | Low 32bit word that has been read.
-[`DIRECT_ACCESS_RDATA_1`](data/otp_ctrl.hjson#direct_access_rdata_1) | High 32bit word that has been read.
-[`DIRECT_ACCESS_ADDRESS`](data/otp_ctrl.hjson#direct_access_address) | byte address for the access.
-[`DIRECT_ACCESS_CMD`](data/otp_ctrl.hjson#direct_access_cmd)     | Command register to trigger a read or a write access.
-[`DIRECT_ACCESS_REGWEN`](data/otp_ctrl.hjson#direct_access_regwen)  | Write protection register for DAI.
-
-See further below for a detailed [Memory Map](#direct-access-memory-map) of the address space accessible via the DAI.
-
-### Readout Sequence
-
-A typical readout sequence looks as follows:
-
-1. Check whether the DAI is idle by reading the [`STATUS`](data/otp_ctrl.hjson#status) register.
-2. Write the byte address for the access to [`DIRECT_ACCESS_ADDRESS`](data/otp_ctrl.hjson#direct_access_address).
-Note that the address is aligned with the granule, meaning that either 2 or 3 LSBs of the address are ignored, depending on whether the access granule is 32 or 64bit.
-3. Trigger a read command by writing 0x1 to [`DIRECT_ACCESS_CMD`](data/otp_ctrl.hjson#direct_access_cmd).
-4. Poll the [`STATUS`](data/otp_ctrl.hjson#status) until the DAI state goes back to idle.
-Alternatively, the `otp_operation_done` interrupt can be enabled up to notify the processor once an access has completed.
-5. If the status register flags a DAI error, additional handling is required (see [Section on Error handling](#error-handling)).
-6. If the region accessed has a 32bit access granule, the 32bit chunk of read data can be read from [`DIRECT_ACCESS_RDATA_0`](data/otp_ctrl.hjson#direct_access_rdata_0).
-If the region accessed has a 64bit access granule, the 64bit chunk of read data can be read from the [`DIRECT_ACCESS_RDATA_0`](data/otp_ctrl.hjson#direct_access_rdata_0) and [`DIRECT_ACCESS_RDATA_1`](data/otp_ctrl.hjson#direct_access_rdata_1) registers.
-7. Go back to 1. and repeat until all data has been read.
-
-The hardware will set [`DIRECT_ACCESS_REGWEN`](data/otp_ctrl.hjson#direct_access_regwen) to 0x0 while an operation is pending in order to temporarily lock write access to the CSRs registers.
-
-### Programming Sequence
-
-A typical programming sequence looks as follows:
-
-1. Check whether the DAI is idle by reading the [`STATUS`](data/otp_ctrl.hjson#status) register.
-2. If the region to be accessed has a 32bit access granule, place a 32bit chunk of data into [`DIRECT_ACCESS_WDATA_0`](data/otp_ctrl.hjson#direct_access_wdata_0).
-If the region to be accessed has a 64bit access granule, both the [`DIRECT_ACCESS_WDATA_0`](data/otp_ctrl.hjson#direct_access_wdata_0) and [`DIRECT_ACCESS_WDATA_1`](data/otp_ctrl.hjson#direct_access_wdata_1) registers have to be used.
-3. Write the byte address for the access to [`DIRECT_ACCESS_ADDRESS`](data/otp_ctrl.hjson#direct_access_address).
-Note that the address is aligned with the granule, meaning that either 2 or 3 LSBs of the address are ignored, depending on whether the access granule is 32 or 64bit.
-4. Trigger a write command by writing 0x2 to [`DIRECT_ACCESS_CMD`](data/otp_ctrl.hjson#direct_access_cmd).
-5. Poll the [`STATUS`](data/otp_ctrl.hjson#status) until the DAI state goes back to idle.
-Alternatively, the `otp_operation_done` interrupt can be enabled up to notify the processor once an access has completed.
-6. If the status register flags a DAI error, additional handling is required (see [Section on Error handling](#error-handling)).
-7. Go back to 1. and repeat until all data has been written.
-
-The hardware will set [`DIRECT_ACCESS_REGWEN`](data/otp_ctrl.hjson#direct_access_regwen) to 0x0 while an operation is pending in order to temporarily lock write access to the CSRs registers.
-
-Note that SW is responsible for keeping track of already programmed OTP word locations during the provisioning phase.
-**It is imperative that SW does not write the same word location twice**, since this can lead to ECC inconsistencies, thereby potentially rendering the device useless.
-
-### Digest Calculation Sequence
-
-The hardware digest computation for the hardware and secret partitions can be triggered as follows:
-
-1. Check whether the DAI is idle by reading the [`STATUS`](data/otp_ctrl.hjson#status) register.
-3. Write the partition base address to [`DIRECT_ACCESS_ADDRESS`](data/otp_ctrl.hjson#direct_access_address).
-4. Trigger a digest calculation command by writing 0x4 to [`DIRECT_ACCESS_CMD`](data/otp_ctrl.hjson#direct_access_cmd).
-5. Poll the [`STATUS`](data/otp_ctrl.hjson#status) until the DAI state goes back to idle.
-Alternatively, the `otp_operation_done` interrupt can be enabled up to notify the processor once an access has completed.
-6. If the status register flags a DAI error, additional handling is required (see [Section on Error handling](#error-handling)).
-
-The hardware will set [`DIRECT_ACCESS_REGWEN`](data/otp_ctrl.hjson#direct_access_regwen) to 0x0 while an operation is pending in order to temporarily lock write access to the CSRs registers.
-
-It should also be noted that the effect of locking a partition via the digest only takes effect **after** the next system reset.
-To prevent integrity check failures SW must therefore ensure that no more programming operations are issued to the affected partition after initiating the digest calculation sequence.
-
-### Software Integrity Handling
-
-As opposed to buffered partitions, the digest and integrity handling of unbuffered partitions is entirely up to software.
-The only hardware-assisted feature in unbuffered partitions is the digest lock, which locks write access to an unbuffered partition once a nonzero value has been programmed to the 64bit digest location.
-
-In a similar vein, it should be noted that the system-wide bus-integrity metadata does not travel alongside the data end-to-end in the OTP controller (i.e., the  bus-integrity metadata bits are not stored into the OTP memory array).
-This means that data written to and read from the OTP macro is not protected by the bus integrity feature at all stages.
-In case of buffered partitions this does not pose a concern since data integrity in these partitions is checked via the hardware assisted digest mechanism.
-In case of unbuffered partitions however, the data integrity checking is entirely up to software.
-I.e., if data is read from an unbuffered partition (either through the DAI or CSR windows), software should perform an integrity check on that data.
-
-## Error Handling
-
-The agents that can access the OTP macro (DAI, LCI, buffered/unbuffered partitions) expose detailed error codes that can be used to root cause any failure.
-The error codes are defined in the table below, and the corresponding `otp_err_e` enum type can be found in the `otp_ctrl_pkg`.
-The table also lists which error codes are supported by which agent.
-
-Errors that are not "recoverable" are severe errors that move the corresponding partition or DAI/LCI FSM into a terminal error state, where no more commands can be accepted (a system reset is required to restore functionality in that case).
-Errors that are "recoverable" are less severe and do not cause the FSM to jump into a terminal error state.
-
-Note that error codes that originate in the physical OTP macro are prefixed with `Macro*`.
-
-Error Code | Enum Name              | Recoverable | DAI | LCI | Unbuf | Buf   | Description
------------|------------------------|-------------|-----|-----|-------|-------|-------------
-0x0        | `NoError`              | -           |  x  |  x  |   x   |  x    | No error has occurred.
-0x1        | `MacroError`           | no          |  x  |  x  |   x   |  x    | Returned if the OTP macro command did not complete successfully due to a macro malfunction.
-0x2        | `MacroEccCorrError`    | yes         |  x  |  -  |   x   |  x    | A correctable ECC error has occurred during a read operation in the OTP macro.
-0x3        | `MacroEccUncorrError`  | no          |  x  |  -  |   x*  |  x    | An uncorrectable ECC error has occurred during a read operation in the OTP macro. Note (*): This error is collapsed into `MacroEccCorrError` if the partition is a vendor test partition. It then becomes a recoverable error.
-0x4        | `MacroWriteBlankError` | yes / no*   |  x  |  x  |   -   |  -    | This error is returned if a write operation attempted to clear an already programmed bit location. Note (*): This error is recoverable if encountered in the DAI, but unrecoverable if encountered in the LCI.
-0x5        | `AccessError`          | yes         |  x  |  -  |   x   |  -    | An access error has occurred (e.g. write to write-locked region, or read to a read-locked region).
-0x6        | `CheckFailError`       | no          |  -  |  -  |   x   |  x    | An unrecoverable ECC, integrity or consistency error has been detected.
-0x7        | `FsmStateError`        | no          |  x  |  x  |   x   |  x    | The FSM has been glitched into an invalid state, or escalation has been triggered and the FSM has been moved into a terminal error state.
-
-All non-zero error codes listed above trigger an `otp_error` interrupt.
-In addition, all unrecoverable OTP `Macro*` errors (codes 0x1, 0x3) trigger a `fatal_macro_error` alert, while all remaining unrecoverable errors trigger a `fatal_check_error` alert.
-
-If software receives an `otp_error` interrupt, but all error codes read back as 0x0 (`NoError`), this should be treated as a fatal error condition, and the system should be shut down as soon as possible.
-
-Note that while the `MacroWriteBlankError` is marked as a recoverable error, the affected OTP word may be in an inconsistent state after this error has been returned.
-This can cause several issues when the word is accessed again (either as part of a regular read operation, as part of the readout at boot, or as part of a background check).
-It is important that SW ensures that each word is only written once, since this can render the device useless.
-
-## Direct Access Memory Map
-
-The table below provides a detailed overview of the items stored in the OTP partitions.
-Some of the items that are buffered in registers is readable via memory mapped CSRs, and these CSRs are linked in the table below.
-Items that are not linked can only be accessed via the direct programming interface (if the partition is not locked via the corresponding digest).
-It should be noted that CREATOR_SW_CFG and OWNER_SW_CFG are accessible through a memory mapped window, and content of these partitions is not buffered.
-Hence, a read access to those windows will take in the order of 10-20 cycles until the read returns.
-
-Sizes below are specified in multiples of 32bit words.
-
-{{#include doc/otp_ctrl_mmap.md}}
-
-Note that since the content in the SECRET* partitions are scrambled using a 64bit PRESENT cipher, read and write access through the DAI needs to occur at a 64bit granularity.
-Also, all digests (no matter whether they are SW or HW digests) have an access granule of 64bit.
-
-The table below lists digests locations, and the corresponding locked partitions.
-
-{{#include doc/otp_ctrl_digests.md}}
-
-Write access to the affected partition will be locked if the digest has a nonzero value.
-
-For the software partition digests, it is entirely up to software to decide on the digest algorithm to be used.
-Hardware will determine the lock condition only based on whether a non-zero value is present at that location or not.
-
-For the hardware partitions, hardware calculates this digest and uses it for [background verification](#partition-checks).
-Digest calculation can be triggered via the DAI.
-
-Finally, it should be noted that the RMA_TOKEN and CREATOR_ROOT_KEY_SHARE0 / CREATOR_ROOT_KEY_SHARE1 items can only be programmed when the device is in the DEV, PROD, PROD_END and RMA stages.
-Please consult the [life cycle controller documentation](../lc_ctrl/README.md) documentation for more information.
-
-## Examples
-
-### Provisioning Items
-
-The following represents a typical provisioning sequence for items in all partitions (except for the LIFE_CYCLE partition, which is not software-programmable):
-
-1. [Program](#programming-sequence) the item in 32bit or 64bit chunks via the DAI.
-2. [Read back](#readout-sequence) and verify the item via the DAI.
-3. If the item is exposed via CSRs or a CSR window, perform a full-system reset and verify whether those fields are correctly populated.
-
-Note that any unrecoverable errors during the programming steps, or mismatches during the readback and verification steps indicate that the device might be malfunctioning (possibly due to fabrication defects) and hence the device may have to be scrapped.
-This is however rare and should not happen after fabrication testing.
-
-### Locking Partitions
-
-Once a partition has been fully populated, write access to that partition has to be permanently locked.
-For the HW_CFG and SECRET* partitions, this can be achieved as follows:
-
-1. [Trigger](#digest-calculation-sequence) a digest calculation via the DAI.
-2. [Read back](#readout-sequence) and verify the digest location via the DAI.
-3. Perform a full-system reset and verify that the corresponding CSRs exposing the 64bit digest have been populated ([`HW_CFG_DIGEST_0`](data/otp_ctrl.hjson#hw_cfg_digest_0), [`SECRET0_DIGEST_0`](data/otp_ctrl.hjson#secret0_digest_0), [`SECRET1_DIGEST_0`](data/otp_ctrl.hjson#secret1_digest_0) or [`SECRET2_DIGEST_0`](data/otp_ctrl.hjson#secret2_digest_0)).
-
-It should be noted that locking only takes effect after a system reset since the affected partitions first have to re-sense the digest values.
-Hence, it is critical that SW ensures that no more data is written to the partition to be locked after triggering the hardware digest calculation.
-Otherwise, the device will likely be rendered inoperable as this can lead to permanent digest mismatch errors after system reboot.
-
-For the [`CREATOR_SW_CFG`](data/otp_ctrl.hjson#creator_sw_cfg) and [`OWNER_SW_CFG`](data/otp_ctrl.hjson#owner_sw_cfg) partitions, the process is similar, but computation and programming of the digest is entirely up to software:
-
-1. Compute a 64bit digest over the relevant parts of the partition, and [program](#programming-sequence) that value to [`CREATOR_SW_CFG_DIGEST_0`](data/otp_ctrl.hjson#creator_sw_cfg_digest_0) or [`OWNER_SW_CFG_DIGEST_0`](data/otp_ctrl.hjson#owner_sw_cfg_digest_0) via the DAI. Note that digest accesses through the DAI have an access granule of 64bit.
-2. [Read back](#readout-sequence) and verify the digest location via the DAI.
-3. Perform a full-system reset and verify that the corresponding digest CSRs [`CREATOR_SW_CFG_DIGEST_0`](data/otp_ctrl.hjson#creator_sw_cfg_digest_0) or [`OWNER_SW_CFG_DIGEST_0`](data/otp_ctrl.hjson#owner_sw_cfg_digest_0) have been populated with the correct 64bit value.
-
-Note that any unrecoverable errors during the programming steps, or mismatches during the read-back and verification steps indicate that the device might be malfunctioning (possibly due to fabrication defects) and hence the device may have to be scrapped.
-This is however rare and should not happen after fabrication testing.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_otp_ctrl.h)
-
-## Register Table
-
-* [Register Tabel](data/otp_ctrl.hjson#registers)
-
-# Additional Notes
-
-## OTP IP Assumptions
-
-It is assumed the OTP IP employed in production has reasonable physical defense characteristics.
-Specifically which defensive features will likely be use case dependent, but at a minimum they should have the properties below.
-Note some properties are worded with "SHALL" and others with "SHOULD".
-"SHALL" refers to features that must be present, while "SHOULD" refers to features that are ideal, but optional.
-
-- The contents shall not be observable via optical microscopy (for example anti-fuse technology).
-- The IP lifetime shall not be limited by the amount of read cycles performed.
-- If the IP contains field programmability (internal charge pumps and LDOs), there shall be mechanisms in place to selectively disable this function based on device context.
-- If the IP contains redundant columns, rows, pages or banks for yield improvement, it shall provide a mechanism to lock down arbitrary manipulation of page / bank swapping during run-time.
-- The IP shall be clear on what bits must be manipulated by the user, what bits are automatically manipulated by hardware (for example ECC or redundancy) and what areas the user can influence.
-- The IP shall be compatible, through the use of a proprietary wrapper or shim, with an open-source friendly IO interface.
-- The IP should functionally support the programming of already programmed bits without information leakage.
-- The IP should offer SCA resistance:
-  - For example, the content may be stored differentially.
-  - For example, the sensing exhibits similar power signatures no matter if the stored bit is 0 or 1.
-- The IP interface shall be memory-like if beyond a certain size.
-- When a particular location is read, a fixed width output is returned; similar when a particular location is programmed, a fixed width input is supplied.
-- The IP does not output all stored bits in parallel.
-- The contents should be electrically hidden. For example, it should be difficult for an attacker to energize the fuse array and observe how the charge leaks.
-- The IP should route critical nets at lower metal levels to avoid probing.
-- The IP should contain native detectors for fault injection attacks.
-- The IP should contain mechanisms to guard against interrupted programming - either through malicious intent or unexpected power loss and glitched address lines.
-- The IP should contain mechanisms for error corrections (single bit errors).
-  - For example ECC or redundant bits voting / or-ing.
-  - As error correction mechanisms are technology dependent, that information should not be exposed to the open-source controller, instead the controller should simply receive information on whether a read / program was successful.
-- The IP should have self-test functionality to assess the health of the storage and analog structures.
-- The IP may contain native PUF-like functionality.
diff --git a/hw/ip/otp_ctrl/doc/programmers_guide.md b/hw/ip/otp_ctrl/doc/programmers_guide.md
new file mode 100644
index 0000000000000..b14926728e0a2
--- /dev/null
+++ b/hw/ip/otp_ctrl/doc/programmers_guide.md
@@ -0,0 +1,259 @@
+# Programmer's Guide
+
+During provisioning and manufacturing, SW interacts with the OTP controller mostly through the Direct Access Interface (DAI), which is described below.
+Afterwards during production, SW is expected to perform only read accesses via the exposed CSRs and CSR windows, since all write access to the partitions has been locked down.
+
+The following sections provide some general guidance, followed by an explanation of the DAI and a detailed OTP memory map.
+Typical programming sequences are explained at the end of the Programmer's Guide.
+
+## General Guidance
+
+### Initialization
+
+The OTP controller initializes automatically upon power-up and is fully operational by the time the processor boots.
+The only initialization steps that SW should perform are:
+
+1. Check that the OTP controller has successfully initialized by reading [`STATUS`](../data/otp_ctrl.hjson#status). I.e., make sure that none of the ERROR bits are set, and that the DAI is idle ([`STATUS.DAI_IDLE`](../data/otp_ctrl.hjson#status)).
+2. Set up the periodic background checks:
+    - Choose whether to enable periodic [background checks](theory_of_operation.md#partition-checks) by programming nonzero mask values to [`INTEGRITY_CHECK_PERIOD`](../data/otp_ctrl.hjson#integrity_check_period) and [`CONSISTENCY_CHECK_PERIOD`](../data/otp_ctrl.hjson#consistency_check_period).
+    - Choose whether such checks shall be subject to a timeout by programming a nonzero timeout cycle count to [`CHECK_TIMEOUT`](../data/otp_ctrl.hjson#check_timeout).
+    - It is recommended to lock down the background check registers via [`CHECK_REGWEN`](../data/otp_ctrl.hjson#check_regwen), once the background checks have been set up.
+
+If needed, one-off integrity and consistency checks can be triggered via [`CHECK_TRIGGER`](../data/otp_ctrl.hjson#check_trigger).
+If this functionality is not needed, it is recommended to lock down the trigger register via [`CHECK_TRIGGER_REGWEN`](../data/otp_ctrl.hjson#check_trigger_regwen).
+
+Later on during the boot process, SW may also choose to block read access to the CREATOR_SW_CFG or OWNER_SW_CFG partitions at runtime via [`CREATOR_SW_CFG_READ_LOCK`](../data/otp_ctrl.hjson#creator_sw_cfg_read_lock) and [`OWNER_SW_CFG_READ_LOCK`](../data/otp_ctrl.hjson#owner_sw_cfg_read_lock).
+
+
+### Reset Considerations
+
+It is important to note that values in OTP **can be corrupted** if a reset occurs during a programming operation.
+This should be of minor concern for SW, however, since all partitions except for the LIFE_CYCLE partition are being provisioned in secure and controlled environments, and not in the field.
+The LIFE_CYCLE partition is the only partition that is modified in the field - but that partition is entirely owned by the life cycle controller and not by SW.
+
+### Programming Already Programmed Regions
+
+OTP words cannot be programmed twice, and doing so may damage the memory array.
+Hence the OTP controller performs a blank check and returns an error if a write operation is issued to an already programmed location.
+
+### Potential Side-Effects on Flash via Life Cycle
+
+It should be noted that the locked status of the partition holding the creator root key (i.e., the value of the [`SECRET2_DIGEST_0`](../data/otp_ctrl.hjson#secret2_digest_0)) determines the ID_STATUS of the device, which in turn determines SW accessibility of creator seed material in flash and OTP.
+That means that creator-seed-related collateral needs to be provisioned to Flash **before** the OTP digest lockdown mechanism is triggered, since otherwise accessibility to the corresponding flash region is lost.
+See the [life cycle controller documentation](../../lc_ctrl/README.md#id-state-of-the-device) for more details.
+
+## Direct Access Interface
+
+OTP has to be programmed via the Direct Access Interface, which is comprised of the following CSRs:
+
+CSR Name                             | Description
+-------------------------------------|------------------------------------
+[`DIRECT_ACCESS_WDATA_0`](../data/otp_ctrl.hjson#direct_access_wdata_0) | Low 32bit word to be written.
+[`DIRECT_ACCESS_WDATA_1`](../data/otp_ctrl.hjson#direct_access_wdata_1) | High 32bit word to be written.
+[`DIRECT_ACCESS_RDATA_0`](../data/otp_ctrl.hjson#direct_access_rdata_0) | Low 32bit word that has been read.
+[`DIRECT_ACCESS_RDATA_1`](../data/otp_ctrl.hjson#direct_access_rdata_1) | High 32bit word that has been read.
+[`DIRECT_ACCESS_ADDRESS`](../data/otp_ctrl.hjson#direct_access_address) | Byte address for the access.
+[`DIRECT_ACCESS_CMD`](../data/otp_ctrl.hjson#direct_access_cmd)     | Command register to trigger a read or a write access.
+[`DIRECT_ACCESS_REGWEN`](../data/otp_ctrl.hjson#direct_access_regwen)  | Write protection register for DAI.
+
+See further below for a detailed [Memory Map](#direct-access-memory-map) of the address space accessible via the DAI.
+
+### Readout Sequence
+
+A typical readout sequence looks as follows:
+
+1. Check whether the DAI is idle by reading the [`STATUS`](../data/otp_ctrl.hjson#status) register.
+2. Write the byte address for the access to [`DIRECT_ACCESS_ADDRESS`](../data/otp_ctrl.hjson#direct_access_address).
+Note that the address is aligned with the granule, meaning that either 2 or 3 LSBs of the address are ignored, depending on whether the access granule is 32 or 64bit.
+3. Trigger a read command by writing 0x1 to [`DIRECT_ACCESS_CMD`](../data/otp_ctrl.hjson#direct_access_cmd).
+4. Poll the [`STATUS`](../data/otp_ctrl.hjson#status) until the DAI state goes back to idle.
+Alternatively, the `otp_operation_done` interrupt can be enabled to notify the processor once an access has completed.
+5. If the status register flags a DAI error, additional handling is required (see [Section on Error handling](#error-handling)).
+6. If the region accessed has a 32bit access granule, the 32bit chunk of read data can be read from [`DIRECT_ACCESS_RDATA_0`](../data/otp_ctrl.hjson#direct_access_rdata_0).
+If the region accessed has a 64bit access granule, the 64bit chunk of read data can be read from the [`DIRECT_ACCESS_RDATA_0`](../data/otp_ctrl.hjson#direct_access_rdata_0) and [`DIRECT_ACCESS_RDATA_1`](../data/otp_ctrl.hjson#direct_access_rdata_1) registers.
+7. Go back to 1. and repeat until all data has been read.
+
+The hardware will set [`DIRECT_ACCESS_REGWEN`](../data/otp_ctrl.hjson#direct_access_regwen) to 0x0 while an operation is pending in order to temporarily lock write access to the DAI CSRs.
+
+### Programming Sequence
+
+A typical programming sequence looks as follows:
+
+1. Check whether the DAI is idle by reading the [`STATUS`](../data/otp_ctrl.hjson#status) register.
+2. If the region to be accessed has a 32bit access granule, place a 32bit chunk of data into [`DIRECT_ACCESS_WDATA_0`](../data/otp_ctrl.hjson#direct_access_wdata_0).
+If the region to be accessed has a 64bit access granule, both the [`DIRECT_ACCESS_WDATA_0`](../data/otp_ctrl.hjson#direct_access_wdata_0) and [`DIRECT_ACCESS_WDATA_1`](../data/otp_ctrl.hjson#direct_access_wdata_1) registers have to be used.
+3. Write the byte address for the access to [`DIRECT_ACCESS_ADDRESS`](../data/otp_ctrl.hjson#direct_access_address).
+Note that the address is aligned with the granule, meaning that either 2 or 3 LSBs of the address are ignored, depending on whether the access granule is 32 or 64bit.
+4. Trigger a write command by writing 0x2 to [`DIRECT_ACCESS_CMD`](../data/otp_ctrl.hjson#direct_access_cmd).
+5. Poll the [`STATUS`](../data/otp_ctrl.hjson#status) until the DAI state goes back to idle.
+Alternatively, the `otp_operation_done` interrupt can be enabled to notify the processor once an access has completed.
+6. If the status register flags a DAI error, additional handling is required (see [Section on Error handling](#error-handling)).
+7. Go back to 1. and repeat until all data has been written.
+
+The hardware will set [`DIRECT_ACCESS_REGWEN`](../data/otp_ctrl.hjson#direct_access_regwen) to 0x0 while an operation is pending in order to temporarily lock write access to the DAI CSRs.
+
+Note that SW is responsible for keeping track of already programmed OTP word locations during the provisioning phase.
+**It is imperative that SW does not write the same word location twice**, since this can lead to ECC inconsistencies, thereby potentially rendering the device useless.
+
+### Digest Calculation Sequence
+
+The hardware digest computation for the hardware and secret partitions can be triggered as follows:
+
+1. Check whether the DAI is idle by reading the [`STATUS`](../data/otp_ctrl.hjson#status) register.
+2. Write the partition base address to [`DIRECT_ACCESS_ADDRESS`](../data/otp_ctrl.hjson#direct_access_address).
+3. Trigger a digest calculation command by writing 0x4 to [`DIRECT_ACCESS_CMD`](../data/otp_ctrl.hjson#direct_access_cmd).
+4. Poll the [`STATUS`](../data/otp_ctrl.hjson#status) until the DAI state goes back to idle.
+Alternatively, the `otp_operation_done` interrupt can be enabled to notify the processor once an access has completed.
+5. If the status register flags a DAI error, additional handling is required (see [Section on Error handling](#error-handling)).
+
+The hardware will set [`DIRECT_ACCESS_REGWEN`](../data/otp_ctrl.hjson#direct_access_regwen) to 0x0 while an operation is pending in order to temporarily lock write access to the DAI CSRs.
+
+It should also be noted that locking a partition via the digest only takes effect **after** the next system reset.
+To prevent integrity check failures SW must therefore ensure that no more programming operations are issued to the affected partition after initiating the digest calculation sequence.
+
+### Software Integrity Handling
+
+As opposed to buffered partitions, the digest and integrity handling of unbuffered partitions is entirely up to software.
+The only hardware-assisted feature in unbuffered partitions is the digest lock, which locks write access to an unbuffered partition once a nonzero value has been programmed to the 64bit digest location.
+
+In a similar vein, it should be noted that the system-wide bus-integrity metadata does not travel alongside the data end-to-end in the OTP controller (i.e., the bus-integrity metadata bits are not stored into the OTP memory array).
+This means that data written to and read from the OTP macro is not protected by the bus integrity feature at all stages.
+In case of buffered partitions this does not pose a concern since data integrity in these partitions is checked via the hardware assisted digest mechanism.
+In case of unbuffered partitions however, the data integrity checking is entirely up to software.
+I.e., if data is read from an unbuffered partition (either through the DAI or CSR windows), software should perform an integrity check on that data.
+
+## Error Handling
+
+The agents that can access the OTP macro (DAI, LCI, buffered/unbuffered partitions) expose detailed error codes that can be used to root cause any failure.
+The error codes are defined in the table below, and the corresponding `otp_err_e` enum type can be found in the `otp_ctrl_pkg`.
+The table also lists which error codes are supported by which agent.
+
+Errors that are not "recoverable" are severe errors that move the corresponding partition or DAI/LCI FSM into a terminal error state, where no more commands can be accepted (a system reset is required to restore functionality in that case).
+Errors that are "recoverable" are less severe and do not cause the FSM to jump into a terminal error state.
+
+Note that error codes that originate in the physical OTP macro are prefixed with `Macro*`.
+
+Error Code | Enum Name              | Recoverable | DAI | LCI | Unbuf | Buf   | Description
+-----------|------------------------|-------------|-----|-----|-------|-------|-------------
+0x0        | `NoError`              | -           |  x  |  x  |   x   |  x    | No error has occurred.
+0x1        | `MacroError`           | no          |  x  |  x  |   x   |  x    | Returned if the OTP macro command did not complete successfully due to a macro malfunction.
+0x2        | `MacroEccCorrError`    | yes         |  x  |  -  |   x   |  x    | A correctable ECC error has occurred during a read operation in the OTP macro.
+0x3        | `MacroEccUncorrError`  | no          |  x  |  -  |   x*  |  x    | An uncorrectable ECC error has occurred during a read operation in the OTP macro. Note (*): This error is collapsed into `MacroEccCorrError` if the partition is a vendor test partition. It then becomes a recoverable error.
+0x4        | `MacroWriteBlankError` | yes / no*   |  x  |  x  |   -   |  -    | This error is returned if a write operation attempted to clear an already programmed bit location. Note (*): This error is recoverable if encountered in the DAI, but unrecoverable if encountered in the LCI.
+0x5        | `AccessError`          | yes         |  x  |  -  |   x   |  -    | An access error has occurred (e.g. write to write-locked region, or read to a read-locked region).
+0x6        | `CheckFailError`       | no          |  -  |  -  |   x   |  x    | An unrecoverable ECC, integrity or consistency error has been detected.
+0x7        | `FsmStateError`        | no          |  x  |  x  |   x   |  x    | The FSM has been glitched into an invalid state, or escalation has been triggered and the FSM has been moved into a terminal error state.
+
+All non-zero error codes listed above trigger an `otp_error` interrupt.
+In addition, all unrecoverable OTP `Macro*` errors (codes 0x1, 0x3) trigger a `fatal_macro_error` alert, while all remaining unrecoverable errors trigger a `fatal_check_error` alert.
+
+If software receives an `otp_error` interrupt, but all error codes read back as 0x0 (`NoError`), this should be treated as a fatal error condition, and the system should be shut down as soon as possible.
+
+Note that while the `MacroWriteBlankError` is marked as a recoverable error, the affected OTP word may be in an inconsistent state after this error has been returned.
+This can cause several issues when the word is accessed again (either as part of a regular read operation, as part of the readout at boot, or as part of a background check).
+It is important that SW ensures that each word is only written once, since writing a word multiple times can render the device useless.
+
+## Direct Access Memory Map
+
+The table below provides a detailed overview of the items stored in the OTP partitions.
+Some of the items that are buffered in registers are readable via memory mapped CSRs, and these CSRs are linked in the table below.
+Items that are not linked can only be accessed via the direct programming interface (if the partition is not locked via the corresponding digest).
+It should be noted that CREATOR_SW_CFG and OWNER_SW_CFG are accessible through a memory mapped window, and content of these partitions is not buffered.
+Hence, a read access to those windows will take on the order of 10-20 cycles until the read returns.
+
+Sizes below are specified in multiples of 32bit words.
+
+{{#include doc/otp_ctrl_mmap.md}}
+
+Note that since the content in the SECRET* partitions is scrambled using a 64bit PRESENT cipher, read and write access through the DAI needs to occur at a 64bit granularity.
+Also, all digests (no matter whether they are SW or HW digests) have an access granule of 64bit.
+
+The table below lists digest locations, and the corresponding locked partitions.
+
+{{#include doc/otp_ctrl_digests.md}}
+
+Write access to the affected partition will be locked if the digest has a nonzero value.
+
+For the software partition digests, it is entirely up to software to decide on the digest algorithm to be used.
+Hardware will determine the lock condition only based on whether a non-zero value is present at that location or not.
+
+For the hardware partitions, hardware calculates this digest and uses it for [background verification](#partition-checks).
+Digest calculation can be triggered via the DAI.
+
+Finally, it should be noted that the RMA_TOKEN and CREATOR_ROOT_KEY_SHARE0 / CREATOR_ROOT_KEY_SHARE1 items can only be programmed when the device is in the DEV, PROD, PROD_END and RMA stages.
+Please consult the [life cycle controller documentation](../../lc_ctrl/README.md) for more information.
+
+## Examples
+
+### Provisioning Items
+
+The following represents a typical provisioning sequence for items in all partitions (except for the LIFE_CYCLE partition, which is not software-programmable):
+
+1. [Program](#programming-sequence) the item in 32bit or 64bit chunks via the DAI.
+2. [Read back](#readout-sequence) and verify the item via the DAI.
+3. If the item is exposed via CSRs or a CSR window, perform a full-system reset and verify whether those fields are correctly populated.
+
+Note that any unrecoverable errors during the programming steps, or mismatches during the readback and verification steps indicate that the device might be malfunctioning (possibly due to fabrication defects) and hence the device may have to be scrapped.
+This is however rare and should not happen after fabrication testing.
+
+### Locking Partitions
+
+Once a partition has been fully populated, write access to that partition has to be permanently locked.
+For the HW_CFG and SECRET* partitions, this can be achieved as follows:
+
+1. [Trigger](#digest-calculation-sequence) a digest calculation via the DAI.
+2. [Read back](#readout-sequence) and verify the digest location via the DAI.
+3. Perform a full-system reset and verify that the corresponding CSRs exposing the 64bit digest have been populated ([`HW_CFG_DIGEST_0`](../data/otp_ctrl.hjson#hw_cfg_digest_0), [`SECRET0_DIGEST_0`](../data/otp_ctrl.hjson#secret0_digest_0), [`SECRET1_DIGEST_0`](../data/otp_ctrl.hjson#secret1_digest_0) or [`SECRET2_DIGEST_0`](../data/otp_ctrl.hjson#secret2_digest_0)).
+
+It should be noted that locking only takes effect after a system reset since the affected partitions first have to re-sense the digest values.
+Hence, it is critical that SW ensures that no more data is written to the partition to be locked after triggering the hardware digest calculation.
+Otherwise, the device will likely be rendered inoperable as this can lead to permanent digest mismatch errors after system reboot.
+
+For the [`CREATOR_SW_CFG`](../data/otp_ctrl.hjson#creator_sw_cfg) and [`OWNER_SW_CFG`](../data/otp_ctrl.hjson#owner_sw_cfg) partitions, the process is similar, but computation and programming of the digest is entirely up to software:
+
+1. Compute a 64bit digest over the relevant parts of the partition, and [program](#programming-sequence) that value to [`CREATOR_SW_CFG_DIGEST_0`](../data/otp_ctrl.hjson#creator_sw_cfg_digest_0) or [`OWNER_SW_CFG_DIGEST_0`](../data/otp_ctrl.hjson#owner_sw_cfg_digest_0) via the DAI. Note that digest accesses through the DAI have an access granule of 64bit.
+2. [Read back](#readout-sequence) and verify the digest location via the DAI.
+3. Perform a full-system reset and verify that the corresponding digest CSRs [`CREATOR_SW_CFG_DIGEST_0`](../data/otp_ctrl.hjson#creator_sw_cfg_digest_0) or [`OWNER_SW_CFG_DIGEST_0`](../data/otp_ctrl.hjson#owner_sw_cfg_digest_0) have been populated with the correct 64bit value.
+
+Note that any unrecoverable errors during the programming steps, or mismatches during the read-back and verification steps indicate that the device might be malfunctioning (possibly due to fabrication defects) and hence the device may have to be scrapped.
+This is however rare and should not happen after fabrication testing.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_otp_ctrl.h)
+
+## Register Table
+
+* [Register Table](../data/otp_ctrl.hjson#registers)
+
+# Additional Notes
+
+## OTP IP Assumptions
+
+It is assumed the OTP IP employed in production has reasonable physical defense characteristics.
+Which specific defensive features are required will likely be use case dependent, but at a minimum they should have the properties below.
+Note some properties are worded with "SHALL" and others with "SHOULD".
+"SHALL" refers to features that must be present, while "SHOULD" refers to features that are ideal, but optional.
+
+- The contents shall not be observable via optical microscopy (for example anti-fuse technology).
+- The IP lifetime shall not be limited by the amount of read cycles performed.
+- If the IP contains field programmability (internal charge pumps and LDOs), there shall be mechanisms in place to selectively disable this function based on device context.
+- If the IP contains redundant columns, rows, pages or banks for yield improvement, it shall provide a mechanism to lock down arbitrary manipulation of page / bank swapping during run-time.
+- The IP shall be clear on what bits must be manipulated by the user, what bits are automatically manipulated by hardware (for example ECC or redundancy) and what areas the user can influence.
+- The IP shall be compatible, through the use of a proprietary wrapper or shim, with an open-source friendly IO interface.
+- The IP should functionally support the programming of already programmed bits without information leakage.
+- The IP should offer SCA resistance:
+  - For example, the content may be stored differentially.
+  - For example, the sensing exhibits similar power signatures no matter if the stored bit is 0 or 1.
+- The IP interface shall be memory-like if beyond a certain size.
+- When a particular location is read, a fixed width output is returned; similarly, when a particular location is programmed, a fixed width input is supplied.
+- The IP does not output all stored bits in parallel.
+- The contents should be electrically hidden. For example, it should be difficult for an attacker to energize the fuse array and observe how the charge leaks.
+- The IP should route critical nets at lower metal levels to avoid probing.
+- The IP should contain native detectors for fault injection attacks.
+- The IP should contain mechanisms to guard against interrupted programming - either through malicious intent or unexpected power loss and glitched address lines.
+- The IP should contain mechanisms for error corrections (single bit errors).
+  - For example ECC or redundant bits voting / or-ing.
+  - As error correction mechanisms are technology dependent, that information should not be exposed to the open-source controller, instead the controller should simply receive information on whether a read / program was successful.
+- The IP should have self-test functionality to assess the health of the storage and analog structures.
+- The IP may contain native PUF-like functionality.
diff --git a/hw/ip/otp_ctrl/doc/theory_of_operation.md b/hw/ip/otp_ctrl/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..a50178f158869
--- /dev/null
+++ b/hw/ip/otp_ctrl/doc/theory_of_operation.md
@@ -0,0 +1,703 @@
+# Theory of Operation
+
+Conceptually speaking, the OTP functionality is at a high level split into "front-end" and "back-end".
+The "front-end" contains the logical partitions that feed the hardware and software consumer interfaces of the system.
+The "back-end" represents the programming interface used by hardware and software components to stage the upcoming values.
+The diagram below illustrates this behavioral model.
+
+![OTP Controller Block Diagram](../doc/otp_ctrl_behavioral_model.svg)
+
+Note that the front-end contains both buffered and unbuffered partitions.
+Buffered partitions are sensed once per power cycle and their contents are stored in registers, whereas unbuffered partitions are read on-demand.
+The former are typically partitions that contain data like hardware configuration bits, key material and the life cycle state that need to be always available to the hardware, whereas the latter are large partitions that are accessed infrequently, such as the software configurations.
+Values that are programmed into a buffered partition via the programming interface (coupled with read verification) are merely "staged", and do not take effect until the next power cycle.
+
+The sections below describe the operation of various pieces of the OTP controller and how it supports the described functionality.
+
+## Logical Partitions
+
+The OTP is logically separated into partitions that represent different functions.
+This means the isolation is virtual and maintained by the OTP controller instead of the underlying OTP IP.
+
+Within each logical partition, there are specific enforceable properties:
+
+- Confidentiality via secret partitions
+  - This controls whether a particular partition contains secret data.
+  - If secret, a partition is not readable by software once locked, and is scrambled in storage.
+- Read lockability
+  - This controls whether a particular partition disables software readability for later stage software.
+  - Some partitions can be locked statically (by computing and storing an associated digest in OTP), others can be read locked at runtime via CSRs.
+- Write lockability
+  - This controls whether a partition is locked and prevented from future updates.
+  - A locked partition is stored alongside a digest to be used later for integrity verification.
+- Integrity Verification
+  - Once a partition is write-locked by calculating and writing a non-zero [digest](#locking-a-partition) to it, it can undergo periodic verification (time-scale configurable by software).
+This verification takes two forms: partition integrity checks and storage consistency checks.
+
+Since the OTP is memory-like in nature (it only outputs a certain number of bits per address location), some of the logical partitions are buffered in registers for instantaneous and parallel access by hardware.
+This is a critical point, since after power-up, these particular OTP contents are stored in flip flops and sourced to the system.
+I.e., buffered partitions are **NOT** directly sourced from the OTP macro itself.
+Thus the security of both volatile (OTP controller) and non-volatile (OTP IP) storage becomes important.
+
+### Partition Listing and Description
+
+The OTP controller for OpenTitan contains the seven logical partitions shown below.
+
+{{#include doc/otp_ctrl_partitions.md}}
+
+Generally speaking, the production life cycle of a device is split into 5 stages "Manufacturing" -> "Calibration and Testing" -> "Provisioning" -> "Mission" -> "RMA".
+OTP values are usually programmed during "Calibration and Testing", "Provisioning" and "RMA" stages, as explained below.
+A detailed listing of all the items and the corresponding memory map can be found in the [Programmer's Guide](#programmers-guide) further below.
+
+### Calibration and Test
+
+During this stage, the device is tested for functionality and calibrated to ensure uniformity.
+The calibration can focus on a number of things, but usually is centered around adjusting clock, voltage and timing sources to remove process variation.
+These calibration values are programmed into the CREATOR_SW_CFG partition, as they are non-secret values meant to be read out by software and programmed into respective peripherals.
+
+Early on during this stage, the various tokens are also programmed into the secret partitions and harvested by the silicon creator.
+
+### Provisioning
+
+During this stage, the device is provisioned with the final firmware and a "unique" seed or identity.
+The secret partitions are populated with root secrets and keys that are critical to establishing the device identity.
+
+As part of injecting the final firmware, the stock-keeping-unit-specific hardware and software configurations are also programmed.
+
+### Life Cycle Partition
+
+The life cycle partition is active throughout all stages and hence it is the **ONLY** partition that cannot be locked.
+After the device finishes provisioning and goes into production, it must retain the ability to transition back to RMA in case of unexpected failures.
+
+In order to support this transition, the [life cycle state](../../lc_ctrl/README.md) and counters must always be update-able.
+
+## Locking a Partition
+
+Write access to a partition can be permanently locked when software determines it will no longer make any updates to that partition.
+To lock, an integrity constant is calculated and programmed alongside the other data of that partition.
+The size of that integrity constant depends on the partition size granule, and is either 32bit or 64bit (see also [Direct Access Memory Map](#direct-access-memory-map)).
+
+Once the "integrity digest" is non-zero, no further updates are allowed.
+If the partition is secret, software is in addition no longer able to read its contents (see [Secret Partition description](#secret-vs-nonsecret-partitions)).
+
+Note however, in all partitions, the digest itself is **ALWAYS** readable.
+This gives software an opportunity to confirm that the locking operation has proceeded correctly, and if not, scrap the part immediately.
+
+Calculation of the integrity digest depends on whether the partition requires periodic background verification.
+
+### Vendor Test Partition
+
+The vendor test partition is intended to be used for OTP programming smoke checks during the manufacturing flow.
+The silicon creator may implement these checks inside the proprietary version of the `prim_otp` wrapper.
+This partition behaves like any other SW partition, with the exception that ECC uncorrectable errors will not lead to fatal errors / alerts as they do in all other partitions.
+This is due to the nature of the OTP programming smoke checks, which may leave certain OTP words in a state inconsistent with the ECC polynomial employed upon OTP readout.
+
+### Software Configuration Partitions
+
+The software configuration partitions are used as non-volatile storage for flags, configuration and calibration data.
+As such, the contents of this partition are usually consumed once as part of code execution, or moved to another storage compartment somewhere in the design.
+For example, the clock calibration values and the LDO calibration values are programmed to the analog sensor top (AST) at startup.
+
+As such, it is not necessary to check periodically at the OTP source.
+Instead, software can simply check as part of secure boot and take other measures when these values are programmed into peripherals.
+
+For this partition it is thus the responsibility of software to calculate the integrity digest and program it into the OTP.
+It is also reasonable to shadow (parts of) this partition in main memory, and there is not an immediate impact from OTP contents to hardware.
+
+### Hardware Configuration and Secret Partitions
+
+The hardware and secret partitions directly affect downstream hardware.
+The contents must go through periodic integrity checks and therefore the stored digest is calculated by hardware when software provides the intent to lock (as opposed to the software partitions where the digest has to be calculated by software).
+
+### Life Cycle Partition
+
+The life cycle partition cannot be locked and will therefore not contain a stored digest.
+Note however that only the life cycle controller has access to this partition, i.e., the Direct Access Interface (DAI) cannot read nor write from/to the life cycle partition.
+
+## Secret vs Non-Secret Partitions
+
+Non-secret OTP partitions hold data that can be public; or data that has no impact on security.
+For example, the current value of lock bits or clock calibration values.
+These values are stored in OTP as plaintext.
+
+Secret partitions contain data that are critical to security, for example FLASH scrambling keys, device root secret and unlock tokens.
+These values are stored scrambled in OTP, and are descrambled upon read.
+The currently employed cipher is PRESENT, as it lends itself well to iterative decomposition, and it is a proven lightweight block cipher (see also [PRESENT Scrambling Primitive](../../prim/doc/prim_present.md)).
+The usage of a block cipher however implies that the secret partitions can only be written in 64bit chunks.
+
+Further, the contents of a particular secret partition are not readable by software once locked (other than the digest which must be always readable); while non-secret partitions are always readable unless read accessibility is explicitly removed by software.
+
+Unfortunately, secret partitions must utilize a global netlist key for the scrambling operation, as there is no other non-volatile storage to store a unique key.
+
+
+## Partition Checks
+
+### Integrity
+
+Once the appropriate partitions have been locked, the hardware integrity checker employs two integrity checks to verify the content of the volatile buffer registers:
+
+1. All buffered partitions have additional ECC protection (8bit ECC for each 64bit block) that is concurrently monitored.
+2. The digest of the partition is recomputed at semi-random intervals and compared to the digest stored alongside the partition.
+
+The purpose of this check is NOT to check between the storage flops and the OTP, but whether the buffer register contents remain consistent with the calculated digest.
+This verification is primarily concerned with whether the storage flops have experienced fault attacks.
+This check applies to only the HW_CFG and SECRET* partitions.
+If a failure is encountered, the OTP controller will send out a `fatal_check_error` alert and reset all of its hardware outputs to their defaults.
+
+### Storage Consistency
+
+This verification ensures the values stored in the buffer registers remain consistent with those in the OTP.
+This process re-reads the OTP at semi-random intervals and confirms the value read is the same as the value stored.
+Note, given there are integrity checks in parallel, it is not necessary for some partitions to check ALL read contents for consistency.
+If there is an integrity digest, only the digest needs to be read; otherwise, all values must be read.
+
+
+This check applies to LIFE_CYCLE, HW_CFG and SECRET* partitions.
+If a failure is encountered, the OTP controller will send out a `fatal_check_error` alert and reset all of its hardware outputs to their defaults.
+
+Note that checks applied to life cycle could cause a failure if life cycle is updated, because life cycle is the only partition that may contain live updates.
+The controller hence detects this condition based on the `lc_check_byp_en_i` signal coming from the life cycle controller, and pauses background checks on this partition in order to prevent false positives.
+
+### Secret Partition Integrity Checks
+
+Since the secret partitions are stored scrambled, this also implies the integrity digest is calculated over the scrambled form.
+In order to balance the amount of buffer registers needed, only the decrypted form of the secret partitions is held in buffer registers.
+Hardware calculates the digest by re-scrambling the data before passing it through the digest.
+
+
+## Power-up and Sense
+
+The OTP controller partition storage must output a specified safe default (it is not always 0 like a blank OTP) upon reset release.
+This default output must remain until the OTP controller completes all checks.
+
+The OTP controller reads from the OTP IP.
+If the reads pass OTP IP internal checks (for example ECC or redundancy), the partition storage is updated; however the output is still held at the default state via an output mux.
+After all reads are complete, the OTP controller performs integrity checks on the HW_CFG and SECRET* partitions.
+If a partition fails the integrity checks at this point it would signal an initialization error in the status CSR and abort further initialization.
+
+After all integrity checks are complete, the OTP controller releases the output gating and marks outputs as valid.
+However, any partition marked with "error" continues to hold its output in the default state.
+
+Once the above steps are complete, the partition storage in buffered registers is not updated again (except for updates to the life cycle partition through the life cycle interface).
+I.e., values programmed to OTP via the programming interface will not be visible in buffered registers until after the next power cycle.
+
+At this point, outputs of the partition storage are NOT expected to change unless a periodic check suddenly fails.
+When this failure occurs, all outputs are reverted to their default state, and an alert is immediately triggered to the alert handler.
+For timing purposes, OTP outputs can be treated as semi-static, as this error event should be rare and exceptional.
+
+
+## Partition Defaults
+
+Partition defaults are context specific.
+For example, a hardware configuration item that locks down specific access should default to "no access".
+This ensures that a glitch attack on the OTP cannot easily revert the design to an insecure state.
+
+This hence suggests that when an OTP is all 0's or all 1's, it should, whenever possible, reflect an invalid or inert state in the encoding space of the affected item.
+This also implies the reset state of consuming agents (for example key manager and life cycle), should default to invalid / inert state as well.
+
+
+## Program and Read Ports
+
+As shown previously, the OTP is split into a front and back end.
+The back-end interface is primarily used to update OTP contents, and read back for debug and verification purposes.
+Despite being a separate functional access port from the logical partitions, the program and read ports are subjected to the same access controls.
+
+When a partition is write-locked, programming accesses are disallowed.
+If the partition is secret, read accesses by the back-end interface are also disallowed (except for the digest which must always be readable).
+Software can also disable any read accesses to the software configuration partitions via CSR settings to prevent later stage software from reading any content.
+
+The exception to the above is the life cycle partition.
+The life cycle controller interface also acts as a "back-end" interface that always has programming access to ensure life cycle state can be advanced.
+
+Note, the program and read ports can conflict with ongoing background storage checks, and the OTP controller arbitrates between these two sides.
+An in-progress operation will always be completed.
+Afterwards, or when two requests arrive at the same time, the priority is life cycle > programming interface > on-demand read accesses via CSR windows > background checks.
+
+
+## Programming the OTP
+
+The OTP controller has two programming paths:
+
+1. A functional programming path through software (the program port).
+2. A life cycle programming path through hardware.
+
+The functional interface is used to update all partitions except for life cycle.
+As mentioned previously, any updates made during the current power cycle are **NOT** reflected in the buffered partitions until the next reboot.
+
+The life cycle interface is used to update the life cycle state and transition counter only.
+The commands are issued from the [life cycle controller](../../lc_ctrl/README.md), and similarly, successful or failed indications are also sent back to the life cycle controller.
+Similar to the functional interface, the life cycle controller allows only one update per power cycle, and after a requested transition reverts to an inert state until reboot.
+
+For more details on how the software programs the OTP, please refer to the [Programmer's Guide](#programmers-guide) further below.
+
+
+## Hardware Interfaces
+
+### Parameters
+
+The following table lists the instantiation parameters of OTP.
+Note that parameters prefixed with `RndCnst` are random netlist constants that need to be regenerated via topgen before the tapeout (typically by the silicon creator).
+
+Parameter                   | Default (Max) | Top Earlgrey | Description
+----------------------------|---------------|--------------|---------------
+`AlertAsyncOn`              | 2'b11         | 2'b11        |
+`RndCnstLfsrSeed`           | (see RTL)     | (see RTL)    | Seed to be used for the internal 40bit partition check timer LFSR. This needs to be replaced by the silicon creator before the tapeout.
+`RndCnstLfsrPerm`           | (see RTL)     | (see RTL)    | Permutation to be used for the internal 40bit partition check timer LFSR. This needs to be replaced by the silicon creator before the tapeout.
+`RndCnstKey`                | (see RTL)     | (see RTL)    | Random scrambling keys for secret partitions, to be used in the [scrambling datapath](#scrambling-datapath).
+`RndCnstDigestConst`        | (see RTL)     | (see RTL)    | Random digest finalization constants, to be used in the [scrambling datapath](#scrambling-datapath).
+`RndCnstDigestIV`           | (see RTL)     | (see RTL)    | Random digest initialization vectors, to be used in the [scrambling datapath](#scrambling-datapath).
+`RndCnstRawUnlockToken`     | (see RTL)     | (see RTL)    | Global RAW unlock token to be used for the first life cycle transition. See also [conditional life cycle transitions](../../lc_ctrl/README.md#conditional-transitions).
+
+### Signals
+
+* [Interface Tables](../data/otp_ctrl.hjson#interfaces)
+
+The OTP controller contains various interfaces that connect to other comportable IPs within OpenTitan, and these are briefly explained further below.
+
+#### EDN Interface
+
+The entropy request interface talks to EDN in order to fetch fresh entropy for ephemeral SRAM scrambling key derivation and the LFSR counters for background checks.
+It is comprised of the `otp_edn_o` and `otp_edn_i` signals and follows a req / ack protocol.
+
+See also [EDN documentation](../../edn/README.md).
+
+#### Power Manager Interface
+
+The power manager interface is comprised of three signals overall: an initialization request (`pwr_otp_i.otp_init`), an initialization done response (`pwr_otp_o.otp_done`) and an idle indicator (`pwr_otp_o.otp_idle`).
+
+The power manager asserts `pwr_otp_i.otp_init` in order to signal to the OTP controller that it can start initialization, and the OTP controller signals completion of the initialization sequence by asserting `pwr_otp_o.otp_done` (the signal will remain high until reset).
+
+The idle indication signal `pwr_otp_o.otp_idle` indicates whether there is an ongoing write operation in the Direct Access Interface (DAI) or Life Cycle Interface (LCI), and the power manager uses that indication to determine whether a power down request needs to be aborted.
+
+Since the power manager may run in a different clock domain, the `pwr_otp_i.otp_init` signal is synchronized within the OTP controller.
+The power manager is responsible for synchronizing the `pwr_otp_o.otp_done` and `pwr_otp_o.otp_idle` signals.
+
+See also [power manager documentation](../../pwrmgr/README.md).
+
+#### Life Cycle Interfaces
+
+The interface to the life cycle controller can be split into three functional sub-interfaces (vendor test, state output, state transitions), and these are explained in more detail below.
+Note that the OTP and life cycle controllers are supposed to be in the same clock domain, hence no additional signal synchronization is required.
+See also [life cycle controller documentation](../../lc_ctrl/README.md) for more details.
+
+##### Vendor Test Signals
+
+The `lc_otp_vendor_test_i` and `lc_otp_vendor_test_o` signals are connected to a 32bit control and a 32bit status register in the life cycle TAP, respectively, and are directly routed to the `prim_otp` wrapper.
+These control and status signals may be used by the silicon creator to exercise the OTP programming smoke checks on the VENDOR_TEST partition.
+The signals are gated with the life cycle state inside the life cycle controller such that they do not have any effect in production life cycle states.
+
+##### State, Counter and Token Output
+
+After initialization, the life cycle partition contents, as well as the tokens and personalization status, are output to the life cycle controller via the `otp_lc_data_o` struct.
+The life cycle controller uses this information to determine the life cycle state, and steer the appropriate qualifier signals.
+Some of these qualifier signals (`lc_dft_en_i`, `lc_creator_seed_sw_rw_en_i`, `lc_seed_hw_rd_en_i` and `lc_escalate_en_i`) are fed back to the OTP controller in order to ungate testing logic to the OTP macro; enable SW write access to the `SECRET2` partition; enable hardware read access to the root key in the `SECRET2` partition; or to push the OTP controller into escalation state.
+
+A possible sequence for the signals described is illustrated below.
+```wavejson
+{signal: [
+  {name: 'clk_i',                           wave: 'p.................'},
+  {name: 'otp_lc_data_o.valid',             wave: '0.|...|.1.|...|...'},
+  {name: 'otp_lc_data_o.state',             wave: '03|...|...|...|...'},
+  {name: 'otp_lc_data_o.count',             wave: '03|...|...|...|...'},
+  {},
+  {name: 'otp_lc_data_o.test_unlock_token', wave: '0.|...|.3.|...|...'},
+  {name: 'otp_lc_data_o.test_exit_token',   wave: '0.|...|.3.|...|...'},
+  {name: 'otp_lc_data_o.test_tokens_valid', wave: '0.|...|.3.|...|...'},
+  {},
+  {name: 'otp_lc_data_o.rma_token',         wave: '0.|.3.|...|...|...'},
+  {name: 'otp_lc_data_o.rma_token_valid',   wave: '0.|.3.|...|...|...'},
+  {},
+  {name: 'otp_lc_data_o.secrets_valid',     wave: '0.|.3.|...|...|...'},
+  {},
+  {name: 'lc_creator_seed_sw_rw_en_i',      wave: '0.|...|...|.4.|...'},
+  {name: 'lc_seed_hw_rd_en_i',              wave: '0.|...|...|.4.|...'},
+  {name: 'lc_dft_en_i',                     wave: '0.|...|...|.4.|...'},
+  {},
+  {name: 'lc_escalate_en_i',                wave: '0.|...|...|...|.5.'},
+]}
+```
+
+Note that the `otp_lc_data_o.valid` signal is only asserted after the `LIFE_CYCLE`, `SECRET0` and `SECRET2` partitions have successfully initialized, since the life cycle collateral contains information from all three partitions.
+The `otp_lc_data_o.test_tokens_valid` and `otp_lc_data_o.rma_token_valid` signals are multibit valid signals indicating whether the corresponding tokens are valid.
+The ``otp_lc_data_o.secrets_valid`` signal is a multibit valid signal that is set to `lc_ctrl_pkg::On` iff the `SECRET2` partition containing the root keys has been locked with a digest.
+
+
+##### State Transitions
+
+In order to perform life cycle state transitions, the life cycle controller can present the new value of the life cycle state and counter via the programming interface as shown below:
+
+```wavejson
+{signal: [
+  {name: 'clk_i',                          wave: 'p.......'},
+  {name: 'lc_otp_program_i.req',           wave: '01.|..0.'},
+  {name: 'lc_otp_program_i.state',         wave: '03.|..0.'},
+  {name: 'lc_otp_program_i.count',         wave: '03.|..0.'},
+  {name: 'lc_otp_program_o.ack',           wave: '0..|.10.'},
+  {name: 'lc_otp_program_o.err',           wave: '0..|.40.'},
+]}
+```
+
+The request must remain asserted until the OTP controller has responded by asserting `lc_otp_program_o.ack`.
+An error is fatal and indicates that the OTP programming operation has failed.
+
+Note that the new state must not clear any bits that have already been programmed to OTP - i.e., the new state must be incrementally programmable on top of the previous state.
+There are hence some implications on the life cycle encoding due to the ECC employed, see [life cycle state encoding](../../lc_ctrl/README.md#life-cycle-manufacturing-state-encodings) for details.
+
+Note that the behavior of the `lc_otp_program_i.otp_test_ctrl` signal is vendor-specific, and hence the signal is set to `x` in the timing diagram above.
+The purpose of this signal is to control vendor-specific test mechanisms, and its value will only be forwarded to the OTP macro in RAW, TEST_* and RMA states.
+In all other life cycle states this signal will be clamped to zero.
+
+#### Interface to Key Manager
+
+The interface to the key manager is a simple struct that outputs the CREATOR_ROOT_KEY_SHARE0 and CREATOR_ROOT_KEY_SHARE1 keys via `otp_keymgr_key_o` if these secrets have been provisioned and locked (via CREATOR_KEY_LOCK).
+Otherwise, this signal is tied to a random netlist constant.
+
+Since the key manager may run in a different clock domain, the key manager is responsible for synchronizing the `otp_keymgr_key_o` signals.
+
+#### Interface to Flash Scrambler
+
+The interface to the FLASH scrambling device is a simple req/ack interface that provides the flash controller with the two 128bit keys for data and address scrambling.
+
+The keys can be requested as illustrated below:
+
+```wavejson
+{signal: [
+  {name: 'clk_i',                      wave: 'p...........'},
+  {name: 'flash_otp_key_i.data_req',   wave: '01.|..0.|...'},
+  {name: 'flash_otp_key_i.addr_req',   wave: '01.|....|..0'},
+  {name: 'flash_otp_key_o.data_ack',   wave: '0..|.10.|...'},
+  {name: 'flash_otp_key_o.addr_ack',   wave: '0..|....|.10'},
+  {name: 'flash_otp_key_o.key',        wave: '0..|.30.|.40'},
+  {name: 'flash_otp_key_o.seed_valid', wave: '0..|.10.|.10'},
+]}
+```
+
+The keys are derived from the FLASH_DATA_KEY_SEED and FLASH_ADDR_KEY_SEED values stored in the `SECRET1` partition using the [scrambling primitive](#scrambling-datapath).
+If the key seeds have not yet been provisioned, the keys are derived from all-zero constants, and the `flash_otp_key_o.seed_valid` signal will be set to 0 in the response.
+
+Note that the req/ack protocol runs on the OTP clock.
+It is the task of the scrambling device to synchronize the handshake protocol by instantiating the `prim_sync_reqack.sv` primitive as shown below.
+
+![OTP Key Req Ack](../doc/otp_ctrl_key_req_ack.svg)
+
+Note that the key and nonce output signals on the OTP controller side are guaranteed to remain stable for at least 62 OTP clock cycles after the `ack` signal is pulsed high, because the derivation of a 64bit half-key takes at least two passes through the 31-cycle PRESENT primitive.
+Hence, if the scrambling device clock is faster than, or of the same order of magnitude as, the OTP clock, the data can be directly sampled upon assertion of `src_ack_o`.
+If the scrambling device runs on a significantly slower clock than OTP, an additional register (as indicated with dashed grey lines in the figure) has to be added.
+
+#### Interfaces to SRAM and OTBN Scramblers
+
+The interfaces to the SRAM and OTBN scrambling devices follow a req / ack protocol, where the scrambling device first requests a new ephemeral key by asserting the request channel (`sram_otp_key_i[*]`, `otbn_otp_key_i`).
+The OTP controller then fetches entropy from EDN and derives an ephemeral key using the SRAM_DATA_KEY_SEED and the [PRESENT scrambling data path](#scrambling-datapath).
+Finally, the OTP controller returns a fresh ephemeral key via the response channels (`sram_otp_key_o[*]`, `otbn_otp_key_o`), which completes the req / ack handshake.
+The wave diagram below illustrates this process for the OTBN scrambling device.
+
+```wavejson
+{signal: [
+  {name: 'clk_i',                     wave: 'p.......'},
+  {name: 'otbn_otp_key_i.req',        wave: '01.|..0.'},
+  {name: 'otbn_otp_key_o.ack',        wave: '0..|.10.'},
+  {name: 'otbn_otp_key_o.nonce',      wave: '0..|.30.'},
+  {name: 'otbn_otp_key_o.key',        wave: '0..|.30.'},
+  {name: 'otbn_otp_key_o.seed_valid', wave: '0..|.10.'},
+]}
+```
+
+If the key seeds have not yet been provisioned, the keys are derived from all-zero constants, and the `*.seed_valid` signal will be set to 0 in the response.
+It should be noted that this mechanism requires the EDN and entropy distribution network to be operational, and a key derivation request will block if they are not.
+
+Note that the req/ack protocol runs on the OTP clock.
+It is the task of the scrambling device to perform the synchronization as described in the previous subsection on the [flash scrambler interface](#interface-to-flash-scrambler).
+
+#### Hardware Config Bits
+
+The bits of the HW_CFG partition are output via the `otp_hw_cfg_o` struct.
+IPs that consume collateral stored in this partition shall connect to this struct via the topgen feature, and break out the appropriate bits by either accessing the correct index or using the struct fields.
+These fields are autogenerated from the memory map items allocated to the HW_CFG partition, and the autogenerated struct type can be found in the `otp_ctrl_part_pkg.sv` package.
+Note that it is the task of the receiving IP to synchronize these bits accordingly to the local clock.
+For convenience, a valid bit is also available in that struct.
+The valid bit indicates that the HW_CFG partition has initialized.
+
+### Parameter and Memory Map Changes after D3/V3
+
+Note that all instantiation parameters can be changed without affecting D3/V3 status of the module.
+Similarly, it is permissible to change the contents (partition size, adding and removing items) of the `CREATOR_SW_CFG`, `OWNER_SW_CFG` and `HW_CFG` partitions without affecting D3 status.
+Note however that partition size changes may affect V3 coverage metrics, hence if the size of any of the above three partitions is changed, V3 needs to be re-assessed.
+
+## Design Details
+
+### Block Diagram
+
+The following is a high-level block diagram that illustrates everything that has been discussed.
+
+![OTP Controller Block Diagram](../doc/otp_ctrl_blockdiag.svg)
+
+Each of the partitions P0-P7 has its [own controller FSM](#partition-controllers) that interacts with the OTP wrapper and the [scrambling datapath](#scrambling-datapath) to fulfill its tasks.
+The partitions expose the address ranges and access control information to the Direct Access Interface (DAI) in order to block accesses that go to locked address ranges.
+Further, the only two blocks that have (conditional) write access to the OTP are the DAI and the Life Cycle Interface (LCI) blocks.
+The partitions can only issue read transactions to the OTP macro.
+Note that the access ranges of the DAI and the LCI are mutually exclusive.
+I.e., the DAI cannot read from nor write to the life cycle partition.
+The LCI cannot read the OTP, but is allowed to write to the life cycle partition.
+
+The CSR node on the left side of this diagram connects to the DAI, the OTP partitions (P0-P7) and the OTP wrapper through a gated TL-UL interface.
+All connections from the partitions to the CSR node are read-only, and typically only carry a subset of the information available.
+E.g., the secret partitions only expose their digest value via the CSRs.
+
+The Key Derivation Interface (KDI) on the bottom right side interacts with the scrambling datapath, the EDN and the partition holding the scrambling root keys in order to derive static and ephemeral scrambling keys for FLASH and SRAM scrambling.
+
+The test access gate shown at the top of the block diagram is governed by the life cycle qualification signal `dft_en_i`, which is only enabled during the TEST_UNLOCKED* life cycle states.
+Otherwise, test access via this TL-UL window is locked down.
+
+In addition to the blocks mentioned so far, the OTP controller also contains an LFSR timer that creates pseudo-randomly distributed partition check requests, and provides pseudo random data at high bandwidth in the event of a secure erase request due to chip-wide alert escalation.
+For security reasons, the LFSR is periodically reseeded with entropy coming from EDN.
+
+### Data Allocation and Packing
+#### Software View
+
+The effective word width of an OTP IP typically depends on a couple of factors, including the redundancy scheme employed.
+For the design at hand, it is assumed that this native OTP word-width is 16bit.
+For software convenience, however, these details are abstracted and the open-source OTP controller exposes the OTP storage as a linear address space of 32bit words, which is aligned with the machine word size of the Ibex processor.
+Since the OTP IP employs a redundancy mechanism similar to ECC, this implies however that write operations take place at a granularity of 32bit blocks for non-secret and 64bit blocks for secret partitions (due to the scrambling).
+Hence, software is responsible to appropriately pack and program items, since each 32bit location can only be programmed once.
+
+#### Life Cycle View
+
+Since the life cycle partition is the only partition that needs live updates in-field, proper care must be taken to properly encode data in this partition such that incremental updates are possible.
+The life cycle state is hence encoded such that incremental updates to the state are always carried out at the granularity of a 16bit word.
+Further, the life cycle transition counter is encoded such that each stroke consumes a full 16bit word for the same reason.
+
+See [life cycle controller documentation](../../lc_ctrl/README.md) for more details on the life cycle encoding.
+
+### Partition Controllers
+
+In RTL, we distinguish between buffered and unbuffered partition modules.
+These are parameterized, such that we can assemble the array of OTP partitions with these two modules only.
+The corresponding controller FSMs are explained in more detail below.
+
+#### Unbuffered Partition
+
+![Unbuffered Partition FSM](../doc/otp_ctrl_unbuf_part_fsm.svg)
+
+As shown above, the unbuffered partition module has a relatively simple controller FSM that only reads out the digest value of the partition upon initialization, and then basically waits for TL-UL read transactions to its corresponding window in the CSR space.
+
+Write access through the DAI will be locked in case the digest is set to a non-zero value.
+Also, read access through the DAI and the CSR window can be locked at runtime via a CSR.
+Read transactions through the CSR window will error out if they are out of bounds, or if read access is locked.
+
+Note that unrecoverable [OTP errors](#generalized-open-source-interface), ECC failures in the digest register or external escalation via `lc_escalate_en` will move the partition controller into a terminal error state.
+
+#### Buffered Partition
+
+![Buffered Partition FSM](../doc/otp_ctrl_buf_part_fsm.svg)
+
+The controller FSM of the buffered partition module is more complex than the unbuffered counterpart, since it has to account for scrambling and digest calculation.
+
+Upon initialization, the controller reads out the whole partition and descrambles it on the fly if needed.
+
+Then, right after the initial readout, the partition controller jumps into the first integrity check, which behaves somewhat differently, depending on whether the partition is digest protected (or not) and/or scrambled (or not).
+If the partition is not digest protected, or if the digest has not yet been computed, the check completes right away, and the buffered values are released for hardware broadcast.
+Otherwise, the partition contents in the buffer registers are re-scrambled if needed, and a digest is computed on the fly.
+If the computed digest matches with the one that has been read out before, the buffered registers are released for hardware broadcast.
+Otherwise, the buffered values are gated to their default, and an alert is triggered through the error handling logic.
+
+After initialization, the integrity check (as described above) and the consistency check can be triggered by the LFSR timer mechanism on a periodic basis.
+
+The consistency check behaves differently, depending on whether the partition is digest protected or not.
+If it is, the consistency check will read out the digest stored in OTP and compare it with the value stored in the buffer register.
+Otherwise, if no digest is available, the controller will read out the whole partition from OTP, and compare it to the contents stored in the buffer registers.
+In case of a mismatch, the buffered values are gated to their default, and an alert is triggered through the error handling logic.
+
+Note that in case of unrecoverable OTP errors or ECC failures in the buffer registers, the partition controller FSM is moved into a terminal error state, which locks down all access through DAI and clamps the values that are broadcast in hardware to their defaults.
+
+External escalation via the `lc_escalate_en` signal will move the partition controller FSM into the terminal error state as well.
+See [life cycle controller documentation](../../lc_ctrl/README.md) for more details.
+
+### Direct Access Interface Control
+
+![Direct Access Interface FSM](../doc/otp_ctrl_dai_fsm.svg)
+
+Upon reset release, the DAI controller first sends an initialization command to the OTP macro.
+Once the OTP macro becomes operational, an initialization request is sent to all partition controllers, which will read out and initialize the corresponding buffer registers.
+The DAI then becomes operational once all partitions have initialized, and supports read, write and digest calculation commands (see [here](#direct-access-interface) for more information about how to interact with the DAI through the CSRs).
+
+Read and write commands transfer either 32bit or 64bit of data from the OTP to the corresponding CSR and vice versa. The access size is determined automatically, depending on whether the partition is scrambled or not. Also, (de)scrambling is performed transparently, depending on whether the partition is scrambled or not.
+
+Digest calculation commands read out the complete contents of a particular partition, compute a digest and write that digest value to the predefined location at the end of the partition.
+
+Note that any unrecoverable OTP error will move the DAI into a terminal error state, where all access through the DAI will be locked.
+Also, the DAI consumes the read and write access information provided by the partition controller, and if a certain read or write access is not permitted, a recoverable error will be flagged in the status / error CSRs.
+
+### Life Cycle Interface Control
+
+![Life Cycle Interface FSM](../doc/otp_ctrl_lci_fsm.svg)
+
+Upon reset release the LCI FSM waits until the OTP controller has initialized and the LCI gets enabled.
+Once it is in the idle state, life cycle state updates can be initiated via the life cycle interface as [described here](#state-transitions).
+The LCI controller takes the life cycle state to be programmed and writes all 16bit words to OTP.
+In case of unrecoverable OTP errors, the FSM signals an error to the life cycle controller and moves into a terminal error state.
+
+### Key Derivation Interface
+
+![Key Derivation Interface FSM](../doc/otp_ctrl_kdi_fsm.svg)
+
+Upon reset release the KDI FSM waits until the OTP controller has initialized and the KDI gets enabled.
+Once it is in the idle state, key derivation can be requested via the [flash](#interface-to-flash-scrambler) and [sram](#interfaces-to-sram-and-otbn-scramblers) interfaces.
+Based on which interface makes the request, the KDI controller will evaluate a variant of the PRESENT digest mechanism as described in more detail below.
+
+### Scrambling Datapath
+
+![OTP Digest Mechanism](../doc/otp_ctrl_digest_mechanism.svg)
+
+The scrambling datapath is built around an iterative implementation of the [PRESENT lightweight cipher](../../prim/doc/prim_present.md) that performs one round per cycle.
+The datapath contains some additional multiplexing circuitry to enable the DAI, KDI and partition controllers to evaluate different functions with the same datapath.
+The algorithmic steps of these functions are explained in more detail below.
+
+#### Scrambling
+
+As illustrated in subfigure a) in the diagram above, the standard 128bit-key PRESENT configuration with 31 rounds is used for scrambling operations.
+The key used for scrambling is a global netlist constant chosen by the silicon creator, and all secret partitions are encrypted using their own distinct netlist constant.
+Note that the amount of data that is being scrambled is small (160byte = 20 x 64bit blocks) and the scrambled data remains constant.
+Hence, no additional masking or diversification scheme is applied since only a very limited amount of information can be gathered by observing the scrambling operation via side-channels.
+
+#### Digest Calculation
+
+The integrity digests used in the [partition checks](#partition-checks) are computed using a custom [Merkle-Damgard](https://en.wikipedia.org/wiki/Merkle%E2%80%93Damg%C3%A5rd_construction) scheme, where the employed one-way compression function F is constructed by using PRESENT in a [Davies-Meyer arrangement](https://en.wikipedia.org/wiki/One-way_compression_function#Davies%E2%80%93Meyer).
+This is illustrated in subfigure b).
+
+At the beginning of the digest calculation the 64bit state is initialized with an initialization vector (IV).
+Then, the data to be digested is split into 128bit chunks, each of which is used as a 128bit key input for updating the 64bit state with the compression function F.
+Chunks that are not aligned with 128bit are padded with zero, and the finalization operation consists of another 31-round encryption pass with a finalization constant.
+Note that both the IV as well as the finalization constant are global netlist constants chosen by the silicon creator.
+
+#### Scrambling Key Derivation
+
+The key derivation functions for ephemeral SRAM and static FLASH scrambling keys employ a similar construction as the digest calculation function.
+In particular, the keys are derived by repeatedly reducing a (partially random) block of data into a 64bit block, as illustrated in subfigures c) and d).
+
+For ephemeral SRAM scrambling keys, the data block is composed of the 128bit SRAM_DATA_KEY_SEED stored in OTP, as well as 128bit of fresh entropy fetched from the EDN.
+This process is repeated twice in order to produce a 128bit key.
+
+For static FLASH scrambling keys, the data block is composed of a 128bit part of either the FLASH_DATA_KEY_SEED or the FLASH_ADDR_KEY_SEED stored in OTP.
+These key seeds are 256bit in size, which allows a unique 128bit chunk of key seed data to be used to derive each 64bit half of a particular scrambling key.
+
+Note that the IV and finalization constants are distinct for SRAM and FLASH data and FLASH address scrambling keys.
+These constants are chosen by the silicon creator prior to the tapeout.
+
+### Access Arbitration
+
+Access to the OTP wrapper and the scrambling datapath are both round-robin arbitrated, where the former arbitration occurs at cycle level (i.e., individual OTP memory accesses), and the latter occurs at the level of complete transactions (i.e., full digest or encryption).
+Arbitration at transaction level is implemented similarly to cycle-based arbitration, with the difference that the grant signals remain asserted until the requestor deasserts the request (thereby releasing the arbiter, which acts as a mutex in this case).
+This behavior is illustrated in the example below.
+
+```wavejson
+{signal: [
+  {name: 'clk_i',                  wave: 'p............'},
+  {name: 'part_scrmbl_mtx_req[0]', wave: '01....0.1....'},
+  {name: 'part_scrmbl_mtx_req[1]', wave: '0.1......0...'},
+  {name: 'part_scrmbl_mtx_req[2]', wave: '0.1........0.'},
+  {},
+  {name: 'part_scrmbl_mtx_gnt[0]', wave: '01....0....1.'},
+  {name: 'part_scrmbl_mtx_gnt[1]', wave: '0.....1..0...'},
+  {name: 'part_scrmbl_mtx_gnt[2]', wave: '0........1.0.'},
+]}
+```
+
+### Primitive Wrapper and FPGA Emulation
+
+![OTP Wrapper Block Diagram](../doc/otp_ctrl_prim_otp.svg)
+
+The OTP IP is wrapped up in a primitive wrapper that exposes a TL-UL interface for testing purposes, and a generalized open-source interface for functional operation (described below).
+Any OTP redundancy mechanism like per-word ECC is assumed to be handled inside the wrapper, which means that the word width exposed as part of the generalized interface is the effective word width.
+
+Note that the register space exposed via the TL-UL test interface, as well as DFT and power-related signals are dependent on the underlying proprietary OTP IP.
+They are therefore not further described in this document.
+
+#### Generalized Open-source Interface
+
+The generalized open-source interface uses a couple of parameters (defaults set for Earlgrey configuration).
+
+Parameter      | Default | Top Earlgrey  | Description
+---------------|---------|---------------|---------------
+`Width`        | 16      | 16            | Native OTP word width.
+`Depth`        | 1024    | 1024          | Depth of OTP macro.
+`CmdWidth`     | 2       | 2             | Width of the OTP command.
+`ErrWidth`     | 3       | 3             | Width of error code output signal.
+`PwrSeqWidth`  | 2       | 2             | Width of power sequencing signals to/from AST.
+`SizeWidth`    | 2       | 2             | Width of the size field.
+`IfWidth`      | 2^`SizeWidth` * `Width` | 2^`SizeWidth` * `Width` | Data interface width.
+
+The generalized open-source interface is a simple command interface with a ready / valid handshake that makes it possible to introduce back pressure if the OTP macro is not able to accept a command due to an ongoing operation.
+
+In order to facilitate the scrambling and digest operations, the data width has been sized such that data blocks up to the PRESENT block size (64bit) can be transferred across the generalized interface. The actual size of a transfer is determined via the size_i field. Transfer sizes are specified in multiples of the native OTP block size, as listed below.
+
+Value of `size_i` | #Native OTP Words | Bit Slice
+------------------|-------------------|------------
+2'b00             |                 1 | `{word0} = data[15:0]`
+2'b01             |                 2 | `{word1, word0} = data[31:0]`
+2'b10             |                 3 | `{word2, word1, word0} = data[47:0]`
+2'b11             |                 4 | `{word3, word2, word1, word0} = data[63:0]`
+
+Responses are returned in-order via a unidirectional response interface (i.e., without back pressure capability).
+Downstream logic must be able to sink the response in any case.
+The response optionally carries read data, depending on whether the operation that took place was a read or not.
+Also, an error signal returns a non-zero error code in case an error occurred while carrying out the OTP command.
+
+The signals pertaining to the generalized open-source interface are listed below.
+
+Signal                  | Direction        | Type                        | Description
+------------------------|------------------|-----------------------------|---------------
+`fatal_alert_o`         | `output`         | `logic`                     | Fatal alert output from the primitive. This is connected to a separate alert channel in the instantiating IP. The instantiating IP latches the alert indication and continuously outputs alert events until reset.
+`recov_alert_o`         | `output`         | `logic`                     | Recoverable alert output from the primitive. This is connected to a separate alert channel in the instantiating IP. Should only be pulsed high for each alert occurrence. The instantiating IP then sends out a single alert event for each pulse.
+`ready_o`               | `output`         | `logic`                     | Ready signal for the command handshake.
+`valid_i`               | `input`          | `logic`                     | Valid signal for the command handshake.
+`size_i`                | `input`          | `logic [SizeWidth-1:0]`     | Number of native OTP words to transfer, minus one: `2'b00 = 1 native word` ... `2'b11 = 4 native words`.
+`cmd_i`                 | `input`          | `logic [CmdWidth-1:0]`      | OTP command: `2'b00 = read`, `2'b01 = write`, `2'b11 = initialize`
+`addr_i`                | `input`          | `logic [$clog2(Depth)-1:0]` | OTP word address.
+`wdata_i`               | `input`          | `logic [IfWidth-1:0]`       | Write data for write commands.
+`valid_o`               | `output`         | `logic`                     | Valid signal for command response.
+`rdata_o`               | `output`         | `logic [IfWidth-1:0]`       | Read data from read commands.
+`err_o`                 | `output`         | `logic [ErrWidth-1:0]`      | Error code.
+
+The `prim_otp` wrapper implements the `Macro*` error codes (0x0 - 0x4) defined in the [OTP error handling](#error-handling).
+
+The timing diagram below illustrates the timing of a command.
+Note that both read and write commands return a response, and each command is independent of the previously issued commands.
+The latency from accepting a command to returning a response depends on the underlying OTP IP and is typically larger than 10 cycles.
+The returned values depend on the command type and whether an error occurred or not.
+
+```wavejson
+{
+  signal: [
+    { name: 'clk_i',    wave: 'p.............' },
+    { name: 'ready_o',  wave: '0..10|.10.|...' , node: '...a...c'},
+    { name: 'valid_i',  wave: '01..0|1.0.|...' },
+    { name: 'size_i',   wave: '03..0|3.0.|...' },
+    { name: 'cmd_i',    wave: '04..0|4.0.|...' },
+    { name: 'wdata_i',  wave: '05..0|5.0.|...' },
+    { name: 'valid_o',  wave: '0....|..10|.10' , node: '........b...d'},
+    { name: 'rdata_o',  wave: '0....|..50|.50' },
+    { name: 'err_o',    wave: '0....|..40|.40' },
+  ],
+  edge: [
+   'a~>b',
+   'c~>d',
+  ],
+  head: {
+    text: 'Timing of an OTP command.',
+  },
+  foot: {
+    text: "Cmd's are accepted in cycles 3/7, and the corresponding responses return in cycles 8/12.",
+    tick: 0,
+  }
+}
+```
+
+Note that the open source OTP controller allows up to two outstanding OTP commands, meaning that it is permissible to acknowledge an incoming command and start working on it while the results of the last command are still in the process of being output (e.g., due to an output register stage).
+
+#### Generic Simulation and FPGA Emulation Model
+
+For open-source simulation and FPGA emulation, a synthesizable and generic OTP wrapper module is provided (`prim_generic_otp`).
+This is automatically selected in the OpenTitan build flow via the technology primitive mechanism if no proprietary OTP IP is available for a specific technology.
+The OTP storage in `prim_generic_otp` is emulated using a standard RAM primitive `prim_generic_ram_1p`.
+While this storage element is volatile, the primitive is constructed such that the contents are not wiped upon a system-wide reset.
+I.e., only a power-cycle wipes the RAM primitive, thereby enabling limited emulation of the OTP function and life cycle transitions also on an FPGA device.
diff --git a/hw/ip/pattgen/README.md b/hw/ip/pattgen/README.md
index 578ce4ca7056a..85a3596893b6d 100644
--- a/hw/ip/pattgen/README.md
+++ b/hw/ip/pattgen/README.md
@@ -29,117 +29,3 @@ The output channels may be activated and operated independently, or they can be
 ## Compatibility
 
 This IP block does not have any direct hardware compatibility requirements.
-
-# Theory of Operations
-
-The pattern can be started (or halted) on either channel by setting the corresponding [`CTRL.ENABLE`](data/pattgen.hjson#ctrl) bit to 1 (or 0) for the desired channel.
-Once disabled, either channel can be configured independently.
-The channel parameters (i.e. clock divider ratio, clock polarity, pattern length, pattern data, and repetition count) can all be programmed on a per-channel basis.
-Enabling the pattern generator channel starts the pattern from the beginning.
-
-Please note that writes to a channel's configuration registers have no effect while the channel is enabled.
-For operational simplicity, the configuration registers are only transferred into the internal finite state machines while a channel is disabled.
-Changes to the configuration registers only take effect once the channel has been disabled and re-enabled.
-
-## Block Diagram
-
-![](./doc/pattgen_block_diagram.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](data/pattgen.hjson#interfaces)
-
-## Design Details
-
-The design consists of two identical and independent finite state machines, each an instance of module `pattgen_fsm`.
-Each FSM is essentially three nested counters, with one counter to control the clock division, another to count out the sequence bits, and a third to keep count of the number of repetitions.
-
-Each FSM consists of
-- Inputs:
-    - `clk_io`, `reset`, `enable`, `clk_pre_divide`, `pattern`, `pattern_size`, `polarity`,  and `n_repeats`
-- Outputs:
-    - `pda` and `pcl`
-- a single state variable with three states `IDLE`, `ACTIVE`, and `END`,
-- a clock-divide counter, `clk_div`,
-- a single clock-divide flop, `clk_int`, and
-- two additional internal counters `bit_ctr` and `repeat_ctr`.
-
-Each FSM is disabled when `enable` is low.
-Disabling the FSM is equivalent to an FSM reset, and both operations place the FSM in the `IDLE` state.
-While in `IDLE`, the other state machine registers assume their default states:
-The internal counters, the clock-divide, `bit_ctr` and `repeat_ctr` all reset to 0, as does `clk_int`.
-
-Once the FSM is enabled, it transitions to the `ACTIVE` state.
-The clock-divide counter `clk_div` increments every cycle, except when it overflows matching the value applied to the `clk_pre_divide` input.
-Then `clk_div` resets to 0, toggling `clk_int` in the process.
-Two overflow events result in a complete clock cycle, resulting in an internal clock frequency of:
-$$f_{pclx}=\frac{f_\textrm{I/O clk}}{2(\textrm{CLK_RATIO}+1)}$$
-
-The FSM clock output, `pcl`, is directly driven by `clk_int`, unless the `polarity` input is high, in which case `pcl` is inverted from `clk_int`.
-
-The `bit_ctr` counter increments on every falling edge of `clk_int`, until it overflows at the pattern length based on the `pattern_size` input.
-
-In the `ACTIVE` state, the FSM `pda` output is driven by a multiplexer, connected to the `pattern` input.
-The value of `bit_ctr` selects the bit value from the appropriate sequence position, via this multiplexor.
-
-Finally whenever `bit_ctr` overflows and reverts to zero, the `repeat_ctr` increments, and the pattern starts again.
-Finally `repeat_ctr` overflows to zero as it reaches the input value `n_repeats`.
-When this overflow occurs, the FSM transitions to the `END` state.
-All counters halt, the `pda` data lines reset to zero, and an interrupt event is sent out to signal completion.
-
-The entire sequence can be restarted either by resetting or disabling and re-enabling the FSM.
-
-### Interrupts
-
-The pattern generator HWIP provides two interrupt pins, `done_ch0` and `done_ch1`, which indicate the completion of pattern generation on the output channels.
-These interrupts can be enabled/disabled by setting/un-setting the corresponding bits of the [`INTR_ENABLE`](data/pattgen.hjson#intr_enable) register.
-To clear the interrupts, bit `1` must be written the corresponding bits of [`INTR_STATE`](data/pattgen.hjson#intr_state) register
-
-# Programmers guide
-
-To start pattern generation, the register interface of the pattern generator HWIP should be properly initialized and configured.
-
-The guide that follows provides instructions for configuring Channel 0.
-To configure Channel 1, use the registers with the "CH1" suffix, instead of the "CH0" registers.
-
-To configure a single channel:
-1. Before configuration, disable the desired channel by clearing the enable bit, [`CTRL.ENABLE_CH0`](data/pattgen.hjson#ctrl).
-1. Set the polarity bit, [`CTRL.POLARITY_CH0`](data/pattgen.hjson#ctrl), to determine the desired clock phase.
-For either channel, a zero in the polarity bit indicates that the channel clock line (`pcl`) should start low, and the channel data line `pda` transitions on every falling edge of `pcl`.
-A one in the polarity bit inverts the `pcl` clock so that it starts high and `pda` transitions on the rising edge.
-The following waveform illustrates the effect of the `POLARITY` bit.
-Here both channels are configured for simultaneous pattern generation, but the two channels are configured for opposite polarity.
-```wavejson
-{signal: [
-  {name: 'CTRL.ENABLE_CH0', wave: 'lh......'},
-  {name: 'CTRL.POLARITY_CH0 (default: low)', wave: '0.......'},
-  {name: 'pcl0_tx', wave: '0.hlhlhl'},
-  {name: 'pda0_tx', wave: 'x3.3.3.3', data: 'DATA[0] DATA[1] DATA[2]'},
-  {name: 'CTRL.POLARITY_CH1 (high)', wave: '1.......'},
-  {name: 'pcl1_tx', wave: '1.lhlhlh'},
-  {name: 'pda1_tx', wave: 'x5.5.5.5', data: 'DATA[0] DATA[1] DATA[2]'},
-],
-  head: {text: 'Effect of the Polarity Registers',tick:0}}
-```
-
-1. Program the length of seed pattern using the length field, [`SIZE.LEN_CH0`](data/pattgen.hjson#size).
-Note that since the allowed seed length ranges from 1-64, the value of this field should be one less than the pattern length.
-For example, to generate an 16-bit pattern, a value of 15 should be written to the field [`SIZE.LEN_CH0`](data/pattgen.hjson#size).
-1. Program the seed pattern (between 1 and 64 bits in length) using the multi-register [`DATA_CH0_0`](data/pattgen.hjson#data_ch0_0) and [`DATA_CH0_1`](data/pattgen.hjson#data_ch0_1).
-The first 32 bits to be transmitted, are programmed in the lower half of the multi-register (i.e. [`DATA_CH0_0`](data/pattgen.hjson#data_ch0_0)), and the latter 32 bits are programmed in the upper half of the multi-register (i.e. [`DATA_CH0_1`](data/pattgen.hjson#data_ch0_1)).
-1. Program the clock divider ratio using the register [`PREDIV_CH0.CLK_RATIO`](data/pattgen.hjson#prediv_ch0).
-The resulting clock frequency will be slower than the input I/O clock by a ratio of 2&times;(CLK_RATIO+1):
-$$f_{pclx}=\frac{f_\textrm{I/O clk}}{2(\textrm{CLK_RATIO}+1)}$$
-1. Program the desired number of pattern repetitions using the repetition field [`SIZE.REPS_CH0`](data/pattgen.hjson#size).
-Note that since the allowed number of pattern repetitions ranges from 1-1024, the value of this field should be one less than the desired repetition count.
-For example, to repeat a pattern 30, a value of 29 should written to the field [`SIZE.REPS_CH0`](data/pattgen.hjson#size).
-1. Finally to start the pattern, set the [`CTRL.ENABLE_CH0`](data/pattgen.hjson#ctrl).
-To start both channel patterns at the same time, configure both channels then simultaneously assert both the [`CTRL.ENABLE_CH0`](data/pattgen.hjson#ctrl) and [`CTRL.ENABLE_CH1`](data/pattgen.hjson#ctrl) bits in the same register access.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_pattgen.h)
-
-## Register Table
-
-* [Register Table](data/pattgen.hjson#registers)
diff --git a/hw/ip/pattgen/doc/programmers_guide.md b/hw/ip/pattgen/doc/programmers_guide.md
new file mode 100644
index 0000000000000..320f8e81c1e51
--- /dev/null
+++ b/hw/ip/pattgen/doc/programmers_guide.md
@@ -0,0 +1,48 @@
+# Programmer's Guide
+
+To start pattern generation, the register interface of the pattern generator HWIP should be properly initialized and configured.
+
+The guide that follows provides instructions for configuring Channel 0.
+To configure Channel 1, use the registers with the "CH1" suffix, instead of the "CH0" registers.
+
+To configure a single channel:
+1. Before configuration, disable the desired channel by clearing the enable bit, [`CTRL.ENABLE_CH0`](../data/pattgen.hjson#ctrl).
+1. Set the polarity bit, [`CTRL.POLARITY_CH0`](../data/pattgen.hjson#ctrl), to determine the desired clock phase.
+For either channel, a zero in the polarity bit indicates that the channel clock line (`pcl`) should start low, and the channel data line `pda` transitions on every falling edge of `pcl`.
+A one in the polarity bit inverts the `pcl` clock so that it starts high and `pda` transitions on the rising edge.
+The following waveform illustrates the effect of the `POLARITY` bit.
+Here both channels are configured for simultaneous pattern generation, but the two channels are configured for opposite polarity.
+```wavejson
+{signal: [
+  {name: 'CTRL.ENABLE_CH0', wave: 'lh......'},
+  {name: 'CTRL.POLARITY_CH0 (default: low)', wave: '0.......'},
+  {name: 'pcl0_tx', wave: '0.hlhlhl'},
+  {name: 'pda0_tx', wave: 'x3.3.3.3', data: 'DATA[0] DATA[1] DATA[2]'},
+  {name: 'CTRL.POLARITY_CH1 (high)', wave: '1.......'},
+  {name: 'pcl1_tx', wave: '1.lhlhlh'},
+  {name: 'pda1_tx', wave: 'x5.5.5.5', data: 'DATA[0] DATA[1] DATA[2]'},
+],
+  head: {text: 'Effect of the Polarity Registers',tick:0}}
+```
+
+1. Program the length of the seed pattern using the length field, [`SIZE.LEN_CH0`](../data/pattgen.hjson#size).
+Note that since the allowed seed length ranges from 1-64, the value of this field should be one less than the pattern length.
+For example, to generate a 16-bit pattern, a value of 15 should be written to the field [`SIZE.LEN_CH0`](../data/pattgen.hjson#size).
+1. Program the seed pattern (between 1 and 64 bits in length) using the multi-register [`DATA_CH0_0`](../data/pattgen.hjson#data_ch0_0) and [`DATA_CH0_1`](../data/pattgen.hjson#data_ch0_1).
+The first 32 bits to be transmitted are programmed in the lower half of the multi-register (i.e. [`DATA_CH0_0`](../data/pattgen.hjson#data_ch0_0)), and the latter 32 bits are programmed in the upper half of the multi-register (i.e. [`DATA_CH0_1`](../data/pattgen.hjson#data_ch0_1)).
+1. Program the clock divider ratio using the register [`PREDIV_CH0.CLK_RATIO`](../data/pattgen.hjson#prediv_ch0).
+The resulting clock frequency will be slower than the input I/O clock by a ratio of 2&times;(CLK_RATIO+1):
+$$f_{pclx}=\frac{f_\textrm{I/O clk}}{2(\textrm{CLK_RATIO}+1)}$$
+1. Program the desired number of pattern repetitions using the repetition field [`SIZE.REPS_CH0`](../data/pattgen.hjson#size).
+Note that since the allowed number of pattern repetitions ranges from 1-1024, the value of this field should be one less than the desired repetition count.
+For example, to repeat a pattern 30 times, a value of 29 should be written to the field [`SIZE.REPS_CH0`](../data/pattgen.hjson#size).
+1. Finally, to start the pattern, set the [`CTRL.ENABLE_CH0`](../data/pattgen.hjson#ctrl) bit.
+To start both channel patterns at the same time, configure both channels then simultaneously assert both the [`CTRL.ENABLE_CH0`](../data/pattgen.hjson#ctrl) and [`CTRL.ENABLE_CH1`](../data/pattgen.hjson#ctrl) bits in the same register access.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_pattgen.h)
+
+## Register Table
+
+* [Register Table](../data/pattgen.hjson#registers)
diff --git a/hw/ip/pattgen/doc/theory_of_operation.md b/hw/ip/pattgen/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..f14262a218ca1
--- /dev/null
+++ b/hw/ip/pattgen/doc/theory_of_operation.md
@@ -0,0 +1,64 @@
+# Theory of Operation
+
+The pattern can be started (or halted) on either channel by setting the corresponding [`CTRL.ENABLE`](../data/pattgen.hjson#ctrl) bit to 1 (or 0) for the desired channel.
+Once disabled, either channel can be configured independently.
+The channel parameters (i.e. clock divider ratio, clock polarity, pattern length, pattern data, and repetition count) can all be programmed on a per-channel basis.
+Enabling the pattern generator channel starts the pattern from the beginning.
+
+Please note that writes to a channel's configuration registers have no effect while the channel is enabled.
+For operational simplicity, the configuration registers are only transferred into the internal finite state machines while a channel is disabled.
+Changes to the configuration registers only take effect once the channel has been disabled and re-enabled.
+
+## Block Diagram
+
+![](../doc/pattgen_block_diagram.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/pattgen.hjson#interfaces)
+
+## Design Details
+
+The design consists of two identical and independent finite state machines, each an instance of module `pattgen_fsm`.
+Each FSM is essentially three nested counters, with one counter to control the clock division, another to count out the sequence bits, and a third to keep count of the number of repetitions.
+
+Each FSM consists of
+- Inputs:
+    - `clk_io`, `reset`, `enable`, `clk_pre_divide`, `pattern`, `pattern_size`, `polarity`,  and `n_repeats`
+- Outputs:
+    - `pda` and `pcl`
+- a single state variable with three states `IDLE`, `ACTIVE`, and `END`,
+- a clock-divide counter, `clk_div`,
+- a single clock-divide flop, `clk_int`, and
+- two additional internal counters `bit_ctr` and `repeat_ctr`.
+
+Each FSM is disabled when `enable` is low.
+Disabling the FSM is equivalent to an FSM reset, and both operations place the FSM in the `IDLE` state.
+While in `IDLE`, the other state machine registers assume their default states:
+The internal counters `clk_div`, `bit_ctr` and `repeat_ctr` all reset to 0, as does `clk_int`.
+
+Once the FSM is enabled, it transitions to the `ACTIVE` state.
+The clock-divide counter `clk_div` increments every cycle until it matches the value applied to the `clk_pre_divide` input.
+Then `clk_div` resets to 0, toggling `clk_int` in the process.
+Two overflow events result in a complete clock cycle, resulting in an internal clock frequency of:
+$$f_{pclx}=\frac{f_\textrm{I/O clk}}{2(\textrm{CLK_RATIO}+1)}$$
+
+The FSM clock output, `pcl`, is directly driven by `clk_int`, unless the `polarity` input is high, in which case `pcl` is inverted from `clk_int`.
+
+The `bit_ctr` counter increments on every falling edge of `clk_int`, until it overflows at the pattern length based on the `pattern_size` input.
+
+In the `ACTIVE` state, the FSM `pda` output is driven by a multiplexer, connected to the `pattern` input.
+The value of `bit_ctr` selects the bit value from the appropriate sequence position, via this multiplexer.
+
+Whenever `bit_ctr` overflows and reverts to zero, the `repeat_ctr` increments, and the pattern starts again.
+Finally, `repeat_ctr` overflows to zero as it reaches the input value `n_repeats`.
+When this overflow occurs, the FSM transitions to the `END` state.
+All counters halt, the `pda` data lines reset to zero, and an interrupt event is sent out to signal completion.
+
+The entire sequence can be restarted either by resetting or disabling and re-enabling the FSM.
+
+### Interrupts
+
+The pattern generator HWIP provides two interrupt pins, `done_ch0` and `done_ch1`, which indicate the completion of pattern generation on the output channels.
+These interrupts can be enabled/disabled by setting/un-setting the corresponding bits of the [`INTR_ENABLE`](../data/pattgen.hjson#intr_enable) register.
+To clear the interrupts, a `1` must be written to the corresponding bits of the [`INTR_STATE`](../data/pattgen.hjson#intr_state) register.
diff --git a/hw/ip/pinmux/README.md b/hw/ip/pinmux/README.md
index 9255304720113..5c7cef56fb450 100644
--- a/hw/ip/pinmux/README.md
+++ b/hw/ip/pinmux/README.md
@@ -27,362 +27,3 @@ For example, the sleep behavior of each pad can be programmed individually, and
 - Programmable sleep mode behavior
 
 - Support for life-cycle-based JTAG (TAP) isolation and muxing
-
-# Theory of Operations
-
-## Block Diagram and Overview
-
-The `pinmux` peripheral is a programmable module designed to wire arbitrary peripheral inputs and outputs to arbitrary multiplexable chip bidirectional pins.
-It gives much flexibility at the top level of the device, allowing most data pins to be flexibly wired and controlled by many peripherals.
-Even though the `pinmux` is referred to as one IP, it is logically split into two modules that are instantiated on the top-level and the chip-level, respectively, as can be seen in the block diagram below.
-The top-level module `pinmux` contains the CSRs accessible via the TL-UL interface, the main muxing matrix, retention registers, a set of programmable wakeup detectors, and the HW strap sampling and TAP / JTAG muxing logic.
-The chip-level module `padring` instantiates the bidirectional pads and connects the physical pad attributes.
-
-![Pinmux Block Diagram](./doc/pinmux_overview_block_diagram.svg)
-
-### MIO and DIO Signal Categories
-
-The `pinmux` supports two different IO signal categories:
-Muxed IO (MIO) signals that are routed through the `pinmux` matrix, and dedicated IO (DIO) signals that bypass the `pinmux` matrix.
-This distinction is useful for accommodating IO signals that are timing critical or that must have a fixed IO mapping for another reason.
-Note that although DIO signals are not routed through the `pinmux` matrix, they are still connected to the retention logic and the wakeup detectors (see next section below).
-
-The number of available peripheral IOs, pads, and their assignment to the MIO / DIO categories is done at design time as part of the top-level configuration.
-This configurability is achieved by representing inputs / outputs as packed arrays, in combination with the SystemVerilog parameters `NPeriphIn`, `NPeriphOut`, `NMioPads` and `NDioPads`.
-Note however that the register file is also affected by this configuration and needs to be regenerated for each design instance.
-
-It is assumed that all available pins that the `pinmux` connects to are bidirectional, controlled by logic within this module.
-By default, all muxed peripheral inputs are tied to zero.
-Further, all output enables are set to zero, which essentially causes all pads to be in high-Z state after reset.
-In addition to wiring programmability, each muxed peripheral input can be set constantly to 0 or 1, and each muxed chip output can be set constantly to 0, 1 or high-Z.
-
-See the [muxing matrix](#muxing-matrix) section for more details about the mux implementation.
-
-### Retention and Wakeup Features
-
-The retention logic allows SW to specify a certain behavior during sleep for each muxed and dedicated output.
-Legal behaviors are tie low, tie high, high-Z, keeping the previous state, or driving the current value (useful for peripherals that are always on).
-
-The wakeup detectors can detect patterns such as rising / falling edges and pulses of a certain width up to 255 AON clock cycles.
-Each wakeup detector can listen on any one of the MIO / DIO signals that are routed through the `pinmux`, and if a pattern is detected, the power manager is informed of that event via a wakeup request.
-
-The `pinmux` module itself is in the always-on (AON) power domain, and as such does not loose configuration state when a sleep power cycle is performed.
-However, only the wakeup detector logic will be actively clocked during sleep in order to save power.
-
-See the [retention logic](#retention-logic) and [wakeup detectors](#wakeup-detectors) sections for more details about the mux implementation.
-
-### Test and Debug Access
-
-The hardware strap sampling and TAP isolation logic provides test and debug access to the chip during specific life cycle states.
-This mechanism is explained in more detail in the [strap sampling and TAP isolation](#strap-sampling-and-tap-isolation) section.
-
-### Pad Attributes
-
-Additional pad-specific features such as inversion, pull-up, pull-down, virtual open-drain, drive-strength and input/output inversion etc. can be exercise via the pad attribute CSRs.
-The `pinmux` module supports a comprehensive set of such pad attributes, but it is permissible that some of them may not be supported by the underlying pad implementation.
-For example, certain ASIC libraries may not provide open-drain outputs, and FPGAs typically do not allow all of these attributes to be programmed dynamically at runtime.
-See the [generic pad wrapper](#generic-pad-wrapper) section below for more details.
-Note that static pad attributes for FPGAs are currently not covered in this specification.
-
-## Hardware Interfaces
-
-* [Interface Tables](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#interfaces)
-
-### Parameters
-
-The following table lists the main parameters used throughout the `pinmux` design.
-Note that the `pinmux` is generated based on the system configuration, and hence these parameters are placed into a package.
-The pinout and `pinmux` mappings are listed under [Pinout and Pinmux Mapping](#pinout-and-pinmux-mapping) for specific top-level configurations.
-
-Parameter      | Description
----------------|---------------
-`NPeriphOut`   | Number of peripheral outputs.
-`NPeriphIn`    | Number of peripheral input.
-`NMioPads`     | Number of muxed bidirectional pads.
-`NDioPads`     | Number of dedicated pads.
-
-### Primary IO Signals
-
-The table below lists the primary `pinmux` IO signals to/from the pad ring.
-The number of dedicated and muxed IOs is parametric, and hence the signals are stacked in packed arrays.
-
-Signal                                 | Direction | Type                               | Description
----------------------------------------|-----------|------------------------------------|---------------
-`periph_to_mio_i[NPeriphOut-1:0]`      | `input`   | packed `logic`                     | Signals from `NPeriphOut` muxed peripheral outputs coming into the `pinmux`.
-`periph_to_mio_oe_i[NPeriphOut-1:0]`   | `input`   | packed `logic`                     | Signals from `NPeriphOut` muxed peripheral output enables coming into the `pinmux`.
-`mio_to_periph_o[NPeriphIn-1:0]`       | `output`  | packed `logic`                     | Signals to `NPeriphIn` muxed peripherals coming from the `pinmux`.
-`periph_to_dio_i[NDioPads-1:0]`        | `input`   | packed `logic`                     | Signals from `NDioPads` dedicated peripheral outputs coming into the `pinmux`.
-`periph_to_dio_oe_i[NDioPads-1:0]`     | `input`   | packed `logic`                     | Signals from `NDioPads` dedicated peripheral output enables coming into the `pinmux`.
-`dio_to_periph_o[NDioPads-1:0]`        | `output`  | packed `logic`                     | Signals to `NDioPads` dedicated peripherals coming from the `pinmux`.
-`mio_attr_o[NMioPads-1:0]`             | `output`  | prim_pad_wrapper_pkg::pad_attr_t   | Packed array containing the pad attributes of all muxed IOs.
-`mio_out_o[NMioPads-1:0]`              | `output`  | packed `logic`                     | Signals to `NMioPads` bidirectional muxed pads as output data.
-`mio_oe_o[NMioPads-1:0]`               | `output`  | packed `logic`                     | Signals to `NMioPads` bidirectional muxed pads as output enables.
-`mio_in_i[NMioPads-1:0]`               | `input`   | packed `logic`                     | Signals from `NMioPads` bidirectional muxed pads as input data.
-`dio_attr_o[NDioPads-1:0]`             | `output`  | prim_pad_wrapper_pkg::pad_attr_t   | Packed array containing the pad attributes of all dedicated IOs.
-`dio_out_o[NDioPads-1:0]`              | `output`  | packed `logic`                     | Signals to `NDioPads` bidirectional dedicated pads as output data.
-`dio_oe_o[NDioPads-1:0]`               | `output`  | packed `logic`                     | Signals to `NDioPads` bidirectional dedicated pads as output enables.
-`dio_in_i[NDioPads-1:0]`               | `input`   | packed `logic`                     | Signals from `NDioPads` bidirectional dedicated pads as input data.
-
-
-## Muxing Matrix
-
-The diagram below shows connectivity between four arbitrary chip pins, named `MIO0` .. `MIO3`, and several muxed peripheral inputs and outputs.
-This shows the connectivity available in all directions, as well as the control registers described later in this document.
-Two example peripherals (`uart` and `spidev`) are attached to the `pinmux` in this example, one with one input and one output, the other with three inputs and one output.
-The diagram also shows the `padring` module which instantiates the bidirectional chip pads with output enable control.
-
-![Pinmux Block Diagram](./doc/pinmux_muxing_matrix.svg)
-
-Note that apart from selecting a specific input pad, the `periph_insel[*]` signals can also be used to tie the peripheral input to 0 or 1.
-Likewise, the output select signals `mio_outsel[*]` can also be used to constantly drive an output pin to 0/1 or to put it into high-Z state (default).
-The output enable and the associated data signal (i.e. `periph_to_mio` and `periph_to_mio_oe`) are indexed with the same select signal to allow the peripheral hardware to determine the pad direction instead of demoting that control to SW.
-
-## Retention Logic
-
-As illustrated in the picture above, all muxing matrix and DIO outputs are routed through the retention logic, which essentially consists of a set of multiplexors and two retention registers per output (one register is for the output data and one for the output enable).
-This multiplexor can be configured to be automatically activated upon sleep entry in order to either drive the output low, high, high-Z or to the last seen value (keep).
-If no sleep behavior is specified, the retention logic will continue to drive out the value coming from the peripheral side, which can be useful for peripherals that reside in the AON domain.
-
-The sleep behavior of all outputs is activated in parallel via a trigger signal asserted by the power manager.
-Once activated, it is the task of SW to disable the sleep behavior for each individual pin when waking up from sleep.
-This ensures that the output values remain stable until the system and its peripherals have been re-initialized.
-
-## Wakeup Detectors
-
-The `pinmux` contains eight programmable wakeup detector modules that can listen on any of the MIO or DIO pins.
-Each detector contains a debounce filter and an 8bit counter running on the AON clock domain.
-The detectors can be programmed via the [`WKUP_DETECTOR_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_0) and [`WKUP_DETECTOR_CNT_TH_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_cnt_th_0) registers to detect the following patterns:
-
-- rising edge
-- falling edge
-- rising or falling edge
-- positive pulse up to 255 AON clock cycles in length
-- negative pulse up to 255 AON clock cycles in length
-
-Note that for all patterns listed above, the input signal is sampled with the AON clock.
-This means that the input signal needs to remain stable for at least one AON clock cycle after a level change for the detector to recognize the event (depending on the debounce filter configuration, the signal needs to remain stable for multiple clock cycles).
-
-If a pattern is detected, the wakeup detector will send a wakeup request to the power manager, and the cause bit corresponding to that detector will be set in the [`WKUP_CAUSE`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) register.
-
-Note that the wkup detector should be disabled by setting [`WKUP_DETECTOR_EN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_en_0) before changing the detection mode.
-The reason for that is that the pulse width counter is NOT cleared upon a mode change while the detector is enabled.
-
-## Strap Sampling and TAP Isolation
-
-The `pinmux` contains a set of dedicated HW "straps", which are essentially signals that are multiplexed onto fixed MIO pad locations.
-Depending on the life cycle state, these straps are either continuously sampled, or latched right after POR.
-
-There are two groups of HW straps:
-1. Three DFT straps that determine the DFT mode.
-   These bits are output via the `dft_strap_test_o` signal such that they can be routed to the tool-inserted DFT controller.
-2. Two TAP selection straps for determining which TAP should be multiplexed onto the JTAG IOs.
-
-The conditions under which these two strap groups are sampled are listed in the tables below.
-Note that the HW straps can be used just like regular GPIOs once they have been sampled.
-
-Strap Group \ Life Cycle State  | TEST_UNLOCKED* | RMA          | DEV          | All Other States
---------------------------------|----------------|--------------|--------------|------------------
-DFT straps                      | Once at boot   | Once at boot | -            | -
-TAP strap 0                     | Continuously   | Continuously | Once at boot | Once at boot
-TAP strap 1                     | Continuously   | Continuously | Once at boot | -
-
-*Once at boot:* Sampled once after life cycle initialization (sampling event is initiated by pwrmgr).
-
-*Continuously:* Sampled continuously after life cycle initialization.
-
-The TAP muxing logic is further qualified by the life cycle state in order to isolate the TAPs in certain life cycle states.
-The following table lists the TAP strap encoding and the life cycle states in which the associated TAPs can be selected and accessed.
-
-TAP strap 1 | TAP strap 0  | Life Cycle State         | Selected TAP
-------------|--------------|--------------------------|---------------
-0           | 0            | All states               | -
-0           | 1            | All states               | Life Cycle
-1           | 0            | TEST_UNLOCKED*, RMA, DEV | RISC-V
-1           | 1            | TEST_UNLOCKED*, RMA      | DFT
-
-Note that the tool-inserted DFT controller may assert the `dft_hold_tap_sel_i` during a test (e.g. boundary scan) in which case the `pinmux` will temporarily pause sampling of the TAP selection straps.
-
-Also, it should be noted that the pad attributes of all JTAG IOs will be gated to all-zero temporarily, while the JTAG is enabled (this does not affect the values in the CSRs).
-This is to ensure that any functional attributes like inversion or pull-ups / pull-downs do not interfere with the JTAG while it is in use.
-
-For more information about the life cycle states, see [Life Cycle Controller Specification](../lc_ctrl/README.md) and the [Life Cycle Definition Table](../../../doc/security/specs/device_life_cycle/README.md#manufacturing-states).
-
-
-## Generic Pad Wrapper
-
-<center>
-<img src="generic_pad_wrapper.svg" width="50%">
-</center>
-
-The generic pad wrapper is intended to abstract away implementation differences between the target technologies by providing a generic interface that is compatible with the `padring` module.
-It is the task of the RTL build flow to select the appropriate pad wrapper implementation.
-
-A specific implementation of a pad wrapper may choose to instantiate a technology primitive (as it is common in ASIC flows), or it may choose to model the functionality behaviorally such that it can be inferred by the technology mapping tool (e.g., in the case of an FPGA target).
-It is permissible to omit the implementation of all IO attributes except input/output inversion.
-
-The generic pad wrapper must expose the following IOs and parameters, even if they are not connected internally.
-In particular, the pad attribute struct `attr_i` must contain all fields listed below, even if not all attributes are supported (it is permissible to just leave them unconnected in the pad wrapper implementation).
-
-Parameter      | Default    | Description
----------------|------------|-----------------------------------------------------
-`PadType`      | `BidirStd` | Pad variant to be instantiated (technology-specific)
-`ScanRole`     | `NoScan`   | Scan role, can be `NoScan`, `ScanIn` or `ScanOut`
-
-Note that `PadType` is a technology-specific parameter.
-The generic pad wrapper only implements variant `BidirStd`, but for other target technologies, this parameter can be used to select among a variety of different pad flavors.
-
-The `ScanRole` parameter determines the behavior when scanmode is enabled.
-Depending on whether a given pad acts as a scan input or output, certain pad attributes and functionalities need to be bypassed.
-This parameter is typically only relevant for ASIC targets and therefore not modeled in the generic pad model.
-
-Also note that the pad wrapper may implement a "virtual" open-drain termination, where standard bidirectional pads are employed, but instead of driving the output high for a logic 1 the pad is put into tristate mode.
-
-Signal               | Direction  | Type        | Description
----------------------|------------|-------------|-----------------------------------------------
-`clk_scan_i`         | `input`    | `logic`     | Scan clock of the pad
-`scanmode_i`         | `input`    | `logic`     | Scan mode enable of the pad
-`pok_i`              | `input`    | `pad_pok_t` | Technology-specific power sequencing signals
-`inout_io`           | `inout`    | `wire`      | Bidirectional inout of the pad
-`in_o`               | `output`   | `logic`     | Input data signal
-`in_raw_o`           | `output`   | `logic`     | Un-inverted input data signal
-`out_i`              | `input`    | `logic`     | Output data signal
-`oe_i`               | `input`    | `logic`     | Output data enable
-`attr_i[0]`          | `input`    | `logic`     | Input/output inversion
-`attr_i[1]`          | `input`    | `logic`     | Virtual open-drain enable
-`attr_i[2]`          | `input`    | `logic`     | Pull enable
-`attr_i[3]`          | `input`    | `logic`     | Pull select (0: pull-down, 1: pull-up)
-`attr_i[4]`          | `input`    | `logic`     | Keeper enable
-`attr_i[5]`          | `input`    | `logic`     | Schmitt trigger enable
-`attr_i[6]`          | `input`    | `logic`     | Open drain enable
-`attr_i[8:7]`        | `input`    | `logic`     | Slew rate (0x0: slowest, 0x3: fastest)
-`attr_i[12:9]`       | `input`    | `logic`     | Drive strength (0x0: weakest, 0xf: strongest)
-
-Note that the corresponding pad attribute registers [`MIO_PAD_ATTR_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_attr_0) and [`DIO_PAD_ATTR_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_attr_0) have "writes-any-reads-legal" (WARL) behavior (see also [pad attributes](#pad-attributes)).
-
-# Programmers Guide
-
-## Pad Attributes
-
-Software should determine and program the pad attributes at startup, or reprogram the attributes when the functionality requirements change at runtime.
-
-This can be achieved by writing to the [`MIO_PAD_ATTR_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_attr_0) and [`DIO_PAD_ATTR_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_attr_0) registers.
-Note that the IO attributes should be configured before enabling muxed IOs going through the `pinmux` matrix in order to avoid undesired electrical behavior and/or contention at the pads.
-
-The pad attributes configuration can be locked down individually for each pad via the [`MIO_PAD_ATTR_REGWEN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_attr_regwen_0) and [`DIO_PAD_ATTR_REGWEN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_attr_regwen_0) registers.
-The configuration can then not be altered anymore until the next system reset.
-
-The following pad attributes are supported by this register layout by default:
-
-ATTR Bits | Description                                   | Access
-----------|-----------------------------------------------|---------
-0         | Input/output inversion                        | WARL
-1         | Virtual open drain enable                     | WARL
-2         | Pull enable                                   | WARL
-3         | Pull select (0: down, 1: up)                  | WARL
-4         | Keeper enable                                 | WARL
-5         | Schmitt trigger enable                        | WARL
-6         | Open drain enable                             | WARL
-8:7       | Slew rate (0x0: slowest, 0x3: fastest)        | WARL
-12:9      | Drive strength (0x0: weakest, 0xf: strongest) | WARL
-
-Since some of the pad attributes may not be implemented, SW can probe this capability by writing the CSRs and read them back to determine whether the value was legal.
-This behavior is also referred to as "writes-any-reads-legal" or "WARL" in the RISC-V world.
-For example, certain pads may only support two drive-strength bits, instead of four.
-The unsupported drive-strength bits in the corresponding CSRs would then always read as zero, even if SW attempts to set them to 1.
-
-## Pinmux Configuration
-
-Upon POR, the `pinmux` state is such that all MIO outputs are high-Z, and all MIO peripheral inputs are tied off to 0.
-Software should determine and program the `pinmux` mapping at startup, or reprogram it when the functionality requirements change at runtime.
-This can be achieved by writing the following values to the [`PERIPH_INSEL_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#periph_insel_0) and [`MIO_OUTSEL_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_outsel_0) registers.
-
-`periph_insel` Value  | Selected Input Signal
-----------------------|-----------------------
-0                     | Constant zero (default)
-1                     | Constant one
-2 + k                 | Corresponding MIO input signal at index k
-
-The global default at reset is `0`, but the default of individual signals can be overridden at design time, if needed.
-
-`mio_outsel` Value    | Selected Output signal
-----------------------|-----------------------
-0                     | Constant zero (default)
-1                     | Constant one
-2                     | High-Z
-3 + k                 | Corresponding peripheral output signal at index k
-
-The global default at reset is `2`, but the default of individual signals can be overridden at design time, if needed.
-
-Note that the `pinmux` configuration should be sequenced after any IO attribute-specific configuration in the [`MIO_PAD_ATTR_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_attr_0) and [`DIO_PAD_ATTR_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_attr_0) registers to avoid any unwanted electric behavior and/or contention.
-If needed, each select signal can be individually locked down via [`MIO_PERIPH_INSEL_REGWEN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_periph_insel_regwen_0) or [`MIO_OUTSEL_REGWEN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_outsel_regwen_0).
-The configuration can then not be altered anymore until the next system reset.
-
-## Sleep Features
-
-The sleep behavior of each individual MIO or DIO can be defined via the ([`MIO_PAD_SLEEP_EN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_en_0), [`DIO_PAD_SLEEP_EN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_en_0), [`MIO_PAD_SLEEP_MODE_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_mode_0) and [`DIO_PAD_SLEEP_MODE_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_mode_0)) registers.
-Available sleep behaviors are:
-`dio/mio_pad_sleep_en` Value  | `dio/mio_pad_sleep_mode` Value | Sleep Behavior
-------------------------------|--------------------------------|-----------------------
-0                             | -                              | Drive (default)
-1                             | 0                              | Tie-low
-1                             | 1                              | Tie-high
-1                             | 2                              | High-Z
-1                             | 3                              | Keep last value
-
-Note that if the behavior is set to "Drive", the sleep mode will not be activated upon sleep entry.
-Rather, the retention logic continues to drive the value coming from the peripheral side.
-Also note that the sleep logic is located after the `pinmux` matrix, hence the sleep configuration is per MIO pad and not per MIO peripheral.
-
-Before sleep entry, SW should configure the appropriate sleep behavior of all MIOs/DIOs via [`MIO_PAD_SLEEP_MODE_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_mode_0), [`DIO_PAD_SLEEP_MODE_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_mode_0).
-This configuration can be optionally locked down, in which case it cannot be modified again until POR.
-The configured behavior is then activated for all pads that have sleep mode set to enabled ([`MIO_PAD_SLEEP_EN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_en_0) and [`DIO_PAD_SLEEP_EN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_en_0)) at once by the power manager during the sleep entry sequence.
-
-When exiting sleep, the task of disabling the sleep behavior is however up to SW.
-I.e., it must clear the per-pad sleep status bits in registers [`MIO_PAD_SLEEP_STATUS_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_status_0) and [`DIO_PAD_SLEEP_STATUS_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_status_0) that have been set upon sleep entry.
-The rationale for this is that it may not be desirable to disable sleep behavior on all pads at once due to some additional book keeping / re-initialization that needs to be performed while exiting sleep.
-
-## Wakeup Features
-
-The `pinmux` contains eight wakeup detectors.
-These detectors can be individually enabled and disabled regardless of the sleep state.
-This ensures that SW can set them up before and disable them after sleep in order to ensure that no events are missed during sleep entry and exit.
-
-For more information on the patterns supported by the wakeup detectors, see [wakeup detectors](#wakeup-detectors).
-
-A typical programming sequence for the wakeup detectors looks as follows:
-
-1. Before initiating any sleep mode, SW should configure the wakeup detectors appropriately and enable them via the [`WKUP_DETECTOR_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_0), [`WKUP_DETECTOR_CNT_TH_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_cnt_th_0) and [`WKUP_DETECTOR_PADSEL_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_padsel_0) registers.
-
-2. Optionally, lock the wakeup detector configuration via the [`WKUP_DETECTOR_REGWEN_0`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_regwen_0) registers.
-
-3. During sleep, the wakeup detectors will trigger a wakeup request if a matching pattern has been observed.
-   A bit corresponding to the wakeup detector that has observed the pattern will be set in the [`WKUP_CAUSE`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) register.
-
-4. When exiting sleep, SW should read the wake info register in the [power manager](../pwrmgr/README.md) to determine the reason(s) for the wakeup request.
-
-5. If the wakeup request was due to a pin wakeup pattern detector, SW should inspect the [`WKUP_CAUSE`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) registers in order to determine the exact cause.
-
-6. SW should in any case disable the wakeup detectors and clear the [`WKUP_CAUSE`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) registers once it is safe to do so (in order to not miss any events).
-   Note that the [`WKUP_CAUSE`](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) registers reside in the slow AON clock domain, and hence clearing them takes a few uS to take effect.
-   If needed, a SW readback can be performed to ensure that the clear operation has completed successfully.
-
-## Pinout and Pinmux Mapping
-
-The tables below summarize the pinout and pinmux connectivity for certain top-level designs.
-
-### Top Earlgrey
-
-{{#include ../../top_earlgrey/ip/pinmux/doc/autogen/targets.md}}
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_pinmux.h)
-
-## Register Table
-
-The register description below matches the instance in the [Earl Grey top level
-design](../../top_earlgrey/doc/specification.md).
-
-Similar register descriptions can be generated with different parameterizations.
-
-* [Register Table](../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#registers)
diff --git a/hw/ip/pinmux/doc/programmers_guide.md b/hw/ip/pinmux/doc/programmers_guide.md
new file mode 100644
index 0000000000000..dbba1ac757fac
--- /dev/null
+++ b/hw/ip/pinmux/doc/programmers_guide.md
@@ -0,0 +1,127 @@
+# Programmer's Guide
+
+## Pad Attributes
+
+Software should determine and program the pad attributes at startup, or reprogram the attributes when the functionality requirements change at runtime.
+
+This can be achieved by writing to the [`MIO_PAD_ATTR_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_attr_0) and [`DIO_PAD_ATTR_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_attr_0) registers.
+Note that the IO attributes should be configured before enabling muxed IOs going through the `pinmux` matrix in order to avoid undesired electrical behavior and/or contention at the pads.
+
+The pad attributes configuration can be locked down individually for each pad via the [`MIO_PAD_ATTR_REGWEN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_attr_regwen_0) and [`DIO_PAD_ATTR_REGWEN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_attr_regwen_0) registers.
+The configuration can then not be altered anymore until the next system reset.
+
+The following pad attributes are supported by this register layout by default:
+
+ATTR Bits | Description                                   | Access
+----------|-----------------------------------------------|---------
+0         | Input/output inversion                        | WARL
+1         | Virtual open drain enable                     | WARL
+2         | Pull enable                                   | WARL
+3         | Pull select (0: down, 1: up)                  | WARL
+4         | Keeper enable                                 | WARL
+5         | Schmitt trigger enable                        | WARL
+6         | Open drain enable                             | WARL
+8:7       | Slew rate (0x0: slowest, 0x3: fastest)        | WARL
+12:9      | Drive strength (0x0: weakest, 0xf: strongest) | WARL
+
+Since some of the pad attributes may not be implemented, SW can probe this capability by writing the CSRs and reading them back to determine whether the value was legal.
+This behavior is also referred to as "writes-any-reads-legal" or "WARL" in the RISC-V world.
+For example, certain pads may only support two drive-strength bits, instead of four.
+The unsupported drive-strength bits in the corresponding CSRs would then always read as zero, even if SW attempts to set them to 1.
+
+## Pinmux Configuration
+
+Upon POR, the `pinmux` state is such that all MIO outputs are high-Z, and all MIO peripheral inputs are tied off to 0.
+Software should determine and program the `pinmux` mapping at startup, or reprogram it when the functionality requirements change at runtime.
+This can be achieved by writing the following values to the [`PERIPH_INSEL_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#periph_insel_0) and [`MIO_OUTSEL_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_outsel_0) registers.
+
+`periph_insel` Value  | Selected Input Signal
+----------------------|-----------------------
+0                     | Constant zero (default)
+1                     | Constant one
+2 + k                 | Corresponding MIO input signal at index k
+
+The global default at reset is `0`, but the default of individual signals can be overridden at design time, if needed.
+
+`mio_outsel` Value    | Selected Output signal
+----------------------|-----------------------
+0                     | Constant zero (default)
+1                     | Constant one
+2                     | High-Z
+3 + k                 | Corresponding peripheral output signal at index k
+
+The global default at reset is `2`, but the default of individual signals can be overridden at design time, if needed.
+
+Note that the `pinmux` configuration should be sequenced after any IO attribute-specific configuration in the [`MIO_PAD_ATTR_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_attr_0) and [`DIO_PAD_ATTR_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_attr_0) registers to avoid any unwanted electric behavior and/or contention.
+If needed, each select signal can be individually locked down via [`MIO_PERIPH_INSEL_REGWEN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_periph_insel_regwen_0) or [`MIO_OUTSEL_REGWEN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_outsel_regwen_0).
+The configuration can then not be altered anymore until the next system reset.
+
+## Sleep Features
+
+The sleep behavior of each individual MIO or DIO can be defined via the ([`MIO_PAD_SLEEP_EN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_en_0), [`DIO_PAD_SLEEP_EN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_en_0), [`MIO_PAD_SLEEP_MODE_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_mode_0) and [`DIO_PAD_SLEEP_MODE_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_mode_0)) registers.
+Available sleep behaviors are:
+`dio/mio_pad_sleep_en` Value  | `dio/mio_pad_sleep_mode` Value | Sleep Behavior
+------------------------------|--------------------------------|-----------------------
+0                             | -                              | Drive (default)
+1                             | 0                              | Tie-low
+1                             | 1                              | Tie-high
+1                             | 2                              | High-Z
+1                             | 3                              | Keep last value
+
+Note that if the behavior is set to "Drive", the sleep mode will not be activated upon sleep entry.
+Rather, the retention logic continues to drive the value coming from the peripheral side.
+Also note that the sleep logic is located after the `pinmux` matrix, hence the sleep configuration is per MIO pad and not per MIO peripheral.
+
+Before sleep entry, SW should configure the appropriate sleep behavior of all MIOs/DIOs via [`MIO_PAD_SLEEP_MODE_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_mode_0), [`DIO_PAD_SLEEP_MODE_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_mode_0).
+This configuration can be optionally locked down, in which case it cannot be modified again until POR.
+The configured behavior is then activated for all pads that have sleep mode set to enabled ([`MIO_PAD_SLEEP_EN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_en_0) and [`DIO_PAD_SLEEP_EN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_en_0)) at once by the power manager during the sleep entry sequence.
+
+When exiting sleep, the task of disabling the sleep behavior is however up to SW.
+I.e., it must clear the per-pad sleep status bits in registers [`MIO_PAD_SLEEP_STATUS_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_sleep_status_0) and [`DIO_PAD_SLEEP_STATUS_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_sleep_status_0) that have been set upon sleep entry.
+The rationale for this is that it may not be desirable to disable sleep behavior on all pads at once due to some additional book keeping / re-initialization that needs to be performed while exiting sleep.
+
+## Wakeup Features
+
+The `pinmux` contains eight wakeup detectors.
+These detectors can be individually enabled and disabled regardless of the sleep state.
+This ensures that SW can set them up before and disable them after sleep in order to ensure that no events are missed during sleep entry and exit.
+
+For more information on the patterns supported by the wakeup detectors, see [wakeup detectors](theory_of_operation.md#wakeup-detectors).
+
+A typical programming sequence for the wakeup detectors looks as follows:
+
+1. Before initiating any sleep mode, SW should configure the wakeup detectors appropriately and enable them via the [`WKUP_DETECTOR_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_0), [`WKUP_DETECTOR_CNT_TH_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_cnt_th_0) and [`WKUP_DETECTOR_PADSEL_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_padsel_0) registers.
+
+2. Optionally, lock the wakeup detector configuration via the [`WKUP_DETECTOR_REGWEN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_regwen_0) registers.
+
+3. During sleep, the wakeup detectors will trigger a wakeup request if a matching pattern has been observed.
+   A bit corresponding to the wakeup detector that has observed the pattern will be set in the [`WKUP_CAUSE`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) register.
+
+4. When exiting sleep, SW should read the wake info register in the [power manager](../../pwrmgr/README.md) to determine the reason(s) for the wakeup request.
+
+5. If the wakeup request was due to a pin wakeup pattern detector, SW should inspect the [`WKUP_CAUSE`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) registers in order to determine the exact cause.
+
+6. SW should in any case disable the wakeup detectors and clear the [`WKUP_CAUSE`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) registers once it is safe to do so (in order to not miss any events).
+   Note that the [`WKUP_CAUSE`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) registers reside in the slow AON clock domain, and hence clearing them takes a few microseconds to take effect.
+   If needed, a SW readback can be performed to ensure that the clear operation has completed successfully.
+
+## Pinout and Pinmux Mapping
+
+The tables below summarize the pinout and pinmux connectivity for certain top-level designs.
+
+### Top Earlgrey
+
+{{#include ../../../top_earlgrey/ip/pinmux/doc/autogen/targets.md}}
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_pinmux.h)
+
+## Register Table
+
+The register description below matches the instance in the [Earl Grey top level
+design](../../../top_earlgrey/doc/specification.md).
+
+Similar register descriptions can be generated with different parameterizations.
+
+* [Register Table](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#registers)
diff --git a/hw/ip/pinmux/doc/theory_of_operation.md b/hw/ip/pinmux/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..ab5eecf79e35c
--- /dev/null
+++ b/hw/ip/pinmux/doc/theory_of_operation.md
@@ -0,0 +1,230 @@
+# Theory of Operation
+
+## Block Diagram and Overview
+
+The `pinmux` peripheral is a programmable module designed to wire arbitrary peripheral inputs and outputs to arbitrary multiplexable chip bidirectional pins.
+It gives much flexibility at the top level of the device, allowing most data pins to be flexibly wired and controlled by many peripherals.
+Even though the `pinmux` is referred to as one IP, it is logically split into two modules that are instantiated on the top-level and the chip-level, respectively, as can be seen in the block diagram below.
+The top-level module `pinmux` contains the CSRs accessible via the TL-UL interface, the main muxing matrix, retention registers, a set of programmable wakeup detectors, and the HW strap sampling and TAP / JTAG muxing logic.
+The chip-level module `padring` instantiates the bidirectional pads and connects the physical pad attributes.
+
+![Pinmux Block Diagram](../doc/pinmux_overview_block_diagram.svg)
+
+### MIO and DIO Signal Categories
+
+The `pinmux` supports two different IO signal categories:
+Muxed IO (MIO) signals that are routed through the `pinmux` matrix, and dedicated IO (DIO) signals that bypass the `pinmux` matrix.
+This distinction is useful for accommodating IO signals that are timing critical or that must have a fixed IO mapping for another reason.
+Note that although DIO signals are not routed through the `pinmux` matrix, they are still connected to the retention logic and the wakeup detectors (see next section below).
+
+The number of available peripheral IOs and pads, as well as their assignment to the MIO / DIO categories, is determined at design time as part of the top-level configuration.
+This configurability is achieved by representing inputs / outputs as packed arrays, in combination with the SystemVerilog parameters `NPeriphIn`, `NPeriphOut`, `NMioPads` and `NDioPads`.
+Note however that the register file is also affected by this configuration and needs to be regenerated for each design instance.
+
+It is assumed that all available pins that the `pinmux` connects to are bidirectional, controlled by logic within this module.
+By default, all muxed peripheral inputs are tied to zero.
+Further, all output enables are set to zero, which essentially causes all pads to be in high-Z state after reset.
+In addition to wiring programmability, each muxed peripheral input can be set constantly to 0 or 1, and each muxed chip output can be set constantly to 0, 1 or high-Z.
+
+See the [muxing matrix](#muxing-matrix) section for more details about the mux implementation.
+
+### Retention and Wakeup Features
+
+The retention logic allows SW to specify a certain behavior during sleep for each muxed and dedicated output.
+Legal behaviors are tie low, tie high, high-Z, keeping the previous state, or driving the current value (useful for peripherals that are always on).
+
+The wakeup detectors can detect patterns such as rising / falling edges and pulses of a certain width up to 255 AON clock cycles.
+Each wakeup detector can listen on any one of the MIO / DIO signals that are routed through the `pinmux`, and if a pattern is detected, the power manager is informed of that event via a wakeup request.
+
+The `pinmux` module itself is in the always-on (AON) power domain, and as such does not lose configuration state when a sleep power cycle is performed.
+However, only the wakeup detector logic will be actively clocked during sleep in order to save power.
+
+See the [retention logic](#retention-logic) and [wakeup detectors](#wakeup-detectors) sections for more details about the implementation of these features.
+
+### Test and Debug Access
+
+The hardware strap sampling and TAP isolation logic provides test and debug access to the chip during specific life cycle states.
+This mechanism is explained in more detail in the [strap sampling and TAP isolation](#strap-sampling-and-tap-isolation) section.
+
+### Pad Attributes
+
+Additional pad-specific features such as pull-up, pull-down, virtual open-drain, drive-strength and input/output inversion etc. can be exercised via the pad attribute CSRs.
+The `pinmux` module supports a comprehensive set of such pad attributes, but it is permissible that some of them may not be supported by the underlying pad implementation.
+For example, certain ASIC libraries may not provide open-drain outputs, and FPGAs typically do not allow all of these attributes to be programmed dynamically at runtime.
+See the [generic pad wrapper](#generic-pad-wrapper) section below for more details.
+Note that static pad attributes for FPGAs are currently not covered in this specification.
+
+## Hardware Interfaces
+
+* [Interface Tables](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#interfaces)
+
+### Parameters
+
+The following table lists the main parameters used throughout the `pinmux` design.
+Note that the `pinmux` is generated based on the system configuration, and hence these parameters are placed into a package.
+The pinout and `pinmux` mappings are listed under [Pinout and Pinmux Mapping](programmers_guide.md#pinout-and-pinmux-mapping) for specific top-level configurations.
+
+Parameter      | Description
+---------------|---------------
+`NPeriphOut`   | Number of peripheral outputs.
+`NPeriphIn`    | Number of peripheral inputs.
+`NMioPads`     | Number of muxed bidirectional pads.
+`NDioPads`     | Number of dedicated pads.
+
+### Primary IO Signals
+
+The table below lists the primary `pinmux` IO signals to/from the pad ring.
+The number of dedicated and muxed IOs is parametric, and hence the signals are stacked in packed arrays.
+
+Signal                                 | Direction | Type                               | Description
+---------------------------------------|-----------|------------------------------------|---------------
+`periph_to_mio_i[NPeriphOut-1:0]`      | `input`   | packed `logic`                     | Signals from `NPeriphOut` muxed peripheral outputs coming into the `pinmux`.
+`periph_to_mio_oe_i[NPeriphOut-1:0]`   | `input`   | packed `logic`                     | Signals from `NPeriphOut` muxed peripheral output enables coming into the `pinmux`.
+`mio_to_periph_o[NPeriphIn-1:0]`       | `output`  | packed `logic`                     | Signals to `NPeriphIn` muxed peripherals coming from the `pinmux`.
+`periph_to_dio_i[NDioPads-1:0]`        | `input`   | packed `logic`                     | Signals from `NDioPads` dedicated peripheral outputs coming into the `pinmux`.
+`periph_to_dio_oe_i[NDioPads-1:0]`     | `input`   | packed `logic`                     | Signals from `NDioPads` dedicated peripheral output enables coming into the `pinmux`.
+`dio_to_periph_o[NDioPads-1:0]`        | `output`  | packed `logic`                     | Signals to `NDioPads` dedicated peripherals coming from the `pinmux`.
+`mio_attr_o[NMioPads-1:0]`             | `output`  | prim_pad_wrapper_pkg::pad_attr_t   | Packed array containing the pad attributes of all muxed IOs.
+`mio_out_o[NMioPads-1:0]`              | `output`  | packed `logic`                     | Signals to `NMioPads` bidirectional muxed pads as output data.
+`mio_oe_o[NMioPads-1:0]`               | `output`  | packed `logic`                     | Signals to `NMioPads` bidirectional muxed pads as output enables.
+`mio_in_i[NMioPads-1:0]`               | `input`   | packed `logic`                     | Signals from `NMioPads` bidirectional muxed pads as input data.
+`dio_attr_o[NDioPads-1:0]`             | `output`  | prim_pad_wrapper_pkg::pad_attr_t   | Packed array containing the pad attributes of all dedicated IOs.
+`dio_out_o[NDioPads-1:0]`              | `output`  | packed `logic`                     | Signals to `NDioPads` bidirectional dedicated pads as output data.
+`dio_oe_o[NDioPads-1:0]`               | `output`  | packed `logic`                     | Signals to `NDioPads` bidirectional dedicated pads as output enables.
+`dio_in_i[NDioPads-1:0]`               | `input`   | packed `logic`                     | Signals from `NDioPads` bidirectional dedicated pads as input data.
+
+
+## Muxing Matrix
+
+The diagram below shows connectivity between four arbitrary chip pins, named `MIO0` .. `MIO3`, and several muxed peripheral inputs and outputs.
+This shows the connectivity available in all directions, as well as the control registers described later in this document.
+Two example peripherals (`uart` and `spidev`) are attached to the `pinmux` in this example, one with one input and one output, the other with three inputs and one output.
+The diagram also shows the `padring` module which instantiates the bidirectional chip pads with output enable control.
+
+![Pinmux Block Diagram](../doc/pinmux_muxing_matrix.svg)
+
+Note that apart from selecting a specific input pad, the `periph_insel[*]` signals can also be used to tie the peripheral input to 0 or 1.
+Likewise, the output select signals `mio_outsel[*]` can also be used to constantly drive an output pin to 0/1 or to put it into high-Z state (default).
+The data signal and the associated output enable (i.e. `periph_to_mio` and `periph_to_mio_oe`) are indexed with the same select signal to allow the peripheral hardware to determine the pad direction instead of delegating that control to SW.
+
+## Retention Logic
+
+As illustrated in the picture above, all muxing matrix and DIO outputs are routed through the retention logic, which essentially consists of a set of multiplexors and two retention registers per output (one register is for the output data and one for the output enable).
+This multiplexor can be configured to be automatically activated upon sleep entry in order to either drive the output low, high, high-Z or to the last seen value (keep).
+If no sleep behavior is specified, the retention logic will continue to drive out the value coming from the peripheral side, which can be useful for peripherals that reside in the AON domain.
+
+The sleep behavior of all outputs is activated in parallel via a trigger signal asserted by the power manager.
+Once activated, it is the task of SW to disable the sleep behavior for each individual pin when waking up from sleep.
+This ensures that the output values remain stable until the system and its peripherals have been re-initialized.
+
+## Wakeup Detectors
+
+The `pinmux` contains eight programmable wakeup detector modules that can listen on any of the MIO or DIO pins.
+Each detector contains a debounce filter and an 8-bit counter running on the AON clock domain.
+The detectors can be programmed via the [`WKUP_DETECTOR_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_0) and [`WKUP_DETECTOR_CNT_TH_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_cnt_th_0) registers to detect the following patterns:
+
+- rising edge
+- falling edge
+- rising or falling edge
+- positive pulse up to 255 AON clock cycles in length
+- negative pulse up to 255 AON clock cycles in length
+
+Note that for all patterns listed above, the input signal is sampled with the AON clock.
+This means that the input signal needs to remain stable for at least one AON clock cycle after a level change for the detector to recognize the event (depending on the debounce filter configuration, the signal needs to remain stable for multiple clock cycles).
+
+If a pattern is detected, the wakeup detector will send a wakeup request to the power manager, and the cause bit corresponding to that detector will be set in the [`WKUP_CAUSE`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_cause) register.
+
+Note that the wakeup detector should be disabled by clearing [`WKUP_DETECTOR_EN_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#wkup_detector_en_0) before changing the detection mode.
+The reason for that is that the pulse width counter is NOT cleared upon a mode change while the detector is enabled.
+
+## Strap Sampling and TAP Isolation
+
+The `pinmux` contains a set of dedicated HW "straps", which are essentially signals that are multiplexed onto fixed MIO pad locations.
+Depending on the life cycle state, these straps are either continuously sampled, or latched right after POR.
+
+There are two groups of HW straps:
+1. Three DFT straps that determine the DFT mode.
+   These bits are output via the `dft_strap_test_o` signal such that they can be routed to the tool-inserted DFT controller.
+2. Two TAP selection straps for determining which TAP should be multiplexed onto the JTAG IOs.
+
+The conditions under which these two strap groups are sampled are listed in the tables below.
+Note that the HW straps can be used just like regular GPIOs once they have been sampled.
+
+Strap Group \ Life Cycle State  | TEST_UNLOCKED* | RMA          | DEV          | All Other States
+--------------------------------|----------------|--------------|--------------|------------------
+DFT straps                      | Once at boot   | Once at boot | -            | -
+TAP strap 0                     | Continuously   | Continuously | Once at boot | Once at boot
+TAP strap 1                     | Continuously   | Continuously | Once at boot | -
+
+*Once at boot:* Sampled once after life cycle initialization (sampling event is initiated by pwrmgr).
+
+*Continuously:* Sampled continuously after life cycle initialization.
+
+The TAP muxing logic is further qualified by the life cycle state in order to isolate the TAPs in certain life cycle states.
+The following table lists the TAP strap encoding and the life cycle states in which the associated TAPs can be selected and accessed.
+
+TAP strap 1 | TAP strap 0  | Life Cycle State         | Selected TAP
+------------|--------------|--------------------------|---------------
+0           | 0            | All states               | -
+0           | 1            | All states               | Life Cycle
+1           | 0            | TEST_UNLOCKED*, RMA, DEV | RISC-V
+1           | 1            | TEST_UNLOCKED*, RMA      | DFT
+
+Note that the tool-inserted DFT controller may assert the `dft_hold_tap_sel_i` during a test (e.g. boundary scan) in which case the `pinmux` will temporarily pause sampling of the TAP selection straps.
+
+Also, it should be noted that the pad attributes of all JTAG IOs will be gated to all-zero temporarily, while the JTAG is enabled (this does not affect the values in the CSRs).
+This is to ensure that any functional attributes like inversion or pull-ups / pull-downs do not interfere with the JTAG while it is in use.
+
+For more information about the life cycle states, see [Life Cycle Controller Specification](../../lc_ctrl/README.md) and the [Life Cycle Definition Table](../../../../doc/security/specs/device_life_cycle/README.md#manufacturing-states).
+
+
+## Generic Pad Wrapper
+
+<center>
+<img src="generic_pad_wrapper.svg" width="50%">
+</center>
+
+The generic pad wrapper is intended to abstract away implementation differences between the target technologies by providing a generic interface that is compatible with the `padring` module.
+It is the task of the RTL build flow to select the appropriate pad wrapper implementation.
+
+A specific implementation of a pad wrapper may choose to instantiate a technology primitive (as is common in ASIC flows), or it may choose to model the functionality behaviorally such that it can be inferred by the technology mapping tool (e.g., in the case of an FPGA target).
+It is permissible to omit the implementation of all IO attributes except input/output inversion.
+
+The generic pad wrapper must expose the following IOs and parameters, even if they are not connected internally.
+In particular, the pad attribute struct `attr_i` must contain all fields listed below, even if not all attributes are supported (it is permissible to just leave them unconnected in the pad wrapper implementation).
+
+Parameter      | Default    | Description
+---------------|------------|-----------------------------------------------------
+`PadType`      | `BidirStd` | Pad variant to be instantiated (technology-specific)
+`ScanRole`     | `NoScan`   | Scan role, can be `NoScan`, `ScanIn` or `ScanOut`
+
+Note that `PadType` is a technology-specific parameter.
+The generic pad wrapper only implements variant `BidirStd`, but for other target technologies, this parameter can be used to select among a variety of different pad flavors.
+
+The `ScanRole` parameter determines the behavior when scanmode is enabled.
+Depending on whether a given pad acts as a scan input or output, certain pad attributes and functionalities need to be bypassed.
+This parameter is typically only relevant for ASIC targets and therefore not modeled in the generic pad model.
+
+Also note that the pad wrapper may implement a "virtual" open-drain termination, where standard bidirectional pads are employed, but instead of driving the output high for a logic 1 the pad is put into tristate mode.
+
+Signal               | Direction  | Type        | Description
+---------------------|------------|-------------|-----------------------------------------------
+`clk_scan_i`         | `input`    | `logic`     | Scan clock of the pad
+`scanmode_i`         | `input`    | `logic`     | Scan mode enable of the pad
+`pok_i`              | `input`    | `pad_pok_t` | Technology-specific power sequencing signals
+`inout_io`           | `inout`    | `wire`      | Bidirectional inout of the pad
+`in_o`               | `output`   | `logic`     | Input data signal
+`in_raw_o`           | `output`   | `logic`     | Un-inverted input data signal
+`out_i`              | `input`    | `logic`     | Output data signal
+`oe_i`               | `input`    | `logic`     | Output data enable
+`attr_i[0]`          | `input`    | `logic`     | Input/output inversion
+`attr_i[1]`          | `input`    | `logic`     | Virtual open-drain enable
+`attr_i[2]`          | `input`    | `logic`     | Pull enable
+`attr_i[3]`          | `input`    | `logic`     | Pull select (0: pull-down, 1: pull-up)
+`attr_i[4]`          | `input`    | `logic`     | Keeper enable
+`attr_i[5]`          | `input`    | `logic`     | Schmitt trigger enable
+`attr_i[6]`          | `input`    | `logic`     | Open-drain enable
+`attr_i[8:7]`        | `input`    | `logic`     | Slew rate (0x0: slowest, 0x3: fastest)
+`attr_i[12:9]`       | `input`    | `logic`     | Drive strength (0x0: weakest, 0xf: strongest)
+
+Note that the corresponding pad attribute registers [`MIO_PAD_ATTR_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#mio_pad_attr_0) and [`DIO_PAD_ATTR_0`](../../../top_earlgrey/ip/pinmux/data/autogen/pinmux.hjson#dio_pad_attr_0) have "writes-any-reads-legal" (WARL) behavior (see also [pad attributes](#pad-attributes)).
diff --git a/hw/ip/pwm/README.md b/hw/ip/pwm/README.md
index bd4779e917311..2b31de11b1b7f 100644
--- a/hw/ip/pwm/README.md
+++ b/hw/ip/pwm/README.md
@@ -68,255 +68,3 @@ This feature is enabled on a per-channel basis, and for each blinking channel, t
 As a variant of the blink feature, the output duty cycle can also be programmed to linearly increase and decrease in time.
 In this "heartbeat" mode, the duty cycle increments by a programmable amount after a programmable number of pulses, starting at some firmware-selected initial duty cycle.
 Once the internal duty cycle reaches the target value, the internal duty cycle begins to decrement until it returns to the initial value, at which point the cycle repeats until heartbeat mode is disabled.
-
-# Theory of Operations
-
-## Block Diagram
-
-![](./doc/pwm_block_diagram.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](data/pwm.hjson#interfaces)
-
-## Design Details
-
-### Phase and Duty Cycle Representation
-
-The PWM IP creates series of pulses with the desired on-off duty cycle.
-The duty cycle, DC, is typically expressed a fraction of pulse duration, <i>d</i>, over the period between pulses, <i>T</i>:
-
-$$DC\equiv d/T.$$
-
-Since 0&lt;<i>d</i>&lt;<i>T</i>, the duty cycle ranges from 0 to 1.
-
-The PWM IP can control the duty cycle in a number of ways:
-- The PWM can be programmed to generate pulses at a firmware-defined duty cycle.
-- The duty cycle can be programmed to toggle (or "blink") between two duty cycles at a programmable rate.
-In the tri-color LED use-case, this would have the visual effect of making the LED blink between two colors.
-- The duty cycle can linearly sweep in time, gradually shifting back-and-forth between two endpoints.
-
-Thus the duty cycle can be controlled by firmware, or may change under hardware control.
-The operation of each of these modes will be discussed later.
-
-Each channel can also be assigned a different phase delay.
-Like the duty cycle, this delay is expressed as a fraction of the pulse period, <i>T</i>.
-The phase delay of each channel is always directly controlled by a firmware register value.
-
-Since the phase and duty cycle are always a fraction less than or equal to one, the PWM IP represents them as 16-bit fixed point numbers, with an implicit 16-bit shift.
-If the duty cycle is internally represented as a 16-bit value x, the output pulse train will have the duty cycle:
-
-$$DC(x)=\frac{x}{2^{16}}.$$
-
-Thus the allowed duty cycle in principle ranges from 0 to 99.998% (i.e. <nobr>1-(&frac12;)<sup>16</sup></nobr>).
-
-However, the actual phase resolution may be smaller.
-In order to support faster pulse rates, the phase resolution can be set to less than 16-bits, in which case the observed duty cycle will be rounded down to the next lowest multiple of <nobr>2<sup>-([`CFG.DC_RESN`](data/pwm.hjson#cfg)+1)</sup></nobr>.
-In other words, the [`CFG.DC_RESN`](data/pwm.hjson#cfg) register effectively limits the duty cycle resolution, such that only the <nobr>[`CFG.DC_RESN`](data/pwm.hjson#cfg)+1</nobr> most significant bits are relevant:
-
-$$DC(x; \textrm{DC_RESN})=\frac{\textrm{MSB}(x; \textrm{DC_RESN}+1)}{2^{(\textrm{DC_RESN}+1)}},$$
-
-where here we use the notation MSB(<i>x</i>; <i>y</i>), to mean the <i>y</i> most significant bits of the binary value <i>x</i>.
-
-### PWM Phase Counter
-
-The IP maintains a single phase counter that is shared by all outputs.
-As we discuss in the next section, each channel has a comparator which compares these values to the current duty cycle and phase value and generates the appropriate pulse.
-Since all phase or duty cycle related quantities are represented as 16-bit fixed point fractions-regardless of whether they are calculated by the PWM IP or determined by firmware-the phase counter is also a 16-bit quantity.
-
-Each PWM pulse cycle is divided into <nobr>2<sup>DC_RESN+1</sup></nobr> beats.
-During each beat, the 16-bit phase counter increments by 2<sup>(16-DC_RESN-1)</sup> (modulo 65536).
-The beat period is defined by the [`CFG.CLK_DIV`](data/pwm.hjson#cfg) register:
-
-$$f_\textrm{beat}=\frac{f_\textrm{core clk}}{\textrm{CLK_DIV}+1}$$
-
-A PWM pulse cycle is completed each time the phase counter overflows to 0.
-The PWM drive frequency is therefore:
-$$f_\textrm{PWM}=f_\textrm{beat}\frac{2^{16-\textrm{DC_RESN}-1}}{2^{16}}=\frac{f_\textrm{core clk}}{2^{\textrm{DC_RESN}+1}(\textrm{CLK_DIV}+1)}$$
-
-The PWM phase counter is reset whenever [`CFG.CNTR_EN`](data/pwm.hjson#cfg) is disabled.
-
-The following figure illustrates the effect of the clock divider register.  Note that changes to [`CFG.CLK_DIV`](data/pwm.hjson#cfg) or [`CFG.DC_RESN`](data/pwm.hjson#cfg) only take effect when [`CFG.CNTR_EN`](data/pwm.hjson#cfg) is disabled.
-
-```wavejson
-{signal: [
-  {name: 'core_clk_i', wave: 'p..............|..........'},
-  {name: 'sync(CFG.CNTR_EN)', wave: '0.1............|01........'},
-  {name: 'sync(CFG.CLK_DIV)', wave: '2....4.........|..2.......', data: '2 4 4'},
-  {name: 'clk_div_int', wave: 'x..2...........|..2.......', data: '2 4'},
-  {name: 'sync(CFG.DC_RESN)', wave: '2..............|..........', data: '10d'},
-  {name: 'dc_resn_int', wave: 'x..2...........|..........', data: '10d'},
-  {name: 'phase_ctr', wave: '3..2..2..2..2..|3.2....2..', data: '0x00 0x00 0x20 0x40 0x80 0 0x00 0x20'},
-  {name: 'clk_div_ctr', wave: '2...22222222222|2..2222222', data: '0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 3 4 0 1 2'},
-  {name: 'beat_end', wave: '0....10.10.10.1|0.....10..'}],
-  config:{skin:'narrow'}
-}
-```
-
-### PWM Comparators and Pulse Generation.
-
-Whenever the phase counter loops back to zero, this marks the start of a new <i>pulse cycle</i>.
-This section describes how the comparator creates pulses with the correct duty cycle and phase.
-
-In the following sections, this document describes the various per-channel configuration options of this IP.
-For concreteness, the text discusses the operation of channel 0, using registers and fields ending with "_0".
-To operate other channels, simply choose the registers with the appropriate channel suffix.
-
-Clearing [`PWM_EN.EN_0`](data/pwm.hjson#pwm_en) disables the channel, suppressing all output pulses.
-
-The pulse phase delay is always programmed by firmware into the TL-UL register [`PWM_PARAM_0.PHASE_DELAY_0`](data/pwm.hjson#pwm_param_0).
-The duty cycle however comes from the blink control hardware (which is described in the next section).
-The current duty cycle is stored in a channel-specific signal register, `duty_cycle`.
-
-When operating at full resolution (i.e. `DC_RESN`==15), the channel output rises when the phase counter equals [`PWM_PARAM_0.PHASE_DELAY_0`](data/pwm.hjson#pwm_param_0), and falls when the phase counter equals [`PWM_PARAM_0.PHASE_DELAY_0`](data/pwm.hjson#pwm_param_0) + `duty_cycle` (mod 2<sup>(`DC_RESN`+1)</sup>).
-In both cases, the transition occurs at the beginning of the beat.
-When operating at lower resolution the same comparison applies, but using only the most significant (`DC_RESN`+1) bits.
-
-If the combination of phase delay and duty cycle is larger than one pulse cycle, the pulse will start in one pulse cycle and end in the next.
-In this case the comparator output will be high at the beginning of each cycle, as seen in the example waveform below.
-
-By default the pulses are all active-high, meaning the output is low if a PWM channel is disabled.
-However, to support various drive schemes, the polarity can be inverted on a channel-by-channel basis using the [`INVERT`](data/pwm.hjson#invert) register.
-
-The following figure illustrates the effect of the [`PWM_PARAM_0.PHASE_DELAY_0`](data/pwm.hjson#pwm_param_0) register and `duty_cycle`.
-Note that this figure shows two channels, 0 and 1, where the second channel has a significant phase delay, such that the output pulse is high when `phase_ctr` overflows to zero.
-
-```wavejson
-{signal: [
-  {name: 'core_clk_i', wave: 'p.....|....|......'},
-  {name: 'CFG.CLK_DIV', wave: '2.....|....|......', data: '0'},
-  {name: 'sync(CFG.CNTR_EN)', wave: '01....|....|......'},
-  {name: 'CFG.DC_RESN', wave: '2.....|....|......', data: '3'},
-  {name: 'INVERT.INVERT_0', wave: '0.....|....|......'},
-  {name: 'phase_ctr[15:12]', wave: '3.2222|2222|222222', data: ['0', '0', '1', '2', '', '7', '8', '9', '10    ', '14', '15' ,'0', '1', '2', '3', '4']},
-  {name: 'cycle_end', wave: '0.....|....|.10...'},
-  {name: 'PHASE_DELAY_0[15:12]', wave: '2.....|....|......', data: '0'},
-  {name: 'pulse_gen[0].duty_cycle[15:12]', wave: '2.....|....|......', data: '9'},
-  {name: 'pwm_out_d[0]', wave: '0.1...|..0.|..1...'},
-  {name: 'pwm_out_q[0]', wave: '0..1..|...0|...1..'},
-  {name: 'PHASE_DELAY_1[15:12]', wave: '2.....|....|......', data: '15'},
-  {name: 'pulse_gen[1].duty_cycle[15:12]', wave: '2.....|....|......', data: '3'},
-  {name: 'pwm_out_d[1]', wave: '0.1.0.|....|.1..0.'},
-  {name: 'pwm_out_q[1]', wave: '0..1.0|....|..1..0'}
-  ],
- config:{skin:'narrow'}
-}
-```
-
-Changes to [`PWM_EN.EN_0`](data/pwm.hjson#pwm_en) bit have no effect on the *timing* of the pulses, as the `phase_ctr` is common to all channels.
-Enabling [`PWM_EN.EN_0`](data/pwm.hjson#pwm_en), or changing [`PWM_PARAM_0.PHASE_DELAY_0`](data/pwm.hjson#pwm_param_0) is acceptable while the PWM channel is enabled.
-Since these registers take effect immediately, the shape of the following pulse may be unpredictable if they are changed while [`CFG.CNTR_EN`](data/pwm.hjson#cfg) is active, though this glitch in a single pulse is likely not a problem for most applications.
-Changes to the duty cycle register [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) may also be effective immediately, but only *when blinking is disabled*.
-
-In the above waveform, the first beat (labeled "0") does not start for one clock after [`CFG.CNTR_EN`](data/pwm.hjson#cfg) is asserted.
-This delay is typical, and reflects the fact that it takes exactly one clock cycle for the phase counter to start (as seen in the previous waveform).
-
-There is a register `pwm_out` at the output pin, which adds an additional delay cycle before the output pin.
-Thus, in addition to delay of the clock domain crossing, there is in total a minimum two clock delay between the assertion of [`CFG.CNTR_EN`](data/pwm.hjson#cfg) and the rising edge of the first output pulse.
-
-### Hardware-Controlled Blink Features
-
-By default, the duty cycle of each channel is directly controlled by firmware, by writing the desired PWM duty cycle to the [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) register.
-
-There are two other modes which allow for programmably-timed duty cycle modulations, under hardware control.
-- In the standard blink mode the duty cycle toggles between two values, [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) and [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0).
-- In heartbeat mode, the duty cycle linearly transitions from [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) to [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0) and back, via a regularly-timed sequence of duty cycle increments or decrements.
-
-In both modes the timing and control of the blinking or transition is controlled by the register fields [`BLINK_PARAM_0.X_0`](data/pwm.hjson#blink_param_0) and [`BLINK_PARAM_0.Y_0`](data/pwm.hjson#blink_param_0).
-However in either mode, the interpretation of these fields is different.
-
-Note that changes to the [`BLINK_PARAM_0`](data/pwm.hjson#blink_param_0) register or to the register field [`PWM_PARAM_0.HTBT_EN_0`](data/pwm.hjson#pwm_param_0) only take effect when the [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0) is deasserted.
-Both of the blink modes make use of a 16-bit internal blink counter (one per channel).
-This counter is reset whenever [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0) is cleared.
-In other words, changing the blink behavior requires first halting the blink pattern, and the pattern starts from the beginning whenever the blink enable bit is reasserted.
-
-#### Standard Blink Mode
-
-To enter standard blink mode, assert [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0), while leaving [`PWM_PARAM_0.HTBT_EN_0`](data/pwm.hjson#pwm_param_0) deasserted.
-
-In standard blink mode, the duty cycle abruptly alternates between two values: [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) and [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0).
-The sequence starts with [`BLINK_PARAM_0.X_0`](data/pwm.hjson#blink_param_0)+1 pulses at [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0), followed by [`BLINK_PARAM_0.Y_0`](data/pwm.hjson#blink_param_0)+1 pulses at [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0), after which the cycle repeats until blink mode is disabled.
-
-Typically multiple channels need to be configured to blink synchronously, for example in the tri-color LED case.
-This can be achieved by first disabling the desired PWM outputs using the [`PWM_EN`](data/pwm.hjson#pwm_en) multi-register.
-Once the blink parameters have been configured for these channels, they can be simultaneously re-enabled using a single write to [`PWM_EN`](data/pwm.hjson#pwm_en).
-
-#### Heartbeat Mode
-
-To enter heartbeat mode, assert both [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0) and [`PWM_PARAM_0.HTBT_EN_0`](data/pwm.hjson#pwm_param_0).
-
-In heartbeat mode the duty cycle gradually transitions from [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) to [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0) and back in a series of small steps.
-
-An example of this process is illustrated in the following waveform.
-```wavejson
-{signal: [
-  {name: 'Pulse Cycle', wave: '2222222222222222222',
-   data: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]},
-  {name: 'PWM_PARAM.BLINK_EN', wave: '01.................'},
-  {name: 'PWM_PARAM.HTBT_EN', wave: '01.................'},
-  {name: 'DUTY_CYCLE.A', wave: '2..................', data: '3'},
-  {name: 'DUTY_CYCLE.B', wave: '2..................', data: '21'},
-  {name: 'BLINK_PARAM.X', wave: '2..................', data: ['1']},
-  {name: 'BLINK_PARAM.Y', wave: '2..................', data: ['4']},
-  {name: 'duty_cycle', wave: '4.2.2.2.2.2.2.2.2.2', data: [3, 3, 8, 13, 18, 23, 18, 13, 8, 3]}
-],
- config:{skin:'narrow'}
-}
-```
-
-The sequence starts with [`BLINK_PARAM_0.X_0`](data/pwm.hjson#blink_param_0)+1 pulses at [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0).
-The duty cycle then increases by [`BLINK_PARAM_0.Y_0`](data/pwm.hjson#blink_param_0)+1 units, and [`BLINK_PARAM_0.X_0`](data/pwm.hjson#blink_param_0)+1 more pulses are generated at the new duty cycle.
-The cycle repeats until the `duty cycle`&ge; [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0), at which point the cycle is reversed, decrementing with the same step-size and rate until the duty cycle once again returns to [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) and the whole process repeats.
-(This all assumes that [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0) &gt; [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0).
-If [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0) &lt; [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0), the cycle is similar but with all the signs reversed.
-For instance, the duty cycle is repeatedly <i>decremented</i> until reaching [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0).)
-
-In the heartbeat process, the duty cycle always starts at [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0), but it may slightly exceed [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0) on the last step if the step-size does not evenly divide the difference between duty cycles.
-
-The duty cycle is never allowed to overflow or underflow, even if [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0) is very close to the minimum or maximum 16-bit value.
-If needed, the most extreme value in the `duty_cycle` sequence is truncated to stay within the allowable 16-bit range.
-All other points in the heartbeat sequence are unaffected by this truncation.
-
-# Programmer's Guide
-
-To set the PWM Frequency for the entire IP:
-1. Clear [`CFG.CNTR_EN`](data/pwm.hjson#cfg)
-2. Select [`CFG.CLK_DIV`](data/pwm.hjson#cfg)
-3. Assert [`CFG.CNTR_EN`](data/pwm.hjson#cfg)
-
-To configure the fixed PWM duty cycle and for a particular output channel (for example channel 0):
-
-1. Disable blinking by clearing the [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0) bit.
-2. Set [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0)
-3. Optionally set [`PWM_PARAM_0.PHASE_DELAY_0`](data/pwm.hjson#pwm_param_0) to adjust the pulse phase.
-4. Optionally assert [`INVERT.INVERT_0`](data/pwm.hjson#invert) to flip the polarity.
-5. Set [`PWM_EN.EN_0`](data/pwm.hjson#pwm_en) to turn the channel on.
-
-These changes will take place immediately, regardless of whether the `phase_ctr` is currently in the middle of a pulse cycle.
-
-To activate simple blinking for channel 0:
-
-1. Set [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) and [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0) to establish the initial and target duty cycles.
-2. Clear the [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0) and [`PWM_PARAM_0.HTBT_EN_0`](data/pwm.hjson#pwm_param_0) bits.
-This step is necessary for changing the blink timing parameters
-3. Set  [`BLINK_PARAM_0.X_0`](data/pwm.hjson#blink_param_0) and [`BLINK_PARAM_0.Y_0`](data/pwm.hjson#blink_param_0) to set the number of pulse cycles respectively spent at duty cycle A and duty cycle B.
-4. Re-assert [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0).
-
-For synchronous blinking of a group of channels, first disable the desired channels using the [`PWM_EN`](data/pwm.hjson#pwm_en) register.
-Then after configuring the blink properties of the entire group, re-enable them with a single write to [`PWM_EN`](data/pwm.hjson#pwm_en).
-
-To activate heartbeat blinking for channel 0:
-1. Set [`DUTY_CYCLE_0.A_0`](data/pwm.hjson#duty_cycle_0) and [`DUTY_CYCLE_0.B_0`](data/pwm.hjson#duty_cycle_0) to establish the initial and target duty cycles.
-2. Clear the [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0) bit.
-This step is necessary for changing the blink timing parameters
-3. Set [`BLINK_PARAM_0.X_0`](data/pwm.hjson#blink_param_0) to the number of pulse cycles between duty cycle steps (i.e. increments or decrements).
-4. Set [`BLINK_PARAM_0.Y_0`](data/pwm.hjson#blink_param_0) to set the size of each step.
-5. In a single write, assert both [`PWM_PARAM_0.BLINK_EN_0`](data/pwm.hjson#pwm_param_0) and [`PWM_PARAM_0.HTBT_EN_0`](data/pwm.hjson#pwm_param_0)
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_pwm.h)
-
-## Register Table
-
-* [Register Table](data/pwm.hjson#registers)
diff --git a/hw/ip/pwm/doc/programmers_guide.md b/hw/ip/pwm/doc/programmers_guide.md
new file mode 100644
index 0000000000000..b1044c07370e8
--- /dev/null
+++ b/hw/ip/pwm/doc/programmers_guide.md
@@ -0,0 +1,43 @@
+# Programmer's Guide
+
+To set the PWM Frequency for the entire IP:
+1. Clear [`CFG.CNTR_EN`](../data/pwm.hjson#cfg)
+2. Select [`CFG.CLK_DIV`](../data/pwm.hjson#cfg)
+3. Assert [`CFG.CNTR_EN`](../data/pwm.hjson#cfg)
+
+To configure a fixed PWM duty cycle for a particular output channel (for example channel 0):
+
+1. Disable blinking by clearing the [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0) bit.
+2. Set [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0)
+3. Optionally set [`PWM_PARAM_0.PHASE_DELAY_0`](../data/pwm.hjson#pwm_param_0) to adjust the pulse phase.
+4. Optionally assert [`INVERT.INVERT_0`](../data/pwm.hjson#invert) to flip the polarity.
+5. Set [`PWM_EN.EN_0`](../data/pwm.hjson#pwm_en) to turn the channel on.
+
+These changes will take place immediately, regardless of whether the `phase_ctr` is currently in the middle of a pulse cycle.
+
+To activate simple blinking for channel 0:
+
+1. Set [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) and [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0) to establish the initial and target duty cycles.
+2. Clear the [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0) and [`PWM_PARAM_0.HTBT_EN_0`](../data/pwm.hjson#pwm_param_0) bits.
+This step is necessary for changing the blink timing parameters.
+3. Set [`BLINK_PARAM_0.X_0`](../data/pwm.hjson#blink_param_0) and [`BLINK_PARAM_0.Y_0`](../data/pwm.hjson#blink_param_0) to set the number of pulse cycles respectively spent at duty cycle A and duty cycle B.
+4. Re-assert [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0).
+
+For synchronous blinking of a group of channels, first disable the desired channels using the [`PWM_EN`](../data/pwm.hjson#pwm_en) register.
+Then after configuring the blink properties of the entire group, re-enable them with a single write to [`PWM_EN`](../data/pwm.hjson#pwm_en).
+
+To activate heartbeat blinking for channel 0:
+1. Set [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) and [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0) to establish the initial and target duty cycles.
+2. Clear the [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0) bit.
+This step is necessary for changing the blink timing parameters.
+3. Set [`BLINK_PARAM_0.X_0`](../data/pwm.hjson#blink_param_0) to the number of pulse cycles between duty cycle steps (i.e. increments or decrements).
+4. Set [`BLINK_PARAM_0.Y_0`](../data/pwm.hjson#blink_param_0) to set the size of each step.
+5. In a single write, assert both [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0) and [`PWM_PARAM_0.HTBT_EN_0`](../data/pwm.hjson#pwm_param_0).
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_pwm.h)
+
+## Register Table
+
+* [Register Table](../data/pwm.hjson#registers)
diff --git a/hw/ip/pwm/doc/theory_of_operation.md b/hw/ip/pwm/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..064ec6ec57fbb
--- /dev/null
+++ b/hw/ip/pwm/doc/theory_of_operation.md
@@ -0,0 +1,207 @@
+# Theory of Operation
+
+## Block Diagram
+
+![](../doc/pwm_block_diagram.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/pwm.hjson#interfaces)
+
+## Design Details
+
+### Phase and Duty Cycle Representation
+
+The PWM IP creates a series of pulses with the desired on-off duty cycle.
+The duty cycle, DC, is typically expressed as a fraction of pulse duration, <i>d</i>, over the period between pulses, <i>T</i>:
+
+$$DC\equiv d/T.$$
+
+Since 0&lt;<i>d</i>&lt;<i>T</i>, the duty cycle ranges from 0 to 1.
+
+The PWM IP can control the duty cycle in a number of ways:
+- The PWM can be programmed to generate pulses at a firmware-defined duty cycle.
+- The duty cycle can be programmed to toggle (or "blink") between two duty cycles at a programmable rate.
+In the tri-color LED use-case, this would have the visual effect of making the LED blink between two colors.
+- The duty cycle can linearly sweep in time, gradually shifting back-and-forth between two endpoints.
+
+Thus the duty cycle can be controlled by firmware, or may change under hardware control.
+The operation of each of these modes will be discussed later.
+
+Each channel can also be assigned a different phase delay.
+Like the duty cycle, this delay is expressed as a fraction of the pulse period, <i>T</i>.
+The phase delay of each channel is always directly controlled by a firmware register value.
+
+Since the phase and duty cycle are always a fraction less than or equal to one, the PWM IP represents them as 16-bit fixed point numbers, with an implicit 16-bit shift.
+If the duty cycle is internally represented as a 16-bit value x, the output pulse train will have the duty cycle:
+
+$$DC(x)=\frac{x}{2^{16}}.$$
+
+Thus the allowed duty cycle in principle ranges from 0 to 99.998% (i.e. <nobr>1-(&frac12;)<sup>16</sup></nobr>).
+
+However, the actual phase resolution may be smaller.
+In order to support faster pulse rates, the phase resolution can be set to less than 16-bits, in which case the observed duty cycle will be rounded down to the next lowest multiple of <nobr>2<sup>-([`CFG.DC_RESN`](../data/pwm.hjson#cfg)+1)</sup></nobr>.
+In other words, the [`CFG.DC_RESN`](../data/pwm.hjson#cfg) register effectively limits the duty cycle resolution, such that only the <nobr>[`CFG.DC_RESN`](../data/pwm.hjson#cfg)+1</nobr> most significant bits are relevant:
+
+$$DC(x; \textrm{DC_RESN})=\frac{\textrm{MSB}(x; \textrm{DC_RESN}+1)}{2^{(\textrm{DC_RESN}+1)}},$$
+
+where here we use the notation MSB(<i>x</i>; <i>y</i>), to mean the <i>y</i> most significant bits of the binary value <i>x</i>.
+
+### PWM Phase Counter
+
+The IP maintains a single phase counter that is shared by all outputs.
+As we discuss in the next section, each channel has a comparator which compares the phase counter value to the channel's current duty cycle and phase delay and generates the appropriate pulse.
+Since all phase or duty cycle related quantities are represented as 16-bit fixed point fractions&mdash;regardless of whether they are calculated by the PWM IP or determined by firmware&mdash;the phase counter is also a 16-bit quantity.
+
+Each PWM pulse cycle is divided into <nobr>2<sup>DC_RESN+1</sup></nobr> beats.
+During each beat, the 16-bit phase counter increments by 2<sup>(16-DC_RESN-1)</sup> (modulo 65536).
+The beat frequency is determined by the [`CFG.CLK_DIV`](../data/pwm.hjson#cfg) register:
+
+$$f_\textrm{beat}=\frac{f_\textrm{core clk}}{\textrm{CLK_DIV}+1}$$
+
+A PWM pulse cycle is completed each time the phase counter overflows to 0.
+The PWM drive frequency is therefore:
+$$f_\textrm{PWM}=f_\textrm{beat}\frac{2^{16-\textrm{DC_RESN}-1}}{2^{16}}=\frac{f_\textrm{core clk}}{2^{\textrm{DC_RESN}+1}(\textrm{CLK_DIV}+1)}$$
+
+The PWM phase counter is reset whenever [`CFG.CNTR_EN`](../data/pwm.hjson#cfg) is disabled.
+
+The following figure illustrates the effect of the clock divider register.  Note that changes to [`CFG.CLK_DIV`](../data/pwm.hjson#cfg) or [`CFG.DC_RESN`](../data/pwm.hjson#cfg) only take effect when [`CFG.CNTR_EN`](../data/pwm.hjson#cfg) is disabled.
+
+```wavejson
+{signal: [
+  {name: 'core_clk_i', wave: 'p..............|..........'},
+  {name: 'sync(CFG.CNTR_EN)', wave: '0.1............|01........'},
+  {name: 'sync(CFG.CLK_DIV)', wave: '2....4.........|..2.......', data: '2 4 4'},
+  {name: 'clk_div_int', wave: 'x..2...........|..2.......', data: '2 4'},
+  {name: 'sync(CFG.DC_RESN)', wave: '2..............|..........', data: '10d'},
+  {name: 'dc_resn_int', wave: 'x..2...........|..........', data: '10d'},
+  {name: 'phase_ctr', wave: '3..2..2..2..2..|3.2....2..', data: '0x00 0x00 0x20 0x40 0x80 0 0x00 0x20'},
+  {name: 'clk_div_ctr', wave: '2...22222222222|2..2222222', data: '0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 3 4 0 1 2'},
+  {name: 'beat_end', wave: '0....10.10.10.1|0.....10..'}],
+  config:{skin:'narrow'}
+}
+```
+
+### PWM Comparators and Pulse Generation
+
+Whenever the phase counter loops back to zero, this marks the start of a new <i>pulse cycle</i>.
+This section describes how the comparator creates pulses with the correct duty cycle and phase.
+
+In the following sections, this document describes the various per-channel configuration options of this IP.
+For concreteness, the text discusses the operation of channel 0, using registers and fields ending with "_0".
+To operate other channels, simply choose the registers with the appropriate channel suffix.
+
+Clearing [`PWM_EN.EN_0`](../data/pwm.hjson#pwm_en) disables the channel, suppressing all output pulses.
+
+The pulse phase delay is always programmed by firmware into the TL-UL register [`PWM_PARAM_0.PHASE_DELAY_0`](../data/pwm.hjson#pwm_param_0).
+The duty cycle however comes from the blink control hardware (which is described in the next section).
+The current duty cycle is stored in a channel-specific signal register, `duty_cycle`.
+
+When operating at full resolution (i.e. `DC_RESN`==15), the channel output rises when the phase counter equals [`PWM_PARAM_0.PHASE_DELAY_0`](../data/pwm.hjson#pwm_param_0), and falls when the phase counter equals [`PWM_PARAM_0.PHASE_DELAY_0`](../data/pwm.hjson#pwm_param_0) + `duty_cycle` (mod 2<sup>(`DC_RESN`+1)</sup>).
+In both cases, the transition occurs at the beginning of the beat.
+When operating at lower resolution the same comparison applies, but using only the most significant (`DC_RESN`+1) bits.
+
+If the combination of phase delay and duty cycle is larger than one pulse cycle, the pulse will start in one pulse cycle and end in the next.
+In this case the comparator output will be high at the beginning of each cycle, as seen in the example waveform below.
+
+By default the pulses are all active-high, meaning the output is low if a PWM channel is disabled.
+However, to support various drive schemes, the polarity can be inverted on a channel-by-channel basis using the [`INVERT`](../data/pwm.hjson#invert) register.
+
+The following figure illustrates the effect of the [`PWM_PARAM_0.PHASE_DELAY_0`](../data/pwm.hjson#pwm_param_0) register and `duty_cycle`.
+Note that this figure shows two channels, 0 and 1, where the second channel has a significant phase delay, such that the output pulse is high when `phase_ctr` overflows to zero.
+
+```wavejson
+{signal: [
+  {name: 'core_clk_i', wave: 'p.....|....|......'},
+  {name: 'CFG.CLK_DIV', wave: '2.....|....|......', data: '0'},
+  {name: 'sync(CFG.CNTR_EN)', wave: '01....|....|......'},
+  {name: 'CFG.DC_RESN', wave: '2.....|....|......', data: '3'},
+  {name: 'INVERT.INVERT_0', wave: '0.....|....|......'},
+  {name: 'phase_ctr[15:12]', wave: '3.2222|2222|222222', data: ['0', '0', '1', '2', '', '7', '8', '9', '10    ', '14', '15' ,'0', '1', '2', '3', '4']},
+  {name: 'cycle_end', wave: '0.....|....|.10...'},
+  {name: 'PHASE_DELAY_0[15:12]', wave: '2.....|....|......', data: '0'},
+  {name: 'pulse_gen[0].duty_cycle[15:12]', wave: '2.....|....|......', data: '9'},
+  {name: 'pwm_out_d[0]', wave: '0.1...|..0.|..1...'},
+  {name: 'pwm_out_q[0]', wave: '0..1..|...0|...1..'},
+  {name: 'PHASE_DELAY_1[15:12]', wave: '2.....|....|......', data: '15'},
+  {name: 'pulse_gen[1].duty_cycle[15:12]', wave: '2.....|....|......', data: '3'},
+  {name: 'pwm_out_d[1]', wave: '0.1.0.|....|.1..0.'},
+  {name: 'pwm_out_q[1]', wave: '0..1.0|....|..1..0'}
+  ],
+ config:{skin:'narrow'}
+}
+```
+
+Changes to the [`PWM_EN.EN_0`](../data/pwm.hjson#pwm_en) bit have no effect on the *timing* of the pulses, as the `phase_ctr` is common to all channels.
+Enabling [`PWM_EN.EN_0`](../data/pwm.hjson#pwm_en), or changing [`PWM_PARAM_0.PHASE_DELAY_0`](../data/pwm.hjson#pwm_param_0) is acceptable while the PWM channel is enabled.
+Since these registers take effect immediately, the shape of the following pulse may be unpredictable if they are changed while [`CFG.CNTR_EN`](../data/pwm.hjson#cfg) is active, though this glitch in a single pulse is likely not a problem for most applications.
+Changes to the duty cycle register [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) may also be effective immediately, but only *when blinking is disabled*.
+
+In the above waveform, the first beat (labeled "0") does not start for one clock after [`CFG.CNTR_EN`](../data/pwm.hjson#cfg) is asserted.
+This delay is typical, and reflects the fact that it takes exactly one clock cycle for the phase counter to start (as seen in the previous waveform).
+
+There is a register `pwm_out` at the output pin, which adds an additional delay cycle before the output pin.
+Thus, in addition to the delay of the clock domain crossing, there is in total a minimum two-clock delay between the assertion of [`CFG.CNTR_EN`](../data/pwm.hjson#cfg) and the rising edge of the first output pulse.
+
+### Hardware-Controlled Blink Features
+
+By default, the duty cycle of each channel is directly controlled by firmware, by writing the desired PWM duty cycle to the [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) register.
+
+There are two other modes which allow for programmably-timed duty cycle modulations, under hardware control.
+- In the standard blink mode the duty cycle toggles between two values, [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) and [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0).
+- In heartbeat mode, the duty cycle linearly transitions from [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) to [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0) and back, via a regularly-timed sequence of duty cycle increments or decrements.
+
+In both modes the timing and control of the blinking or transition is controlled by the register fields [`BLINK_PARAM_0.X_0`](../data/pwm.hjson#blink_param_0) and [`BLINK_PARAM_0.Y_0`](../data/pwm.hjson#blink_param_0).
+However, the interpretation of these fields differs between the two modes.
+
+Note that changes to the [`BLINK_PARAM_0`](../data/pwm.hjson#blink_param_0) register or to the register field [`PWM_PARAM_0.HTBT_EN_0`](../data/pwm.hjson#pwm_param_0) only take effect when the [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0) bit is deasserted.
+Both of the blink modes make use of a 16-bit internal blink counter (one per channel).
+This counter is reset whenever [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0) is cleared.
+In other words, changing the blink behavior requires first halting the blink pattern, and the pattern starts from the beginning whenever the blink enable bit is reasserted.
+
+#### Standard Blink Mode
+
+To enter standard blink mode, assert [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0), while leaving [`PWM_PARAM_0.HTBT_EN_0`](../data/pwm.hjson#pwm_param_0) deasserted.
+
+In standard blink mode, the duty cycle abruptly alternates between two values: [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) and [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0).
+The sequence starts with [`BLINK_PARAM_0.X_0`](../data/pwm.hjson#blink_param_0)+1 pulses at [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0), followed by [`BLINK_PARAM_0.Y_0`](../data/pwm.hjson#blink_param_0)+1 pulses at [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0), after which the cycle repeats until blink mode is disabled.
+
+Typically multiple channels need to be configured to blink synchronously, for example in the tri-color LED case.
+This can be achieved by first disabling the desired PWM outputs using the [`PWM_EN`](../data/pwm.hjson#pwm_en) multi-register.
+Once the blink parameters have been configured for these channels, they can be simultaneously re-enabled using a single write to [`PWM_EN`](../data/pwm.hjson#pwm_en).
+
+#### Heartbeat Mode
+
+To enter heartbeat mode, assert both [`PWM_PARAM_0.BLINK_EN_0`](../data/pwm.hjson#pwm_param_0) and [`PWM_PARAM_0.HTBT_EN_0`](../data/pwm.hjson#pwm_param_0).
+
+In heartbeat mode the duty cycle gradually transitions from [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) to [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0) and back in a series of small steps.
+
+An example of this process is illustrated in the following waveform.
+```wavejson
+{signal: [
+  {name: 'Pulse Cycle', wave: '2222222222222222222',
+   data: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]},
+  {name: 'PWM_PARAM.BLINK_EN', wave: '01.................'},
+  {name: 'PWM_PARAM.HTBT_EN', wave: '01.................'},
+  {name: 'DUTY_CYCLE.A', wave: '2..................', data: '3'},
+  {name: 'DUTY_CYCLE.B', wave: '2..................', data: '21'},
+  {name: 'BLINK_PARAM.X', wave: '2..................', data: ['1']},
+  {name: 'BLINK_PARAM.Y', wave: '2..................', data: ['4']},
+  {name: 'duty_cycle', wave: '4.2.2.2.2.2.2.2.2.2', data: [3, 3, 8, 13, 18, 23, 18, 13, 8, 3]}
+],
+ config:{skin:'narrow'}
+}
+```
+
+The sequence starts with [`BLINK_PARAM_0.X_0`](../data/pwm.hjson#blink_param_0)+1 pulses at [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0).
+The duty cycle then increases by [`BLINK_PARAM_0.Y_0`](../data/pwm.hjson#blink_param_0)+1 units, and [`BLINK_PARAM_0.X_0`](../data/pwm.hjson#blink_param_0)+1 more pulses are generated at the new duty cycle.
+The cycle repeats until `duty_cycle` &ge; [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0), at which point the cycle is reversed, decrementing with the same step-size and rate until the duty cycle once again returns to [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0) and the whole process repeats.
+(This all assumes that [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0) &gt; [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0).
+If [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0) &lt; [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0), the cycle is similar but with all the signs reversed.
+For instance, the duty cycle is repeatedly <i>decremented</i> until reaching [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0).)
+
+In the heartbeat process, the duty cycle always starts at [`DUTY_CYCLE_0.A_0`](../data/pwm.hjson#duty_cycle_0), but it may slightly exceed [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0) on the last step if the step-size does not evenly divide the difference between duty cycles.
+
+The duty cycle is never allowed to overflow or underflow, even if [`DUTY_CYCLE_0.B_0`](../data/pwm.hjson#duty_cycle_0) is very close to the minimum or maximum 16-bit value.
+If needed, the most extreme value in the `duty_cycle` sequence is truncated to stay within the allowable 16-bit range.
+All other points in the heartbeat sequence are unaffected by this truncation.
diff --git a/hw/ip/pwrmgr/README.md b/hw/ip/pwrmgr/README.md
index 4aebd4126cf77..6cb52a38ea685 100644
--- a/hw/ip/pwrmgr/README.md
+++ b/hw/ip/pwrmgr/README.md
@@ -31,384 +31,3 @@ The power manager sequences the design from a freshly reset state to an active s
 Reset scenarios refer to non-POR events that cause the device to reboot.
 There are various stimuli that can cause such a reset, ranging from external user input to watchdog timeout.
 The power manager processes the reset request and brings the device to an appropriate state.
-
-
-# Theory Of Operation
-
-The power manager performs the following functions:
-- Turn on/off power domain(s).
-- Control root resets with the reset manager.
-- Control root clock enables with AST and clock manager.
-- Sequence various power up activities such as OTP sensing, life cycle initiation and releasing software to execute.
-
-
-## Block Diagram
-
-See the below high level block diagram that illustrates the connections between the power manager and various system components.
-Blocks outlined with a solid magenta line are always on; while blocks outlined with a dashed magenta line are a mix of components that are and those that are not.
-
-![Power Manager Connectivity Diagram](./doc/pwrmgr_connectivity.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](../../top_earlgrey/ip/pwrmgr/data/autogen/pwrmgr.hjson#interfaces)
-
-## Overall Sequencing
-
-The power manager contains two state machines.
-One operates on the always-on slow clock (this clock is always running and usually measured in KHz) and is responsible for turning faster clocks on and off and managing the power domains.
-The other operates on a normal fixed clock (usually measured in MHz) and is responsible for everything else in the power sequence.
-
-The following diagram breaks down the general functionality of both.
-The state machines are colored based on their clock domains.
-The green state machine is clocked by the normal fixed domain, while the orange state machine is clocked by the slow domain.
-Specific request / acknowledge signals are also highlighted in this color scheme to show where the two state machines communicate.
-
-![Power Manager FSMs](./doc/pwrmgr_fsms.svg)
-
-
-Note, most of the states are transitional states, and only the following state combinations are resting states.
-
-
-*   Slow FSM `Idle` and fast FSM `Active`
-*   Slow FSM `Low Power` and fast FSM `Low Power`
-
-The slow FSM `Low Power` and fast FSM `Active` states specifically are concepts useful when examining [reset handling](#reset-request-handling).
-
-
-## Slow Clock Domain FSM
-
-The slow clock domain FSM (referred to as the slow FSM from here on) resets to the Reset state.
-This state is released by `por_rst_n`, which is supplied from the reset controller.
-The `por_rst_n` signal is released when the reset controller detects the root power domains (`vcaon_pok` from AST) of the system are ready.
-Please see the [ast](../../top_earlgrey/ip/ast/README.md) for more details.
-
-The slow FSM requests the AST to power up the main domain and high speed clocks.
-Once those steps are done, it requests the [fast FSM](#fast-clock-domain-fsm) to begin operation.
-The slow FSM also handles power isolation controls as part of this process.
-
-Once the fast FSM acknowledges the power-up completion, the slow FSM transitions to `Idle` and waits for a power down request.
-When a power down request is received, the slow FSM turns off AST clocks and power as directed by software configuration.
-This means the clocks and power are not always turned off, but are rather controlled by software configurations in [`CONTROL`](../pwm/data/pwm.hjson#control) prior to low power entry .
-Once these steps are complete, the slow FSM transitions to a low power state and awaits a wake request, which can come either as an actual wakeup, or a reset event (for example always on watchdog expiration).
-
-#### Sparse FSM
-
-Since the slow FSM is sparsely encoded, it is possible for the FSM to end up in an undefined state if attacked.
-When this occurs, the slow FSM sends an `invalid` indication to the fast FSM and forcibly powers off and clamps everything.
-
-The clocks are kept on however to allow the fast FSM to operate if it is able to receive the `invalid` indication.
-The slow FSM does not recover from this state until the system is reset by POR.
-
-Unlike [escalation resets](#escalation-reset-request), the system does not self reset.
-Instead the system goes into a terminal non-responsive state where a user or host must directly intervene by toggling the power or asserting an external reset input.
-
-## Fast Clock Domain FSM
-
-The fast clock domain FSM (referred to as fast FSM from here on) resets to `Low Power` state and waits for a power-up request from the slow FSM.
-
-Once received, the fast FSM releases the life cycle reset stage (see [reset controller]({{< relref "hw/ip/rstmgr/doc" >}}) for more details).
-This allows the [OTP](../otp_ctrl/README.md) to begin sensing.
-Once OTP sensing completes , the life cycle controller is initialized.
-The initialization of the life cycle controller puts the device into its allowed operating state (see [life cycle controller](../lc_ctrl/README.md) for more details).
-
-Once life cycle initialization is done, the fast FSM enables all second level clock gating (see [clock controller](../clkmgr/README.md) for more details) and initiates strap sampling.
-For more details on what exactly the strap samples, please see [here](https://docs.google.com/spreadsheets/d/1pH8T1MhQ7TXtP_bFNT85T9jSVIHlxHAfbMnPbsMdjc0/edit?usp=sharing).
-
-Once strap sampling is complete, the system is ready to begin normal operations (note `flash_ctrl` initialization is explicitly not done here, please see [sections below](#flash-handling) for more details).
-The fast FSM acknowledges the slow FSM (which made the original power up request) and releases the system reset stage - this enables the processor to begin operation.
-Afterwards, the fast FSM transitions to `Active` state and waits for a software low power entry request.
-
-A low power request is initiated by software through a combination of WFI and software low power hint in [`CONTROL`](../pwm/data/pwm.hjson#control).
-Specifically, this means if software issues only WFI, the power manager does not treat it as a power down request.
-The notion of WFI is exported from the processor.
-For Ibex, this is currently in the form of `core_sleeping_o`.
-
-In response to the low power entry request, the fast FSM disables all second level clock gating.
-Before proceeding, the fast FSM explicitly separates the handling between a normal low power entry and a [reset request](#reset-request-handlig).
-
-For low power entry, there are two cases, [fall through handling](#fall-through-handling) and [abort handling](#abort-handling).
-If none of these exception cases are matched for low power entry, the fast FSM then asserts appropriate resets as necessary and requests the slow FSM to take over.
-
-For reset requests, fall through and aborts are not checked and the system simply resets directly.
-Note in this scenario the slow FSM is not requested to take over.
-
-#### Sparse FSM
-
-Since the fast FSM is sparsely encoded, it is possible for the FSM to end up in an undefined state if attacked.
-When this occurs, the fast FSM forcibly disables all clocks and holds the system in reset.
-
-The fast FSM does not recover from this state until the system is reset by POR.
-
-
-### ROM Integrity Checks
-
-The power manager coordinates the [start up ROM check](../rom_ctrl/README.md#the-startup-rom-check) with `rom_ctrl`.
-
-After every reset, the power manager sends an indication to the `rom_ctrl` to begin performing integrity checks.
-When the `rom_ctrl` checks are finished, a `done` and `good` indication are sent back to the power manager.
-
-If the device is in life cycle test states (`TEST_UNLOCKED` or `RMA`), the `good` signal is ignored and the ROM contents are always allowed to execute.
-
-If the device is not in one of the test states, the `good` signal is used to determine ROM execution.
-If `good` is true, ROM execution is allowed.
-If `good` is false, ROM execution is disallowed.
-
-### Fall Through Handling
-
-A low power entry fall through occurs when some condition occurs that immediately de-assert the entry conditions right after the software requests it.
-
-This can happen if right after software asserts WFI, an interrupt is shown to the processor, thus breaking it out of its currently stopped state.
-Whether this type of fall through happens is highly dependent on how the system handles interrupts during low power entry - some systems may choose to completely silence any interrupt not related to wakeup, others may choose to leave them all enabled.
-The fall through handle is specifically catered to the latter category.
-
-For a normal low power entry, the fast FSM first checks that the low power entry conditions are still true.
-If the entry conditions are no longer true, the fast FSM "falls through" the entry handling and returns the system to active state, thus terminating the entry process.
-
-### Abort Handling
-
-If the entry conditions are still true, the fast FSM then checks there are no ongoing non-volatile activities from `otp_ctrl`, `lc_ctrl` and `flash_ctrl`.
-If any module is active, the fast FSM "aborts" entry handling and returns the system to active state, thus terminating the entry process.
-
-## Reset Request Handling
-
-There are 4 reset requests in the system
-- peripheral requested reset such as watchdog.
-- reset manager's software requested reset, which is functionally very similar to a peripheral requested reset.
-- power manager's internal reset request.
-- Non-debug module reset.
-
-Flash brownout is handled separately and described in [flash handling section](#flash-handling) below.
-
-Peripheral requested resets such as watchdog are handled directly by the power manager, while the non-debug module reset is handled by the reset controller.
-This separation is because the non-debug reset does not affect the life cycle controller, non-volatile storage controllers and alert states.
-There is thus no need to sequence its operation like the others.
-
-The power controller only observes reset requests in two states - the slow FSM `Low Power` state and the fast FSM `Active` state.
-When a reset request is received during slow FSM `Low Power` state, the system begins its usual power up sequence even if a wakeup has not been received.
-
-When a reset request is received during fast FSM `Active` state, the fast FSM asserts resets and transitions back to its `Low Power` state.
-The normal power-up process described [above](#fast-clock-domain-fsm) is then followed to release the resets.
-Note in this case, the slow FSM is "not activated" and remains in its `Idle` state.
-
-### Power Manager Internal Reset Requests
-
-In additional to external requests, the power manager maintains 2 internal reset requests:
-* Escalation reset request
-* Main power domain unstable reset request
-
-#### Escalation Reset Request
-
-Alert escalation resets in general behave similarly to peripheral requested resets.
-However, peripheral resets are always handled gracefully and follow the normal FSM transition.
-
-Alert escalations can happen at any time and do not always obey normal rules.
-As a result, upon alert escalation, the power manager makes a best case effort to transition directly into reset handling.
-
-This may not always be possible if the escalation happens while the FSM is in an invalid state.
-In this scenario, the pwrmgr keeps everything powered off and silenced and requests escalation handling if the system ever wakes up.
-
-#### Escalation Clock Timeout
-
-Under normal behavior, the power manager can receive escalation requests from the system and handle them [appropriately](#escalation-reset-request).
-However, if the escalation clock or reset are non-functional for any reason, the escalation request would not be serviced.
-
-To mitigate this, the power manager actively checks for escalation interface clock/reset timeout.
-This is done by a continuous request / acknowledge interface between the power manager's local clock/reset and the escalate network's clock/reset.
-
-If the request / acknowledge interface does not respond within 128 power manager clock cycles, the escalate domain is assumed to be off.
-When this happens, the power manager creates a local escalation request that behaves identically to the global escalation request.
-
-
-#### Main Power Unstable Reset Requests
-If the main power ever becomes unstable (the power okay indication is low even though it is powered on), the power manager requests an internal reset.
-This reset behaves similarly to the escalation reset and transitions directly into reset handling.
-
-Note that under normal low power conditions, the main power may be be turned off.
-As a result of this, the main power unstable checks are valid only during states that power should be on and stable.
-This includes any state where power manager has requested the power to be turned on.
-
-
-### Reset Requests Received During Other States
-
-All other states in the slow / fast FSM are considered transitional states.
-Resets are not observed in other states because the system will always be transitioning towards one of the steady states (the system is in the process of powering down or powering up).
-Once a steady state is reached, reset requests are then observed and processed.
-
-### Reset Recording
-
-There are three ways in which the device is reset:
-- Non-debug-module reset request
-- Low power entry (`sleep_req` in the state diagram)
-- Direct reset requests by peripherals or alert escalation
-
-The power manager does not handle the non-debug-module request (please see reset controller).
-For the remaining two reset causes, the power manager handles only 1 pathway at a time (see state diagrams).
-This means if reset request and low power entry collide, the power manager will handle them on a first come first served basis.
-When the handling of the first is completed, the power manager handles the second pending request if it is still present.
-
-This is done because low power resets and peripheral requested resets lead to different behaviors.
-When the power manager commits to handling a specific request, it informs the reset manager why it has reset the processor.
-
-For example, assume a low power entry request arrives slightly ahead of reset requests.
-The power manager will:
-- Transition the system into low power state.
-- Inform the reset manager to record "low power exit" as the reset reason.
-- Once in low state, transition the system to `Active` state by using the reset request as a wakeup indicator.
-- Inform the reset manager to also record the peripheral that requested reset.
-- Once in `Active` state, reset the system and begin normal power-up routines again.
-
-If reset requests arrive slightly ahead of a low power entry request, then power manager will:
-- Reset the system and begin normal power-up routines.
-- Inform the reset manager to record the peripheral that requested reset.
-- Once in `Active` state, if the low power entry request is still present, transition to low power state.
-  - Inform the reset manager to also record "low power exit" as the reset reason.
-- If the low power entry request was wiped out by reset, the system then stays in `Active` state and awaits software instructions.
-
-Ultimately when control is returned to software, it may see two reset reasons and must handle them accordingly.
-
-
-## Wakeup Recording
-
-Similar to [reset handling](#reset-request-handling), wakeup signals are only observed during slow FSM `Low Power`; however their recording is continuous until explicitly disabled by software.
-
-Wakeup recording begins when the fast FSM transitions out of `Active` state and continues until explicitly disabled by software.
-This ensures wakeup events are not missed until software has set up the appropriate peripherals.
-
-The software is also able to enable recording during `Active` state if it chooses to do so.  The recording enables are OR’d together for hardware purposes.
-
-
-## Flash Handling
-For the section below, flash macro refers to the proprietary flash storage supplied by a vendor.
-`flash_ctrl`, on the other hand, refers to the open source controller that manages access to the flash macro.
-
-### Power-Up Handling
-
-The [AST](../../top_earlgrey/ip/ast/README.md) automatically takes the flash macro out of power down state as part of the power manager's power up request.
-
-Once flash macro is powered up and ready, an indication is sent to the `flash_ctrl`.
-
-Once the boot ROM is allowed to execute, it is expected to further initialize the `flash_ctrl` and flash macro prior to using it.
-This involves the following steps:
-
-*   Poll `flash_ctrl` register to ensure flash macro has powered up and completed internal initialization.
-*   Initialize `flash_ctrl` seed reading and scrambling.
-
-### Power-Down Handling
-
-Before the device enters low power, the pwrmgr first checks to ensure there are no ongoing transactions to the flash macro.
-When the device enters deep sleep, the flash macro is automatically put into power down mode by the AST.
-The AST places the flash macro into power down through direct signaling between AST and flash macro, the pwrmgr is not directly involved.
-
-When the device exits low power state, it is the responsibility of the boot ROM to poll for flash macro and `flash_ctrl` power-up complete similar to the above section.
-
-### Flash Brownout Handling
-
-When the external supply of the device dips below a certain threshold during a non-volatile flash macro operation (program or erase), the flash macro requires the operation to terminate in a pre-defined manner.
-This sequence will be exclusively handled by the AST.
-
-The power manager is unaware of the difference between POR and flash brownout.
-Because of this, the software also cannot distinguish between these two reset causes.
-
-
-## Supported Low Power Modes
-
-This section details the various low power modes supported by OpenTitan.
-
-
-### Deep Sleep or Standby
-
-This is the lowest power mode of the device (outside of full power down or device held in reset).
-During this state:
-
-*   All clocks other than the always-on slow clock are turned off at the source.
-*   All non-always-on digital domains are powered off.
-*   I/O power domains may or may not be off.
-    *   The state of the IO power domain has no impact on the digital core’s power budget, e.g. the IO power being off does not cause the accompanying digital logic in pads or elsewhere to leak more.
-
-
-### Normal Sleep
-
-This is a fast low power mode of the device that trades-off power consumption for resume latency.
-During this state:
-
-*   All clocks other than the KHz slow clock are turned off at the source.
-*   All power domains are kept on for fast resume.
-*   Sensor countermeasures can be opportunistically on.
-*   I/O power domains may or may not be off.
-    *   The state of the IO power domain has no impact on the digital core’s power budget, e.g. the IO power being off does not cause the accompanying digital logic in pads or elsewhere to leak more.
-
-## Debug
-
-When performing TAP debug, it is important for the debugging software to prevent the system from going to low power.
-If the system enters low power during live debug, the debug session will be broken.
-There is currently no standardized way to do this, so it is up to the debugging agent to perform the correct steps.
-
-
-# Programmers Guide
-
-The process in which the power manager is used is highly dependent on the system's topology.
-The following proposes one method for how this can be done.
-
-Assume first the system has the power states described [above](#supported-low-power-modes).
-
-## Programmer Sequence for Entering Low Power
-
-1. Disable interrupts
-2. Enable desired wakeup and reset sources in [`WAKEUP_EN`](../pwm/data/pwm.hjson#wakeup_en) and [`RESET_EN`](../pwm/data/pwm.hjson#reset_en).
-3. Perform any system-specific low power entry steps, e.g.
-   - Interrupt checks (if something became pending prior to disable)
-4. Configure low power mode in [`CONTROL`](../pwm/data/pwm.hjson#control).
-5. Set low power hint in [`LOW_POWER_HINT`](../pwm/data/pwm.hjson#low_power_hint).
-6. Set and poll [`CFG_CDC_SYNC`](../pwm/data/pwm.hjson#cfg_cdc_sync) to ensure above settings propagate across clock domains.
-7. Execute wait-for-interrupt instruction on the processing host.
-
-### Possible Exits
-
-Once low power is initiated, the system may exit due to several reasons.
-1. Graceful low power exit - This exit occurs when some source in the system gracefully wakes up the power manager.
-2. System reset request - This exit occurs when either software or a peripheral requests the pwrmgr to reset the system.
-3. [Fall through exit](#fall-through-handling) - This exit occurs when an interrupt manages to break the wait-for-interrupt loop.
-4. [Aborted entry](#abort-handling) - This exit occurs when low power entry is attempted with an ongoing non-volatile transaction.
-
-In both fall through exit and aborted entry, the power manager does not actually enter low power.
-Instead the low power entry is interrupted and the system restored to active state.
-
-## Programmer Sequence for Exiting Low Power
-
-There are two separate cases for low power exit.
-One is exiting from deep sleep, and the other is exiting from normal sleep.
-
-### Exiting from Deep Sleep
-
-When exiting from deep sleep, the system begins execution in ROM.
-
-1. Complete normal preparation steps.
-2. Check reset cause in [rstmgr](../rstmgr/README.md)
-3. Re-enable modules that have powered down.
-4. Disable wakeup recording through [`WAKE_INFO_CAPTURE_DIS`](../pwm/data/pwm.hjson#wake_info_capture_dis).
-5. Check which source woke up the system through [`WAKE_INFO`](../pwm/data/pwm.hjson#wake_info).
-6. Take appropriate steps to handle the wake and resume normal operation.
-7. Once wake is handled, clear the wake indication in [`WAKE_INFO`](../pwm/data/pwm.hjson#wake_info).
-
-### Exiting from Normal Sleep
-
-The handling for fall-through and abort are similar to normal sleep exit.
-Since in these scenarios the system was not reset, software continues executing the instruction after the wait-for-interrupt invocation.
-
-1. Check exit condition to determine appropriate steps.
-2. Clear low power hints and configuration in [`CONTROL`](../pwm/data/pwm.hjson#control).
-3. Set and poll [`CFG_CDC_SYNC`](../pwm/data/pwm.hjson#cfg_cdc_sync) to ensure setting changes have propagated across clock boundaries.
-4. Disable wakeup sources and stop recording.
-5. Re-enable interrupts for normal operation and wakeup handling.
-6. Once wake is handled, clear the wake indication in [`WAKE_INFO`](../pwm/data/pwm.hjson#wake_info).
-
-For an in-depth discussion, please see [power management programmers model](https://docs.google.com/document/d/1w86rmvylJgZVmmQ6Q1YBcCp2VFctkQT3zJ408SJMLPE/edit?usp=sharing) for additional details.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_pwrmgr.h)
-
-## Register Table
-
-* [Register Table](../../top_earlgrey/ip/pwrmgr/data/autogen/pwrmgr.hjson#registers)
diff --git a/hw/ip/pwrmgr/doc/programmers_guide.md b/hw/ip/pwrmgr/doc/programmers_guide.md
new file mode 100644
index 0000000000000..d1df8f2b48ab3
--- /dev/null
+++ b/hw/ip/pwrmgr/doc/programmers_guide.md
@@ -0,0 +1,67 @@
+# Programmer's Guide
+
+The process in which the power manager is used is highly dependent on the system's topology.
+The following proposes one method for how this can be done.
+
+Assume first the system has the power states described in the [Theory of Operation](theory_of_operation.md#supported-low-power-modes).
+
+## Programmer Sequence for Entering Low Power
+
+1. Disable interrupts
+2. Enable desired wakeup and reset sources in [`WAKEUP_EN`](../../pwm/data/pwm.hjson#wakeup_en) and [`RESET_EN`](../../pwm/data/pwm.hjson#reset_en).
+3. Perform any system-specific low power entry steps, e.g.
+   - Interrupt checks (if something became pending prior to disable)
+4. Configure low power mode in [`CONTROL`](../../pwm/data/pwm.hjson#control).
+5. Set low power hint in [`LOW_POWER_HINT`](../../pwm/data/pwm.hjson#low_power_hint).
+6. Set and poll [`CFG_CDC_SYNC`](../../pwm/data/pwm.hjson#cfg_cdc_sync) to ensure above settings propagate across clock domains.
+7. Execute wait-for-interrupt instruction on the processing host.
+
+### Possible Exits
+
+Once low power is initiated, the system may exit due to several reasons.
+1. Graceful low power exit - This exit occurs when some source in the system gracefully wakes up the power manager.
+2. System reset request - This exit occurs when either software or a peripheral requests the pwrmgr to reset the system.
+3. [Fall through exit](theory_of_operation.md#fall-through-handling) - This exit occurs when an interrupt manages to break the wait-for-interrupt loop.
+4. [Aborted entry](theory_of_operation.md#abort-handling) - This exit occurs when low power entry is attempted with an ongoing non-volatile transaction.
+
+In both fall through exit and aborted entry, the power manager does not actually enter low power.
+Instead the low power entry is interrupted and the system restored to active state.
+
+## Programmer Sequence for Exiting Low Power
+
+There are two separate cases for low power exit.
+One is exiting from deep sleep, and the other is exiting from normal sleep.
+
+### Exiting from Deep Sleep
+
+When exiting from deep sleep, the system begins execution in ROM.
+
+1. Complete normal preparation steps.
+2. Check reset cause in [rstmgr](../../rstmgr/README.md)
+3. Re-enable modules that have powered down.
+4. Disable wakeup recording through [`WAKE_INFO_CAPTURE_DIS`](../../pwm/data/pwm.hjson#wake_info_capture_dis).
+5. Check which source woke up the system through [`WAKE_INFO`](../../pwm/data/pwm.hjson#wake_info).
+6. Take appropriate steps to handle the wake and resume normal operation.
+7. Once wake is handled, clear the wake indication in [`WAKE_INFO`](../../pwm/data/pwm.hjson#wake_info).
+
+### Exiting from Normal Sleep
+
+The handling for fall-through and abort is similar to normal sleep exit.
+Since in these scenarios the system was not reset, software continues executing the instruction after the wait-for-interrupt invocation.
+
+1. Check exit condition to determine appropriate steps.
+2. Clear low power hints and configuration in [`CONTROL`](../../pwm/data/pwm.hjson#control).
+3. Set and poll [`CFG_CDC_SYNC`](../../pwm/data/pwm.hjson#cfg_cdc_sync) to ensure setting changes have propagated across clock boundaries.
+4. Disable wakeup sources and stop recording.
+5. Re-enable interrupts for normal operation and wakeup handling.
+6. Once wake is handled, clear the wake indication in [`WAKE_INFO`](../../pwm/data/pwm.hjson#wake_info).
+
+For an in-depth discussion, please see the [power management programmers model](https://docs.google.com/document/d/1w86rmvylJgZVmmQ6Q1YBcCp2VFctkQT3zJ408SJMLPE/edit?usp=sharing).
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_pwrmgr.h)
+
+## Register Table
+
+* [Register Table](../../../top_earlgrey/ip/pwrmgr/data/autogen/pwrmgr.hjson#registers)
diff --git a/hw/ip/pwrmgr/doc/theory_of_operation.md b/hw/ip/pwrmgr/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..5b2c5b248f54c
--- /dev/null
+++ b/hw/ip/pwrmgr/doc/theory_of_operation.md
@@ -0,0 +1,310 @@
+# Theory of Operation
+
+The power manager performs the following functions:
+- Turn on/off power domain(s).
+- Control root resets with the reset manager.
+- Control root clock enables with AST and clock manager.
+- Sequence various power up activities such as OTP sensing, life cycle initiation and releasing software to execute.
+
+
+## Block Diagram
+
+See the below high level block diagram that illustrates the connections between the power manager and various system components.
+Blocks outlined with a solid magenta line are always on, while blocks outlined with a dashed magenta line contain a mix of always-on and non-always-on components.
+
+![Power Manager Connectivity Diagram](../doc/pwrmgr_connectivity.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../../../top_earlgrey/ip/pwrmgr/data/autogen/pwrmgr.hjson#interfaces)
+
+## Overall Sequencing
+
+The power manager contains two state machines.
+One operates on the always-on slow clock (this clock is always running and usually measured in KHz) and is responsible for turning faster clocks on and off and managing the power domains.
+The other operates on a normal fixed clock (usually measured in MHz) and is responsible for everything else in the power sequence.
+
+The following diagram breaks down the general functionality of both.
+The state machines are colored based on their clock domains.
+The green state machine is clocked by the normal fixed domain, while the orange state machine is clocked by the slow domain.
+Specific request / acknowledge signals are also highlighted in this color scheme to show where the two state machines communicate.
+
+![Power Manager FSMs](../doc/pwrmgr_fsms.svg)
+
+
+Note, most of the states are transitional states, and only the following state combinations are resting states.
+
+
+*   Slow FSM `Idle` and fast FSM `Active`
+*   Slow FSM `Low Power` and fast FSM `Low Power`
+
+The slow FSM `Low Power` and fast FSM `Active` states specifically are concepts useful when examining [reset handling](#reset-request-handling).
+
+
+## Slow Clock Domain FSM
+
+The slow clock domain FSM (referred to as the slow FSM from here on) resets to the Reset state.
+This state is released by `por_rst_n`, which is supplied from the reset controller.
+The `por_rst_n` signal is released when the reset controller detects the root power domains (`vcaon_pok` from AST) of the system are ready.
+Please see the [ast](../../../top_earlgrey/ip/ast/README.md) for more details.
+
+The slow FSM requests the AST to power up the main domain and high speed clocks.
+Once those steps are done, it requests the [fast FSM](#fast-clock-domain-fsm) to begin operation.
+The slow FSM also handles power isolation controls as part of this process.
+
+Once the fast FSM acknowledges the power-up completion, the slow FSM transitions to `Idle` and waits for a power down request.
+When a power down request is received, the slow FSM turns off AST clocks and power as directed by software configuration.
+This means the clocks and power are not always turned off, but are rather controlled by software configurations in [`CONTROL`](../../pwm/data/pwm.hjson#control) prior to low power entry.
+Once these steps are complete, the slow FSM transitions to a low power state and awaits a wake request, which can come either as an actual wakeup, or a reset event (for example always on watchdog expiration).
+
+#### Sparse FSM
+
+Since the slow FSM is sparsely encoded, it is possible for the FSM to end up in an undefined state if attacked.
+When this occurs, the slow FSM sends an `invalid` indication to the fast FSM and forcibly powers off and clamps everything.
+
+The clocks are kept on however to allow the fast FSM to operate if it is able to receive the `invalid` indication.
+The slow FSM does not recover from this state until the system is reset by POR.
+
+Unlike [escalation resets](#escalation-reset-request), the system does not self reset.
+Instead the system goes into a terminal non-responsive state where a user or host must directly intervene by toggling the power or asserting an external reset input.
+
+## Fast Clock Domain FSM
+
+The fast clock domain FSM (referred to as fast FSM from here on) resets to `Low Power` state and waits for a power-up request from the slow FSM.
+
+Once received, the fast FSM releases the life cycle reset stage (see [reset controller](../../rstmgr/README.md) for more details).
+This allows the [OTP](../../otp_ctrl/README.md) to begin sensing.
+Once OTP sensing completes, the life cycle controller is initialized.
+The initialization of the life cycle controller puts the device into its allowed operating state (see [life cycle controller](../../lc_ctrl/README.md) for more details).
+
+Once life cycle initialization is done, the fast FSM enables all second level clock gating (see [clock controller](../../clkmgr/README.md) for more details) and initiates strap sampling.
+For more details on what exactly the strap samples, please see [here](https://docs.google.com/spreadsheets/d/1pH8T1MhQ7TXtP_bFNT85T9jSVIHlxHAfbMnPbsMdjc0/edit?usp=sharing).
+
+Once strap sampling is complete, the system is ready to begin normal operations (note `flash_ctrl` initialization is explicitly not done here, please see [sections below](#flash-handling) for more details).
+The fast FSM acknowledges the slow FSM (which made the original power up request) and releases the system reset stage - this enables the processor to begin operation.
+Afterwards, the fast FSM transitions to `Active` state and waits for a software low power entry request.
+
+A low power request is initiated by software through a combination of WFI and software low power hint in [`CONTROL`](../../pwm/data/pwm.hjson#control).
+Specifically, this means if software issues only WFI, the power manager does not treat it as a power down request.
+The notion of WFI is exported from the processor.
+For Ibex, this is currently in the form of `core_sleeping_o`.
+
+In response to the low power entry request, the fast FSM disables all second level clock gating.
+Before proceeding, the fast FSM explicitly separates the handling between a normal low power entry and a [reset request](#reset-request-handling).
+
+For low power entry, there are two cases, [fall through handling](#fall-through-handling) and [abort handling](#abort-handling).
+If none of these exception cases are matched for low power entry, the fast FSM then asserts appropriate resets as necessary and requests the slow FSM to take over.
+
+For reset requests, fall through and aborts are not checked and the system simply resets directly.
+Note in this scenario the slow FSM is not requested to take over.
+
+#### Sparse FSM
+
+Since the fast FSM is sparsely encoded, it is possible for the FSM to end up in an undefined state if attacked.
+When this occurs, the fast FSM forcibly disables all clocks and holds the system in reset.
+
+The fast FSM does not recover from this state until the system is reset by POR.
+
+
+### ROM Integrity Checks
+
+The power manager coordinates the [start up ROM check](../../rom_ctrl/README.md#the-startup-rom-check) with `rom_ctrl`.
+
+After every reset, the power manager sends an indication to the `rom_ctrl` to begin performing integrity checks.
+When the `rom_ctrl` checks are finished, a `done` and `good` indication are sent back to the power manager.
+
+If the device is in life cycle test states (`TEST_UNLOCKED` or `RMA`), the `good` signal is ignored and the ROM contents are always allowed to execute.
+
+If the device is not in one of the test states, the `good` signal is used to determine ROM execution.
+If `good` is true, ROM execution is allowed.
+If `good` is false, ROM execution is disallowed.
+
+### Fall Through Handling
+
+A low power entry fall through occurs when some condition occurs that immediately de-asserts the entry conditions right after the software requests it.
+
+This can happen if right after software asserts WFI, an interrupt is shown to the processor, thus breaking it out of its currently stopped state.
+Whether this type of fall through happens is highly dependent on how the system handles interrupts during low power entry - some systems may choose to completely silence any interrupt not related to wakeup, others may choose to leave them all enabled.
+The fall through handle is specifically catered to the latter category.
+
+For a normal low power entry, the fast FSM first checks that the low power entry conditions are still true.
+If the entry conditions are no longer true, the fast FSM "falls through" the entry handling and returns the system to active state, thus terminating the entry process.
+
+### Abort Handling
+
+If the entry conditions are still true, the fast FSM then checks there are no ongoing non-volatile activities from `otp_ctrl`, `lc_ctrl` and `flash_ctrl`.
+If any module is active, the fast FSM "aborts" entry handling and returns the system to active state, thus terminating the entry process.
+
+## Reset Request Handling
+
+There are 4 reset requests in the system:
+- peripheral requested reset such as watchdog.
+- reset manager's software requested reset, which is functionally very similar to a peripheral requested reset.
+- power manager's internal reset request.
+- Non-debug module reset.
+
+Flash brownout is handled separately and described in [flash handling section](#flash-handling) below.
+
+Peripheral requested resets such as watchdog are handled directly by the power manager, while the non-debug module reset is handled by the reset controller.
+This separation is because the non-debug reset does not affect the life cycle controller, non-volatile storage controllers and alert states.
+There is thus no need to sequence its operation like the others.
+
+The power controller only observes reset requests in two states - the slow FSM `Low Power` state and the fast FSM `Active` state.
+When a reset request is received during slow FSM `Low Power` state, the system begins its usual power up sequence even if a wakeup has not been received.
+
+When a reset request is received during fast FSM `Active` state, the fast FSM asserts resets and transitions back to its `Low Power` state.
+The normal power-up process described [above](#fast-clock-domain-fsm) is then followed to release the resets.
+Note in this case, the slow FSM is "not activated" and remains in its `Idle` state.
+
+### Power Manager Internal Reset Requests
+
+In addition to external requests, the power manager maintains 2 internal reset requests:
+* Escalation reset request
+* Main power domain unstable reset request
+
+#### Escalation Reset Request
+
+Alert escalation resets in general behave similarly to peripheral requested resets.
+However, peripheral resets are always handled gracefully and follow the normal FSM transition.
+
+Alert escalations can happen at any time and do not always obey normal rules.
+As a result, upon alert escalation, the power manager makes a best case effort to transition directly into reset handling.
+
+This may not always be possible if the escalation happens while the FSM is in an invalid state.
+In this scenario, the pwrmgr keeps everything powered off and silenced and requests escalation handling if the system ever wakes up.
+
+#### Escalation Clock Timeout
+
+Under normal behavior, the power manager can receive escalation requests from the system and handle them [appropriately](#escalation-reset-request).
+However, if the escalation clock or reset are non-functional for any reason, the escalation request would not be serviced.
+
+To mitigate this, the power manager actively checks for escalation interface clock/reset timeout.
+This is done by a continuous request / acknowledge interface between the power manager's local clock/reset and the escalate network's clock/reset.
+
+If the request / acknowledge interface does not respond within 128 power manager clock cycles, the escalate domain is assumed to be off.
+When this happens, the power manager creates a local escalation request that behaves identically to the global escalation request.
+
+
+#### Main Power Unstable Reset Requests
+If the main power ever becomes unstable (the power okay indication is low even though it is powered on), the power manager requests an internal reset.
+This reset behaves similarly to the escalation reset and transitions directly into reset handling.
+
+Note that under normal low power conditions, the main power may be turned off.
+As a result of this, the main power unstable checks are valid only during states that power should be on and stable.
+This includes any state where power manager has requested the power to be turned on.
+
+
+### Reset Requests Received During Other States
+
+All other states in the slow / fast FSM are considered transitional states.
+Resets are not observed in other states because the system will always be transitioning towards one of the steady states (the system is in the process of powering down or powering up).
+Once a steady state is reached, reset requests are then observed and processed.
+
+### Reset Recording
+
+There are three ways in which the device is reset:
+- Non-debug-module reset request
+- Low power entry (`sleep_req` in the state diagram)
+- Direct reset requests by peripherals or alert escalation
+
+The power manager does not handle the non-debug-module request (please see reset controller).
+For the remaining two reset causes, the power manager handles only 1 pathway at a time (see state diagrams).
+This means if reset request and low power entry collide, the power manager will handle them on a first come first served basis.
+When the handling of the first is completed, the power manager handles the second pending request if it is still present.
+
+This is done because low power resets and peripheral requested resets lead to different behaviors.
+When the power manager commits to handling a specific request, it informs the reset manager why it has reset the processor.
+
+For example, assume a low power entry request arrives slightly ahead of reset requests.
+The power manager will:
+- Transition the system into low power state.
+- Inform the reset manager to record "low power exit" as the reset reason.
+- Once in low state, transition the system to `Active` state by using the reset request as a wakeup indicator.
+- Inform the reset manager to also record the peripheral that requested reset.
+- Once in `Active` state, reset the system and begin normal power-up routines again.
+
+If reset requests arrive slightly ahead of a low power entry request, then power manager will:
+- Reset the system and begin normal power-up routines.
+- Inform the reset manager to record the peripheral that requested reset.
+- Once in `Active` state, if the low power entry request is still present, transition to low power state.
+  - Inform the reset manager to also record "low power exit" as the reset reason.
+- If the low power entry request was wiped out by reset, the system then stays in `Active` state and awaits software instructions.
+
+Ultimately when control is returned to software, it may see two reset reasons and must handle them accordingly.
+
+
+## Wakeup Recording
+
+Similar to [reset handling](#reset-request-handling), wakeup signals are only observed during slow FSM `Low Power`; however their recording is continuous until explicitly disabled by software.
+
+Wakeup recording begins when the fast FSM transitions out of `Active` state and continues until explicitly disabled by software.
+This ensures wakeup events are not missed until software has set up the appropriate peripherals.
+
+The software is also able to enable recording during `Active` state if it chooses to do so.  The recording enables are OR’d together for hardware purposes.
+
+
+## Flash Handling
+For the section below, flash macro refers to the proprietary flash storage supplied by a vendor.
+`flash_ctrl`, on the other hand, refers to the open source controller that manages access to the flash macro.
+
+### Power-Up Handling
+
+The [AST](../../../top_earlgrey/ip/ast/README.md) automatically takes the flash macro out of power down state as part of the power manager's power up request.
+
+Once flash macro is powered up and ready, an indication is sent to the `flash_ctrl`.
+
+Once the boot ROM is allowed to execute, it is expected to further initialize the `flash_ctrl` and flash macro prior to using it.
+This involves the following steps:
+
+*   Poll `flash_ctrl` register to ensure flash macro has powered up and completed internal initialization.
+*   Initialize `flash_ctrl` seed reading and scrambling.
+
+### Power-Down Handling
+
+Before the device enters low power, the pwrmgr first checks to ensure there are no ongoing transactions to the flash macro.
+When the device enters deep sleep, the flash macro is automatically put into power down mode by the AST.
+The AST places the flash macro into power down through direct signaling between AST and flash macro, the pwrmgr is not directly involved.
+
+When the device exits low power state, it is the responsibility of the boot ROM to poll for flash macro and `flash_ctrl` power-up complete similar to the above section.
+
+### Flash Brownout Handling
+
+When the external supply of the device dips below a certain threshold during a non-volatile flash macro operation (program or erase), the flash macro requires the operation to terminate in a pre-defined manner.
+This sequence will be exclusively handled by the AST.
+
+The power manager is unaware of the difference between POR and flash brownout.
+Because of this, the software also cannot distinguish between these two reset causes.
+
+
+## Supported Low Power Modes
+
+This section details the various low power modes supported by OpenTitan.
+
+
+### Deep Sleep or Standby
+
+This is the lowest power mode of the device (outside of full power down or device held in reset).
+During this state:
+
+*   All clocks other than the always-on slow clock are turned off at the source.
+*   All non-always-on digital domains are powered off.
+*   I/O power domains may or may not be off.
+    *   The state of the IO power domain has no impact on the digital core’s power budget, e.g. the IO power being off does not cause the accompanying digital logic in pads or elsewhere to leak more.
+
+
+### Normal Sleep
+
+This is a fast low power mode of the device that trades-off power consumption for resume latency.
+During this state:
+
+*   All clocks other than the KHz slow clock are turned off at the source.
+*   All power domains are kept on for fast resume.
+*   Sensor countermeasures can be opportunistically on.
+*   I/O power domains may or may not be off.
+    *   The state of the IO power domain has no impact on the digital core’s power budget, e.g. the IO power being off does not cause the accompanying digital logic in pads or elsewhere to leak more.
+
+## Debug
+
+When performing TAP debug, it is important for the debugging software to prevent the system from going to low power.
+If the system enters low power during live debug, the debug session will be broken.
+There is currently no standardized way to do this, so it is up to the debugging agent to perform the correct steps.
diff --git a/hw/ip/rom_ctrl/README.md b/hw/ip/rom_ctrl/README.md
index b1c22a803d658..910ffd94c7631 100644
--- a/hw/ip/rom_ctrl/README.md
+++ b/hw/ip/rom_ctrl/README.md
@@ -19,225 +19,3 @@ This ROM checker is used to compute a cryptographic hash of the ROM contents jus
 - Logic for memory and address descrambling
 - Post-boot ROM integrity check
 - Alert trigger and status CSRs for ROM integrity errors or FSM glitches.
-
-# Theory of Operations
-
-## Block Diagram
-
-The image below shows a high-level block diagram of the module.
-Blue boxes are instantiations of generic primitives that are used elsewhere on the chip.
-Green boxes are simple operations; the meat of the design is in the grey boxes.
-
-The upper half of the diagram shows paths for ROM reads when the system is in normal operation.
-The lower half of the diagram shows the ROM checker.
-This is triggered by the power manager early in the chip boot sequence to check validity of the ROM image.
-It runs exactly once, and releases the green multiplexer when it is done.
-
-![ROM Controller Block Diagram](./doc/rom_ctrl_blockdiag.svg)
-
-## ROM access when chip is in operation
-
-Once the chip has booted, ROM accesses are requested over the system TL-UL bus.
-These come in through the TL-UL SRAM adapter (top-left of block diagram).
-In normal operation, the green multiplexer will give access to these TL reads.
-The address is scrambled at the first substitution-permutation network (marked S&P in the diagram).
-
-In parallel with the ROM access, a reduced `prim_prince` primitive (5 rounds with latency 1; equivalent to the cipher used for SRAM) computes a 39-bit truncated keystream for the block.
-On the following cycle, the scrambled data from ROM goes through a substitution-permutation network and is then XOR'd with the keystream.
-This scheme is the same as that used by the [SRAM controller](../sram_ctrl/README.md), but is much simplified because the ROM doesn't have to deal with writes, byte accesses or key changes.
-
-The output from the XOR is the unscrambled 32-bit data, plus seven ECC bits.
-This data is passed straight through the TL-UL SRAM adapter; the ECC bits are used as a signal integrity check by the system bus.
-
-The following diagram shows the timing of the different signals.
-The time from the `req` output from the `tlul_adapter_sram` to the response that appears on its `rvalid` input is one cycle.
-The "scrambling scheme" for addresses in the diagram is to reverse their digits.
-The word stored at address 21 in the ROM is denoted `w21`.
-The keystream value for address 12 is denoted `k12`.
-The unscrambled ROM data for (logical) address 12 is denoted `d12`.
-
-```wavejson
-{signal: [
-  {name: 'clk', wave: 'p....', period: 2},
-  {name: 'req', wave: '0.1...0...'},
-  {name: 'addr', wave: 'x.3.4.x...', data: ['12', '34']},
-  {name: 'scrambled addr', wave: 'x.3.4.x...', data: ['21', '43']},
-  {name: 'scrambled rdata + ecc', wave: 'x...3.4.x.', data: ['w21', 'w43']},
-  {name: 'keystream', wave: 'x...3.4.x.', data: ['k12', 'k34']},
-  {name: 'rdata + ecc', wave: 'x...3.4.x.', data: ['d12', 'd34']},
-  {name: 'rvalid', wave: '0...1...0.'},
-]}
-```
-
-The `prim_prince` primitive and the two substitution-permutation networks are all parameterised by "keys".
-For `rom_ctrl`, these keys are global randomised netlist constants: they are assumed to be difficult to recover, but aren't considered secret data.
-
-## The startup ROM check
-
-The ROM checker runs immediately after reset.
-Until it is done, it controls ROM address requests (through the green multiplexer).
-The select signal for this multiplexer has a redundant encoding to protect it against fault injection attacks.
-If the select signal has an invalid value, this will trigger a fatal alert.
-Before starting to read data, it starts a cSHAKE operation on the [KMAC](../kmac/README.md) module using one of its application interfaces.
-We expect to use the `cSHAKE256` algorithm, with prefix "ROM_CTRL".
-The [Application Interface](../kmac/README.md#application-interface) section of the KMAC documentation details the parameters used.
-
-The checker reads the ROM contents in address order, resulting in a scattered access pattern on the ROM itself because of the address scrambling.
-Each read produces 39 bits of data, which are padded with zeros to 64 bits to match the interface expected by the KMAC block.
-The checker FSM loops through almost all the words in ROM (from bottom to top), passing each to the KMAC block with the ready/valid interface and setting the `kmac_data_o.last` bit for the last word that is sent.
-Once the last word has been sent, the FSM releases the multiplexer; this now switches over permanently to allow access through the TL-UL SRAM adapter.
-
-The top eight words in ROM (by logical address) are interpreted as a 256-bit expected hash.
-Unlike the rest of ROM, their data is not stored scrambled, so the expected hash can be read directly.
-This is taken by the checker FSM (ignoring ECC bits) and will be compared with the digest that is read back from the KMAC block.
-
-Once it comes back, the digest is forwarded directly to the [Key Manager](../keymgr/README.md).
-It is also compared with the hash that was read from the top eight words of ROM.
-On a match, `pwrmgr_data_o.good` is signalled as `Mubi4True`.
-In either case, `pwrmgr_data_o.done` goes high when the calculation is complete.
-
-The diagram below shows the operation of the simple FSM.
-
-<div align="center">
-
-![ROM checker FSM Diagram](./doc/rom_check_fsm.svg)
-
-</div>
-
-## What does the ROM check do?
-
-One of the possible physical attacks on a system like OpenTitan is to subvert the ROM.
-The regular structure of a ROM is useful because it makes metal fixes easy, but (for the same reasons) it makes the ROM quite an easy target for an attacker.
-See \[SKO-05\][^SKO-05], section 2.1.1, for a description of ROMs and attacks on them.
-
-[^SKO-05]: **SKO-05**: Skorobogatov, [*Semi-Invasive Attacks - A New Approach to Hardware Security Analysis*](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-630.html), University of Cambridge Computer Laboratory Technical Report 630, 2005
-
-Since the code in ROM is the first thing to execute, an attacker that modifies it undetected can completely subvert the chain of trust.
-As such, OpenTitan needs some form of ROM integrity checking and the ROM checker is the module in charge of providing it.
-
-After bringing the ROM controller module out of reset, the power manager must wait until `pwrgr_data_o.done` is asserted before starting the host processor.
-The ROM controller also passes the `pwrmgr_data_o.good` signal.
-The power manager can use this to decide whether to boot (taking into account life cycle state).
-This provides an extra safety check, but the real security comes from key manager integration described below.
-
-The simple KMAC interface assumes that KMAC is pre-configured to run the cSHAKE algorithm with a prefix specific to the ROM checker.
-The ROM checker will not assert `kmac_data_o.valid` after finishing the one and only digest computation.
-The KMAC module may choose to add a check for this, to detect reset glitches affecting the `rom_ctrl` block.
-
-The integration with the key manager is based on forwarding the digest data in `kmac_data_i` as `keymgr_data_o.data`.
-This 256-bit digest will be incorporated into the [`CreatorRootKey`](../../../doc/security/specs/identities_and_root_keys/README.md#creator-root-key).
-The key manager should only allow one transaction (of 256 bits / 32 bits = 8 beats) after reset to pass this information across.
-On future messages, it should raise an alert, defeating an attacker that tries to trigger extra transactions before or after the real one.
-
-`CreatorRootKey` forms the first key in the chain described in Identities and Root Keys.
-An attacker who modifies the ROM will perturb `CreatorRootKey` (to avoid doing so would require a preimage attack on the ROM checksum calculation or the `KM_DERIVE` function).
-The result is that, while the chip will function, it will have the "wrong" root key and the chain of trust used for attestation will be broken.
-
-## Fault-injection hardening
-
-The core integrity check, flowing from the ROM data to `CreatorRootKey`, should be infeasible to subvert.
-However, `rom_ctrl` also controls bus access to ROM data and interacts with other blocks.
-To avoid attacks propagating into the rest of the system, we take the following extra hardening steps:
-
-- All internal FSMs are sparsely encoded, with a minimum Hamming distance of 3.
-- The "good" signal passed to the power manager is multi-bit encoded (using `mubi4_t`).
-- The switching signals for the mux are multi-bit encoded (using `mubi4_t`).
-- We check to ensure the mux doesn't switch back to the checker after giving access to the bus.
-- The main FSM has internal consistency checking to ensure that other blocks don't signal completion when the FSM is in a state that doesn't expect them to be running.
-
-## Hardware Interfaces
-
-* [Interface Tables](data/rom_ctrl.hjson#interfaces)
-
-### Parameters
-
-Parameter                   | Default (Max)         | Top Earlgrey | Description
-----------------------------|-----------------------|--------------|---------------
-`RndCnstRomKey`             | (see RTL)             | (see RTL)    | Compile-time random default constant for scrambling key (used in `prim_prince` block).
-`RndCnstRomNonce`           | (see RTL)             | (see RTL)    | Compile-time random default constant for scrambling nonce (used in `prim_prince` block and the two S&P blocks).
-
-### Signals
-
-The table below lists other ROM controller inter-module signals.
-
-<table>
-  <tr>
-    <th>Signal</th>
-    <th>Type</th>
-    <th>Destination</th>
-    <th>Description</th>
-  </tr>
-  <tr>
-    <td><code>pwrmgr_data_o</code></td>
-    <td><code>rom_ctrl_pkg::pwrmgr_data_t</code></td>
-    <td>pwrmgr</td>
-    <td>
-      <p>
-        A structure with two fields.
-        The first, <code>done</code>, becomes true when the ROM check is complete and remains true until reset.
-      </p><p>
-        The second, <code>good</code>, is only valid if <code>done</code> is true.
-        This is true if the digest computation matched the expected value stored in the top words of ROM and false otherwise.
-        This field stays constant when <code>done</code> is true.
-      </p>
-    </td>
-  </tr>
-
-  <tr>
-    <td><code>keymgr_data_o</code></td>
-    <td><code>rom_ctrl_pkg::keymgr_data_t</code></td>
-    <td>keymgr</td>
-    <td>
-      A 256-bit digest, together with a <code>valid</code> signal.
-      Once the ROM check is complete, <code>valid</code> will become true and will then remain true until reset.
-      The digest in <code>data</code> is only valid when <code>valid</code> is true and is be constant until reset.
-    </td>
-  </tr>
-
-  <tr>
-    <td><code>kmac_data_o</code></td>
-    <td>kmac_pkg::app_req_t</td>
-    <td>kmac</td>
-    <td>
-      Outgoing data to KMAC.
-      Data is sent in 64-bit words in the <code>data</code> field.
-      When a word of data is available, the <code>valid</code> field is true.
-      When this is the last word of data, the <code>last</code> field is also true.
-      Since we never send partial words, the <code>strb</code> field is always zero.
-    </td>
-  </tr>
-  <tr>
-    <td><code>kmac_data_i</code></td>
-    <td>kmac_pkg::app_rsp_t</td>
-    <td>kmac</td>
-    <td>
-      Incoming data from KMAC interface.
-      This contains a <code>ready</code> signal for passing ROM data and a <code>done</code> signal that shows a digest has been computed.
-      When <code>done</code> is true, the digest is exposed in two shares (<code>digest_share0</code> and <code>digest_share1</code>).
-      The <code>error</code> field is ignored.
-    </td>
-  </tr>
-</table>
-
-# Programmer's Guide
-
-Software will mostly interact with the ROM controller by fetching code or loading data from ROM.
-For this, the block looks like a block of memory, accessible through a TL-UL window.
-However, there are a few registers that are accessible.
-Other than the standard [`ALERT_TEST`](data/rom_ctrl.hjson#alert_test) register, all are read-only.
-
-The [`FATAL_ALERT_CAUSE`](data/rom_ctrl.hjson#fatal_alert_cause) register might change value during operations (if an alert is signalled), but the other registers will all have fixed values by the time any software runs.
-
-To get the computed ROM digest, software can read [`DIGEST_0`](data/rom_ctrl.hjson#digest_0) through [`DIGEST_7`](data/rom_ctrl.hjson#digest_7).
-The ROM also contains an expected ROM digest.
-Unlike the rest of the contents of ROM, this isn't scrambled.
-As such, software can't read it through the standard ROM interface (which would try to unscramble it again, resulting in rubbish data that would cause a failed ECC check).
-In case software needs access to this value, it can be read at [`EXP_DIGEST_0`](data/rom_ctrl.hjson#exp_digest_0) through [`EXP_DIGEST_7`](data/rom_ctrl.hjson#exp_digest_7).
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_rom_ctrl.h)
-
-## Register Table
-
-* [Register Table](data/rom_ctrl.hjson#registers)
diff --git a/hw/ip/rom_ctrl/doc/programmers_guide.md b/hw/ip/rom_ctrl/doc/programmers_guide.md
new file mode 100644
index 0000000000000..7e52b037516bf
--- /dev/null
+++ b/hw/ip/rom_ctrl/doc/programmers_guide.md
@@ -0,0 +1,22 @@
+# Programmer's Guide
+
+Software will mostly interact with the ROM controller by fetching code or loading data from ROM.
+For this, the block looks like a block of memory, accessible through a TL-UL window.
+However, there are a few registers that are accessible.
+Other than the standard [`ALERT_TEST`](../data/rom_ctrl.hjson#alert_test) register, all are read-only.
+
+The [`FATAL_ALERT_CAUSE`](../data/rom_ctrl.hjson#fatal_alert_cause) register might change value during operations (if an alert is signalled), but the other registers will all have fixed values by the time any software runs.
+
+To get the computed ROM digest, software can read [`DIGEST_0`](../data/rom_ctrl.hjson#digest_0) through [`DIGEST_7`](../data/rom_ctrl.hjson#digest_7).
+The ROM also contains an expected ROM digest.
+Unlike the rest of the contents of ROM, this isn't scrambled.
+As such, software can't read it through the standard ROM interface (which would try to unscramble it again, resulting in rubbish data that would cause a failed ECC check).
+In case software needs access to this value, it can be read at [`EXP_DIGEST_0`](../data/rom_ctrl.hjson#exp_digest_0) through [`EXP_DIGEST_7`](../data/rom_ctrl.hjson#exp_digest_7).
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_rom_ctrl.h)
+
+## Register Table
+
+* [Register Table](../data/rom_ctrl.hjson#registers)
diff --git a/hw/ip/rom_ctrl/doc/theory_of_operation.md b/hw/ip/rom_ctrl/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..29bc462060e17
--- /dev/null
+++ b/hw/ip/rom_ctrl/doc/theory_of_operation.md
@@ -0,0 +1,198 @@
+# Theory of Operation
+
+## Block Diagram
+
+The image below shows a high-level block diagram of the module.
+Blue boxes are instantiations of generic primitives that are used elsewhere on the chip.
+Green boxes are simple operations; the meat of the design is in the grey boxes.
+
+The upper half of the diagram shows paths for ROM reads when the system is in normal operation.
+The lower half of the diagram shows the ROM checker.
+This is triggered by the power manager early in the chip boot sequence to check validity of the ROM image.
+It runs exactly once, and releases the green multiplexer when it is done.
+
+![ROM Controller Block Diagram](../doc/rom_ctrl_blockdiag.svg)
+
+## ROM access when chip is in operation
+
+Once the chip has booted, ROM accesses are requested over the system TL-UL bus.
+These come in through the TL-UL SRAM adapter (top-left of block diagram).
+In normal operation, the green multiplexer will give access to these TL reads.
+The address is scrambled at the first substitution-permutation network (marked S&P in the diagram).
+
+In parallel with the ROM access, a reduced `prim_prince` primitive (5 rounds with latency 1; equivalent to the cipher used for SRAM) computes a 39-bit truncated keystream for the block.
+On the following cycle, the scrambled data from ROM goes through a substitution-permutation network and is then XOR'd with the keystream.
+This scheme is the same as that used by the [SRAM controller](../../sram_ctrl/README.md), but is much simplified because the ROM doesn't have to deal with writes, byte accesses or key changes.
+
+The output from the XOR is the unscrambled 32-bit data, plus seven ECC bits.
+This data is passed straight through the TL-UL SRAM adapter; the ECC bits are used as a signal integrity check by the system bus.
+
+The following diagram shows the timing of the different signals.
+The time from the `req` output from the `tlul_adapter_sram` to the response that appears on its `rvalid` input is one cycle.
+The "scrambling scheme" for addresses in the diagram is to reverse their digits.
+The word stored at address 21 in the ROM is denoted `w21`.
+The keystream value for address 12 is denoted `k12`.
+The unscrambled ROM data for (logical) address 12 is denoted `d12`.
+
+```wavejson
+{signal: [
+  {name: 'clk', wave: 'p....', period: 2},
+  {name: 'req', wave: '0.1...0...'},
+  {name: 'addr', wave: 'x.3.4.x...', data: ['12', '34']},
+  {name: 'scrambled addr', wave: 'x.3.4.x...', data: ['21', '43']},
+  {name: 'scrambled rdata + ecc', wave: 'x...3.4.x.', data: ['w21', 'w43']},
+  {name: 'keystream', wave: 'x...3.4.x.', data: ['k12', 'k34']},
+  {name: 'rdata + ecc', wave: 'x...3.4.x.', data: ['d12', 'd34']},
+  {name: 'rvalid', wave: '0...1...0.'},
+]}
+```
+
+The `prim_prince` primitive and the two substitution-permutation networks are all parameterised by "keys".
+For `rom_ctrl`, these keys are global randomised netlist constants: they are assumed to be difficult to recover, but aren't considered secret data.
+
+## The startup ROM check
+
+The ROM checker runs immediately after reset.
+Until it is done, it controls ROM address requests (through the green multiplexer).
+The select signal for this multiplexer has a redundant encoding to protect it against fault injection attacks.
+If the select signal has an invalid value, this will trigger a fatal alert.
+Before starting to read data, it starts a cSHAKE operation on the [KMAC](../../kmac/README.md) module using one of its application interfaces.
+We expect to use the `cSHAKE256` algorithm, with prefix "ROM_CTRL".
+The [Application Interface](../../kmac/README.md#application-interface) section of the KMAC documentation details the parameters used.
+
+The checker reads the ROM contents in address order, resulting in a scattered access pattern on the ROM itself because of the address scrambling.
+Each read produces 39 bits of data, which are padded with zeros to 64 bits to match the interface expected by the KMAC block.
+The checker FSM loops through almost all the words in ROM (from bottom to top), passing each to the KMAC block with the ready/valid interface and setting the `kmac_data_o.last` bit for the last word that is sent.
+Once the last word has been sent, the FSM releases the multiplexer; this now switches over permanently to allow access through the TL-UL SRAM adapter.
+
+The top eight words in ROM (by logical address) are interpreted as a 256-bit expected hash.
+Unlike the rest of ROM, their data is not stored scrambled, so the expected hash can be read directly.
+This is taken by the checker FSM (ignoring ECC bits) and will be compared with the digest that is read back from the KMAC block.
+
+Once it comes back, the digest is forwarded directly to the [Key Manager](../../keymgr/README.md).
+It is also compared with the hash that was read from the top eight words of ROM.
+On a match, `pwrmgr_data_o.good` is signalled as `Mubi4True`.
+In either case, `pwrmgr_data_o.done` goes high when the calculation is complete.
+
+The diagram below shows the operation of the simple FSM.
+
+<div align="center">
+
+![ROM checker FSM Diagram](../doc/rom_check_fsm.svg)
+
+</div>
+
+## What does the ROM check do?
+
+One of the possible physical attacks on a system like OpenTitan is to subvert the ROM.
+The regular structure of a ROM is useful because it makes metal fixes easy, but (for the same reasons) it makes the ROM quite an easy target for an attacker.
+See \[SKO-05\][^SKO-05], section 2.1.1, for a description of ROMs and attacks on them.
+
+[^SKO-05]: **SKO-05**: Skorobogatov, [*Semi-Invasive Attacks - A New Approach to Hardware Security Analysis*](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-630.html), University of Cambridge Computer Laboratory Technical Report 630, 2005
+
+Since the code in ROM is the first thing to execute, an attacker that modifies it undetected can completely subvert the chain of trust.
+As such, OpenTitan needs some form of ROM integrity checking and the ROM checker is the module in charge of providing it.
+
+After bringing the ROM controller module out of reset, the power manager must wait until `pwrmgr_data_o.done` is asserted before starting the host processor.
+The ROM controller also passes the `pwrmgr_data_o.good` signal.
+The power manager can use this to decide whether to boot (taking into account life cycle state).
+This provides an extra safety check, but the real security comes from key manager integration described below.
+
+The simple KMAC interface assumes that KMAC is pre-configured to run the cSHAKE algorithm with a prefix specific to the ROM checker.
+The ROM checker will not assert `kmac_data_o.valid` after finishing the one and only digest computation.
+The KMAC module may choose to add a check for this, to detect reset glitches affecting the `rom_ctrl` block.
+
+The integration with the key manager is based on forwarding the digest data in `kmac_data_i` as `keymgr_data_o.data`.
+This 256-bit digest will be incorporated into the [`CreatorRootKey`](../../../../doc/security/specs/identities_and_root_keys/README.md#creator-root-key).
+The key manager should only allow one transaction (of 256 bits / 32 bits = 8 beats) after reset to pass this information across.
+On future messages, it should raise an alert, defeating an attacker that tries to trigger extra transactions before or after the real one.
+
+`CreatorRootKey` forms the first key in the chain described in Identities and Root Keys.
+An attacker who modifies the ROM will perturb `CreatorRootKey` (to avoid doing so would require a preimage attack on the ROM checksum calculation or the `KM_DERIVE` function).
+The result is that, while the chip will function, it will have the "wrong" root key and the chain of trust used for attestation will be broken.
+
+## Fault-injection hardening
+
+The core integrity check, flowing from the ROM data to `CreatorRootKey`, should be infeasible to subvert.
+However, `rom_ctrl` also controls bus access to ROM data and interacts with other blocks.
+To avoid attacks propagating into the rest of the system, we take the following extra hardening steps:
+
+- All internal FSMs are sparsely encoded, with a minimum Hamming distance of 3.
+- The "good" signal passed to the power manager is multi-bit encoded (using `mubi4_t`).
+- The switching signals for the mux are multi-bit encoded (using `mubi4_t`).
+- We check to ensure the mux doesn't switch back to the checker after giving access to the bus.
+- The main FSM has internal consistency checking to ensure that other blocks don't signal completion when the FSM is in a state that doesn't expect them to be running.
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/rom_ctrl.hjson#interfaces)
+
+### Parameters
+
+Parameter                   | Default (Max)         | Top Earlgrey | Description
+----------------------------|-----------------------|--------------|---------------
+`RndCnstRomKey`             | (see RTL)             | (see RTL)    | Compile-time random default constant for scrambling key (used in `prim_prince` block).
+`RndCnstRomNonce`           | (see RTL)             | (see RTL)    | Compile-time random default constant for scrambling nonce (used in `prim_prince` block and the two S&P blocks).
+
+### Signals
+
+The table below lists other ROM controller inter-module signals.
+
+<table>
+  <tr>
+    <th>Signal</th>
+    <th>Type</th>
+    <th>Destination</th>
+    <th>Description</th>
+  </tr>
+  <tr>
+    <td><code>pwrmgr_data_o</code></td>
+    <td><code>rom_ctrl_pkg::pwrmgr_data_t</code></td>
+    <td>pwrmgr</td>
+    <td>
+      <p>
+        A structure with two fields.
+        The first, <code>done</code>, becomes true when the ROM check is complete and remains true until reset.
+      </p><p>
+        The second, <code>good</code>, is only valid if <code>done</code> is true.
+        This is true if the digest computation matched the expected value stored in the top words of ROM and false otherwise.
+        This field stays constant when <code>done</code> is true.
+      </p>
+    </td>
+  </tr>
+
+  <tr>
+    <td><code>keymgr_data_o</code></td>
+    <td><code>rom_ctrl_pkg::keymgr_data_t</code></td>
+    <td>keymgr</td>
+    <td>
+      A 256-bit digest, together with a <code>valid</code> signal.
+      Once the ROM check is complete, <code>valid</code> will become true and will then remain true until reset.
+      The digest in <code>data</code> is only valid when <code>valid</code> is true and is constant until reset.
+    </td>
+  </tr>
+
+  <tr>
+    <td><code>kmac_data_o</code></td>
+    <td>kmac_pkg::app_req_t</td>
+    <td>kmac</td>
+    <td>
+      Outgoing data to KMAC.
+      Data is sent in 64-bit words in the <code>data</code> field.
+      When a word of data is available, the <code>valid</code> field is true.
+      When this is the last word of data, the <code>last</code> field is also true.
+      Since we never send partial words, the <code>strb</code> field is always zero.
+    </td>
+  </tr>
+  <tr>
+    <td><code>kmac_data_i</code></td>
+    <td>kmac_pkg::app_rsp_t</td>
+    <td>kmac</td>
+    <td>
+      Incoming data from KMAC interface.
+      This contains a <code>ready</code> signal for passing ROM data and a <code>done</code> signal that shows a digest has been computed.
+      When <code>done</code> is true, the digest is exposed in two shares (<code>digest_share0</code> and <code>digest_share1</code>).
+      The <code>error</code> field is ignored.
+    </td>
+  </tr>
+</table>
diff --git a/hw/ip/rstmgr/README.md b/hw/ip/rstmgr/README.md
index 91fdbc9c2a028..ed5b7c327eef5 100644
--- a/hw/ip/rstmgr/README.md
+++ b/hw/ip/rstmgr/README.md
@@ -17,335 +17,3 @@ This document describes the functionality of the reset controller and its intera
 *   Always-on alert crash dump register.
 *   Always-on CPU crash dump register.
 *   Reset consistency checks.
-
-# Theory of Operation
-
-The OpenTitan reset topology and reset controller block diagram are shown in the diagram below.
-The reset controller is closely related to the [power controller](../pwrmgr/README.md), please refer to that spec for details on how reset controller inputs are controlled.
-
-![Reset Topology](./doc/reset_topology.svg)
-
-## Reset Topology
-
-The topology can be summarized as follows:
-
-*   There are two reset domains
-    *   Test Domain - Driven by `TRSTn`
-    *   Core Domain - Driven by internal [POR circuitry](../../top_earlgrey/ip/ast/README.md).
-*   Test domain is comprised of the following components
-    *   SOC TAP and related DFT circuits
-    *   RISC-V TAP (part of the `rv_dm` module)
-
-The test domain does not have sub reset trees.
-`TRSTn` is used directly by all components in the domain.
-
-The Core domain consists of all remaining logic and contains 4 sub reset trees, see table below.
-
-<table>
-  <tr>
-   <td>
-<strong>Reset Tree</strong>
-   </td>
-   <td><strong>Description</strong>
-   </td>
-  </tr>
-  <tr>
-   <td><code>rst_por_n</code>
-   </td>
-   <td><code>POR reset tree.</code>
-<p>
-<code>This reset is driven by ast, stretched inside the reset manager and resets all core domain logic in the design. </code>
-   </td>
-  </tr>
-  <tr>
-   <td><code>rst_lc_n</code>
-   </td>
-   <td><code>Life Cycle reset tree.</code>
-<p>
-<code>This reset is derived from rst_por_n and resets all logic in the design except:</code><ul>
-
-<li><code>rv_dm</code>
-<li><code>A small portion of pinmux</code></li></ul>
-   </td>
-  </tr>
-  <tr>
-   <td><code>rst_sys_n</code>
-   </td>
-   <td><code>Debug reset tree.</code>
-<p>
-<code>This reset is derived from rst_por_n and resets debug domain logic excluded in the life cycle reset tree</code><ul>
-   </td>
-  </tr>
-  <tr>
-   <td><code>rst_{module}_n</code>
-   </td>
-   <td><code>Module specific reset.</code>
-<p>
-<code>This reset is derived from rst_lc_n and sets only the targeted module and nothing else.</code>
-<p>
-<code>For OpenTitan, the only current targets are spi_device, all instances of spi_host, all instances of i2c and usbdev</code>
-   </td>
-  </tr>
-</table>
-
-The reset trees are cascaded upon one another in this order:
-- `rst_por_n` -> `rst_lc_n` -> `rst_module_n`
-- `rst_por_n` -> `rst_sys_n` -> `rst_module_n`
-This means when a particular reset asserts, all downstream resets also assert.
-
-The primary difference between `rst_lc_n` and `rst_sys_n` is that the former controls the reset state of most logic in the system, while the latter controls the reset state only of the debug domain.
-This separation is required because the debug domain may request the system to reset while retaining debug info and control.
-This is particularly useful if one wanted to debug something early during the boot flow, and thus needed to set a break point after requesting a debug reset.
-
-The reset topology also contains additional properties:
-*   Selective processor HART resets, such as `hartreset` in `dmcontrol`, are not implemented, as it causes a security policy inconsistency with the remaining system.
-    *   Specifically, these selective resets can cause the cascaded property shown above to not be obeyed.
-*   Modules do not implement local resets that wipe configuration registers, especially if there are configuration locks.
-    *   Modules are allowed to implement local soft resets that clear datapaths; but these are examined on a case by case basis for possible security side channels.
-*   In a production system, the Test Reset Input (`TRSTn`) should be explicitly asserted through system integration.
-    *   In a production system, `TRSTn` only needs to be released for RMA transitions and nothing else.
-.
-
-## Reset Manager
-
-The reset manager handles the reset of the core domain, and also holds relevant reset information in CSR registers, such as:
-
-*  [`RESET_INFO`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#reset_info) indicates why the system was reset.
-*  [`ALERT_INFO`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info) contains the recorded alert status prior to system reset.
-   *  This is useful in case the reset was triggered by an alert escalation.
-*  [`CPU_INFO`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#cpu_info) contains recorded CPU state prior to system reset.
-   *  This is useful in case the reset was triggered by a watchdog where the host hung on a particular bus transaction.
-
-Additionally, the reset manager, along with the power manager, accepts requests from the system and asserts resets for the appropriate clock trees.
-These requests primarily come from the following sources:
-*  Peripherals capable of reset requests: such as [sysrst_ctrl](../sysrst_ctrl/README.md) and [always on timers ](../aon_timer/README.md).
-*  Debug modules such as `rv_dm`.
-*  Power manager request for low power entry and exit.
-*  Escalation reset requests such as those from `alert_handler` or `pwrmgr` itself.
-*  Direct software request for reset.
-
-### Shadow Resets
-
-OpenTitan supports the shadow configuration registers.
-These are registers stored in two constantly checking copies to ensure the values are not maliciously or accidentally disturbed.
-For these components, the reset manager outputs a shadow reset dedicated to resetting only the shadow storage.
-This reset separation ensures that a targetted attack on the reset line cannot easily defeat shadow registers.
-
-### Reset Consistency Checks
-
-The reset manager implements reset consistency checks to ensure that triggered resets are supposed to happen and not due to some fault in the system.
-Every leaf reset in the system has an associated consistency checker.
-
-The consistency check ensures that when a leaf reset asserts, either its parent reset must have asserted, or the software request, if available, has asserted.
-While this sounds simple in principle, the check itself crosses up to 3 clock domains and must be carefully managed.
-
-First, the parent and leaf resets are used to asynchronously assert a flag indication.
-This flag indication is then synchronized into the reset manager's local clock domain.
-
-The reset manager then checks as follows:
-- If a leaf reset has asserted, check to see either its parent or software request (synchronous to the local domain) has asserted.
-
-- If the condition is not true, it is possible the parent reset indication is still being synchronized, thus we wait for the parent indication.
-
-- It is also possible the parent indication was seen first, but the leaf condition was not, in this case, we wait for the leaf indication.
-
-- A timeout period corresponding to the maximum synchronization delay is used to cover both waits.
-  - If the appropriate pairing is not seen in the given amount of time, signal an error, as the leaf reset asserted without cause.
-
-- If all reset conditions are satisfied, wait for the reset release to gracefully complete the cycle.
-
-## Hardware Interfaces
-
-### Parameters
-
-The following table lists the instantiation parameters of `rstmgr`.
-
-
-Parameter                   | Default       | Description
-----------------------------|---------------|---------------
-`SecCheck`                  | 1             | Enables reset consistency checks on the leaf reset.  Each check contains a small FSM.
-`SecMaxSyncDelay`           | 2             | The default synchronization delay assumptions used in reset consistency checks.  If a design uses a sync cell with more stages of delay, that value should be supplied.
-
-
-### Signals
-
-* [Interface Tables](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#interfaces)
-
-## Design Details
-
-The reset manager generates the resets required by the system by synchronizing reset tree components to appropriate output clocks.
-As a result, a particular reset tree (for example `rst_lc_n`) may have multiple outputs depending on the clock domains of its consumers.
-
-Each reset tree is discussed in detail below.
-
-## POR Reset Tree
-
-The POR reset tree, `rst_por_n`, is the root reset of the entire device.
-If this reset ever asserts, everything in the design is reset.
-
-The `ast` input `aon_pok` is used as the root reset indication.
-It is filtered and stretched to cover any slow voltage ramp scenarios.
-The stretch parameters are design time configurations.
-
-*   The filter acts as a synchronizer and is by default 3 stages.
-*   The count by default is 32.
-    *   The counter increments only when all stages of the filter are 1.
-    *   If any stage at any point becomes '0', the reset counter returns to 0 and downstream logic is driven to reset again.
-*   Both functions are expected to operate on slow, always available KHz clocks.
-
-
-## Life Cycle Reset Tree
-
-Life cycle reset, `rst_lc_n` asserts under the following conditions:
-*  Whenever `rst_por_n` asserts.
-*  Whenever a peripheral reset request (always on timer watchdog, rbox reset request, alert handler escalation, direct software request) is received.
-
-The `rst_lc_n` tree contains both always-on and non-always-on versions.
-How many non-always-on versions is dependent on how many power domains are supported by the system.
-
-## System Reset Tree
-
-System reset, `rst_sys_n` , assertion depends on life cycle state.
-
-When in PROD and PROD_END states, `rst_sys_n` is identical to `rst_lc_n`.
-
-When in TEST, RMA and DEV states, `rst_sys_n` is identical to `rst_lc_n` unless the reset request is `ndmreset_req`.
-`ndmreset_req` is issued by the debug module of the system, it requests for all logic, except those needed to maintain debug state to reset.
-
-Since `ndmreset_req` is valid only during TEST, RMA and DEV states, it is the only place where the reset is differentiated.
-During these states, when `ndmreset_req` is issued, all logic except the debug module and associated glue logic are reset.
-
-The `rst_sys_n` tree contains both always-on and non-always-on versions.
-How many non-always-on versions is dependent on how many power domains are supported by the system.
-
-## Output Leaf Resets
-
-The reset trees discussed above are not directly output to the system for consumption.
-Instead, the output leaf resets are synchronized versions of the various root resets.
-How many leaf resets there are and to which clock is decided by the system and templated through the reset manager module.
-
-Assuming a leaf output has N power domains and M clock domains, it potentially means one reset tree may output NxM outputs to satisfy all the reset scenario combinations.
-
-## Power Domains and Reset Trees
-
-It is alluded above that reset trees may contain both always-on and non-always-on versions.
-This distinction is required to support power manager's various low power states.
-When a power domain goes offline, all of its components must reset, regardless of the reset tree to which it belongs.
-
-For example, assume a system with two power domains - `Domain A` is always-on, and `Domain B` is non-always-on.
-When `Domain B` is powered off, all of `Domain B`'s resets, from `rst_lc_n`, `rst_sys_n` to `rst_module_n` are asserted.
-However, the corresponding resets for `Domain A` are left untouched because it has not been powered off.
-
-## Software Controlled Resets
-
-Certain leaf resets can be directly controlled by software.
-Due to security considerations, most leaf resets cannot be controlled, only a few blocks are given exceptions.
-The only blocks currently allowed to software reset are `spi_device`, `usbdev`, `spi_host` and `i2c`.
-
-The criteria for selecting which block is software reset controllable is meant to be overly restrictive.
-Unless there is a clear need, the default option is to not provide reset control.
-
-In general, the following rules apply:
-*   If a module has configuration register lockdown, it cannot be software resettable.
-*   If a module operates on secret data (keys), it cannot be software resettable.
-    *   Or a software reset should render the secret data unusable until some initialization routine is run to reduce the Hamming leakage of secret data.
-*   If a module can alter the software's perception of time or general control flow (timer or interrupt aggregator), it cannot be software resettable.
-*   If a module contains sensor functions for security, it cannot be software resettable.
-*   If a module controls life cycle or related function, it cannot be software resettable.
-
-## Summary
-
-The following table summarizes the different reset requests and which part of each reset tree, along with what power domain is affected.
-
-Reset Request Type                | Example                                                       | POR Reset Tree | LC Reset Tree   | SYS Reset Tree  | Module Specific Reset
-----------------------------------| --------------------------------------------------------------| ---------------| -------------   | --------------- | ----------------------
-POR                               | VCC toggle, POR_N pad toggle                                  | all domains    | all domains     | all domains     | all domains
-HW reset Request                  | `aon_timer` reset request, `alert_handler` escalation request |                | all domains     | all domains     | all domains
-Directed SW system reset request  | `rstmgr` SW_RESET                                             |                | all domains     | all domains     | all domains
-Ndm reset request (PROD/PROD_END) | `rv_dm` non-debug-module reset request in PROD                |                | all domains     | all domains     | all domains
-Ndm reset request (Other states)  | `rv_dm` non-debug-module reset request in DEV                 |                | all domains     |                 | all domains
-SW low power entry                | wait-for-interrupt deep sleep entry                           |                | non-aon domains | non-aon domains | non-aon domains
-SW controlled reset request       | `rstmgr` SW_RST_CTRL_N                                        |                |                 |                 | all domains
-
-
-## Reset Information
-
-The reset information register is a reflection of the reset state from the perspective of the system.
-In OpenTitan, since there is only 1 host, it is thus from the perspective of the processor.
-This also suggests that if the design had multiple processors, there would need to be multiple such registers.
-
-If a reset does not cause the processor to reset, there is no reason for the reset information to change (this is also why there is a strong security link between the reset of the processor and the rest of the system).
-The following are the currently defined reset reasons and their meaning:
-
-Reset Cause             | Description
-------------------------|---------------
-`POR`                   | Cold boot, the system was reset through POR circuitry.
-`LOW_POWER_EXIT`        | Warm boot, the system was reset through low power exit.
-`NDM RESET`             | Warm boot, the system was reset through `rv_dm` non-debug-module request.
-`SW_REQ`                | Warm boot, the system was reset through [`RESET_REQ`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#reset_req).
-`HW_REQ`                | Warm boot, the system was reset through peripheral requests.  There may be multiple such requests.
-
-
-The reset info register is write 1 clear.
-It is software responsibility to clear old reset reasons; the reset manager simply records based on the rules below.
-
-Excluding power on reset, which is always recorded when the device POR circuitry is triggered, the other resets are recorded when authorized by the reset manager.
-Reset manager authorization is based on reset categories as indicated by the power manager.
-The power manager has three reset categories that are mutually exclusive:
-*   No reset has been triggered by pwrmgr.
-*   Low power entry reset has been triggered by pwrmgr.
-*   Software or peripheral reset request has been triggered by pwrmgr.
-
-The reset categories are sent to the reset manager so that it can decide which reason to record when the processor reset is observed.
-Non-debug-module resets are allowed only when no resets have been triggered by pwrmgr.
-
-Since a reset could be motivated by multiple reasons (a security escalation during low power transition for example), the reset information registers constantly record all reset causes in which it is allowed.
-The only case where this is not done is `POR`, where active recording is silenced until the first processor reset release.
-
-Even though four reset causes are labeled as warm boot, their effects on the system are not identical.
-
-*  When the reset cause is `LOW_POWER_EXIT`, it means only the non-always-on domains have been reset.
-   *  Always-on domains retain their pre-low power values.
-*  When the reset cause is `NDM_RESET`, it means only the `rst_sys_n` tree has asserted for all power domains.
-*  When the reset cause is `HW_REQ` or `SW_REQ`, it means everything other than power / clock / reset managers have reset.
-
-This behavioral difference may be important to software, as it implies the configuration of the system may need to be different.
-
-## Crash Dump Information
-
-The reset manager manages crash dump information for software debugging across unexpected resets and watchdogs.
-When enabled, the latest alert information and latest CPU information are captured in always-on registers.
-
-When the software resumes after the reset, it is then able to examine the last CPU state or the last set of alert information to understand why the system has reset.
-
-The enable for such debug capture can be locked such that it never captures.
-
-### Alert Information
-
-The alert information register contains the value of the alert crash dump prior to a triggered reset.
-Since this information differs in length between system implementation, the alert information register only displays 32-bits at a time.
-The [`ALERT_INFO_ATTR`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info_attr) register indicates how many 32-bit data segments must be read.
-
-To enable alert crash dump capture, set [`ALERT_INFO_CTRL.EN`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info_ctrl) to 1.
-Once the system has reset, check [`ALERT_INFO_ATTR.CNT_AVAIL`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info_attr) for how many reads need to be done.
-Set [`ALERT_INFO_CTRL.INDEX`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info_ctrl) to the desired segment, and then read the output from [`ALERT_INFO`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info).
-
-### CPU Information
-
-The CPU information register contains the value of the CPU state prior to a triggered reset.
-Since this information differs in length between system implementation, the information register only displays 32-bits at a time.
-
-For more details on the CPU dump details, please see [crash dump](../rv_core_ibex/README.md#crash-dump-collection).
-
-The [`CPU_INFO_ATTR`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#cpu_info_attr) register indicates how many 32-bit data segments must be read.
-Software then simply needs to write in [`CPU_INFO_CTRL.INDEX`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#cpu_info_ctrl) which segment it wishes and then read out the [`CPU_INFO`](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#cpu_info) register.
-
-# Programmers Guide
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_rstmgr.h)
-
-## Register Table
-
-* [Register Table](../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#registers)
diff --git a/hw/ip/rstmgr/doc/programmers_guide.md b/hw/ip/rstmgr/doc/programmers_guide.md
new file mode 100644
index 0000000000000..7232d0f474712
--- /dev/null
+++ b/hw/ip/rstmgr/doc/programmers_guide.md
@@ -0,0 +1,9 @@
+# Programmer's Guide
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_rstmgr.h)
+
+## Register Table
+
+* [Register Table](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#registers)
diff --git a/hw/ip/rstmgr/doc/theory_of_operation.md b/hw/ip/rstmgr/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..5ccf0ddfce962
--- /dev/null
+++ b/hw/ip/rstmgr/doc/theory_of_operation.md
@@ -0,0 +1,321 @@
+# Theory of Operation
+
+The OpenTitan reset topology and reset controller block diagram are shown in the diagram below.
+The reset controller is closely related to the [power controller](../../pwrmgr/README.md); please refer to that spec for details on how reset controller inputs are controlled.
+
+![Reset Topology](../doc/reset_topology.svg)
+
+## Reset Topology
+
+The topology can be summarized as follows:
+
+*   There are two reset domains
+    *   Test Domain - Driven by `TRSTn`
+    *   Core Domain - Driven by internal [POR circuitry](../../../top_earlgrey/ip/ast/README.md).
+*   Test domain is comprised of the following components
+    *   SOC TAP and related DFT circuits
+    *   RISC-V TAP (part of the `rv_dm` module)
+
+The test domain does not have sub reset trees.
+`TRSTn` is used directly by all components in the domain.
+
+The Core domain consists of all remaining logic and contains 4 sub reset trees, see table below.
+
+<table>
+  <tr>
+   <td>
+<strong>Reset Tree</strong>
+   </td>
+   <td><strong>Description</strong>
+   </td>
+  </tr>
+  <tr>
+   <td><code>rst_por_n</code>
+   </td>
+   <td><code>POR reset tree.</code>
+<p>
+<code>This reset is driven by ast, stretched inside the reset manager and resets all core domain logic in the design. </code>
+   </td>
+  </tr>
+  <tr>
+   <td><code>rst_lc_n</code>
+   </td>
+   <td><code>Life Cycle reset tree.</code>
+<p>
+<code>This reset is derived from rst_por_n and resets all logic in the design except:</code><ul>
+
+<li><code>rv_dm</code></li>
+<li><code>A small portion of pinmux</code></li></ul>
+   </td>
+  </tr>
+  <tr>
+   <td><code>rst_sys_n</code>
+   </td>
+   <td><code>Debug reset tree.</code>
+<p>
+<code>This reset is derived from rst_por_n and resets debug domain logic excluded from the life cycle reset tree.</code>
+   </td>
+  </tr>
+  <tr>
+   <td><code>rst_{module}_n</code>
+   </td>
+   <td><code>Module specific reset.</code>
+<p>
+<code>This reset is derived from rst_lc_n and resets only the targeted module and nothing else.</code>
+<p>
+<code>For OpenTitan, the only current targets are spi_device, all instances of spi_host, all instances of i2c and usbdev</code>
+   </td>
+  </tr>
+</table>
+
+The reset trees are cascaded upon one another in this order:
+- `rst_por_n` -> `rst_lc_n` -> `rst_module_n`
+- `rst_por_n` -> `rst_sys_n` -> `rst_module_n`
+This means when a particular reset asserts, all downstream resets also assert.
+
+The primary difference between `rst_lc_n` and `rst_sys_n` is that the former controls the reset state of most logic in the system, while the latter controls the reset state only of the debug domain.
+This separation is required because the debug domain may request the system to reset while retaining debug info and control.
+This is particularly useful if one wanted to debug something early during the boot flow, and thus needed to set a break point after requesting a debug reset.
+
+The reset topology also contains additional properties:
+*   Selective processor HART resets, such as `hartreset` in `dmcontrol`, are not implemented, as it causes a security policy inconsistency with the remaining system.
+    *   Specifically, these selective resets can cause the cascaded property shown above to not be obeyed.
+*   Modules do not implement local resets that wipe configuration registers, especially if there are configuration locks.
+    *   Modules are allowed to implement local soft resets that clear datapaths; but these are examined on a case by case basis for possible security side channels.
+*   In a production system, the Test Reset Input (`TRSTn`) should be explicitly asserted through system integration.
+    *   In a production system, `TRSTn` only needs to be released for RMA transitions and nothing else.
+
+
+## Reset Manager
+
+The reset manager handles the reset of the core domain, and also holds relevant reset information in CSR registers, such as:
+
+*  [`RESET_INFO`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#reset_info) indicates why the system was reset.
+*  [`ALERT_INFO`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info) contains the recorded alert status prior to system reset.
+   *  This is useful in case the reset was triggered by an alert escalation.
+*  [`CPU_INFO`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#cpu_info) contains recorded CPU state prior to system reset.
+   *  This is useful in case the reset was triggered by a watchdog where the host hung on a particular bus transaction.
+
+Additionally, the reset manager, along with the power manager, accepts requests from the system and asserts resets for the appropriate clock trees.
+These requests primarily come from the following sources:
+*  Peripherals capable of reset requests, such as [sysrst_ctrl](../../sysrst_ctrl/README.md) and [always-on timers](../../aon_timer/README.md).
+*  Debug modules such as `rv_dm`.
+*  Power manager request for low power entry and exit.
+*  Escalation reset requests such as those from `alert_handler` or `pwrmgr` itself.
+*  Direct software request for reset.
+
+### Shadow Resets
+
+OpenTitan supports shadow configuration registers.
+These are registers stored in two constantly checking copies to ensure the values are not maliciously or accidentally disturbed.
+For these components, the reset manager outputs a shadow reset dedicated to resetting only the shadow storage.
+This reset separation ensures that a targeted attack on the reset line cannot easily defeat shadow registers.
+
+### Reset Consistency Checks
+
+The reset manager implements reset consistency checks to ensure that triggered resets are supposed to happen and not due to some fault in the system.
+Every leaf reset in the system has an associated consistency checker.
+
+The consistency check ensures that when a leaf reset asserts, either its parent reset must have asserted, or the software request, if available, has asserted.
+While this sounds simple in principle, the check itself crosses up to 3 clock domains and must be carefully managed.
+
+First, the parent and leaf resets are used to asynchronously assert a flag indication.
+This flag indication is then synchronized into the reset manager's local clock domain.
+
+The reset manager then checks as follows:
+- If a leaf reset has asserted, check to see either its parent or software request (synchronous to the local domain) has asserted.
+
+- If the condition is not true, it is possible the parent reset indication is still being synchronized, thus we wait for the parent indication.
+
+- It is also possible the parent indication was seen first, but the leaf condition was not; in this case, we wait for the leaf indication.
+
+- A timeout period corresponding to the maximum synchronization delay is used to cover both waits.
+  - If the appropriate pairing is not seen in the given amount of time, signal an error, as the leaf reset asserted without cause.
+
+- If all reset conditions are satisfied, wait for the reset release to gracefully complete the cycle.
+
+## Hardware Interfaces
+
+### Parameters
+
+The following table lists the instantiation parameters of `rstmgr`.
+
+
+Parameter                   | Default       | Description
+----------------------------|---------------|---------------
+`SecCheck`                  | 1             | Enables reset consistency checks on the leaf reset.  Each check contains a small FSM.
+`SecMaxSyncDelay`           | 2             | The default synchronization delay assumptions used in reset consistency checks.  If a design uses a sync cell with more stages of delay, that value should be supplied.
+
+
+### Signals
+
+* [Interface Tables](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#interfaces)
+
+## Design Details
+
+The reset manager generates the resets required by the system by synchronizing reset tree components to appropriate output clocks.
+As a result, a particular reset tree (for example `rst_lc_n`) may have multiple outputs depending on the clock domains of its consumers.
+
+Each reset tree is discussed in detail below.
+
+## POR Reset Tree
+
+The POR reset tree, `rst_por_n`, is the root reset of the entire device.
+If this reset ever asserts, everything in the design is reset.
+
+The `ast` input `aon_pok` is used as the root reset indication.
+It is filtered and stretched to cover any slow voltage ramp scenarios.
+The stretch parameters are design time configurations.
+
+*   The filter acts as a synchronizer and is by default 3 stages.
+*   The count by default is 32.
+    *   The counter increments only when all stages of the filter are 1.
+    *   If any stage at any point becomes '0', the reset counter returns to 0 and downstream logic is driven to reset again.
+*   Both functions are expected to operate on slow, always-available kHz clocks.
+
+
+## Life Cycle Reset Tree
+
+Life cycle reset, `rst_lc_n`, asserts under the following conditions:
+*  Whenever `rst_por_n` asserts.
+*  Whenever a peripheral reset request (always on timer watchdog, rbox reset request, alert handler escalation, direct software request) is received.
+
+The `rst_lc_n` tree contains both always-on and non-always-on versions.
+The number of non-always-on versions depends on how many power domains are supported by the system.
+
+## System Reset Tree
+
+System reset, `rst_sys_n`, assertion depends on life cycle state.
+
+When in PROD and PROD_END states, `rst_sys_n` is identical to `rst_lc_n`.
+
+When in TEST, RMA and DEV states, `rst_sys_n` is identical to `rst_lc_n` unless the reset request is `ndmreset_req`.
+`ndmreset_req` is issued by the debug module of the system; it requests that all logic, except that needed to maintain debug state, be reset.
+
+Since `ndmreset_req` is valid only during TEST, RMA and DEV states, it is the only place where the reset is differentiated.
+During these states, when `ndmreset_req` is issued, all logic except the debug module and associated glue logic are reset.
+
+The `rst_sys_n` tree contains both always-on and non-always-on versions.
+The number of non-always-on versions depends on how many power domains are supported by the system.
+
+## Output Leaf Resets
+
+The reset trees discussed above are not directly output to the system for consumption.
+Instead, the output leaf resets are synchronized versions of the various root resets.
+The number of leaf resets, and the clock to which each one is synchronized, are decided by the system and templated through the reset manager module.
+
+Assuming a leaf output has N power domains and M clock domains, it potentially means one reset tree may output NxM outputs to satisfy all the reset scenario combinations.
+
+## Power Domains and Reset Trees
+
+As alluded to above, reset trees may contain both always-on and non-always-on versions.
+This distinction is required to support power manager's various low power states.
+When a power domain goes offline, all of its components must reset, regardless of the reset tree to which it belongs.
+
+For example, assume a system with two power domains - `Domain A` is always-on, and `Domain B` is non-always-on.
+When `Domain B` is powered off, all of `Domain B`'s resets, from `rst_lc_n`, `rst_sys_n` to `rst_module_n` are asserted.
+However, the corresponding resets for `Domain A` are left untouched because it has not been powered off.
+
+## Software Controlled Resets
+
+Certain leaf resets can be directly controlled by software.
+Due to security considerations, most leaf resets cannot be controlled; only a few blocks are given exceptions.
+The only blocks currently allowed to software reset are `spi_device`, `usbdev`, `spi_host` and `i2c`.
+
+The criteria for selecting which blocks are software reset controllable are meant to be overly restrictive.
+Unless there is a clear need, the default option is to not provide reset control.
+
+In general, the following rules apply:
+*   If a module has configuration register lockdown, it cannot be software resettable.
+*   If a module operates on secret data (keys), it cannot be software resettable.
+    *   Or a software reset should render the secret data unusable until some initialization routine is run to reduce the Hamming leakage of secret data.
+*   If a module can alter the software's perception of time or general control flow (timer or interrupt aggregator), it cannot be software resettable.
+*   If a module contains sensor functions for security, it cannot be software resettable.
+*   If a module controls life cycle or related function, it cannot be software resettable.
+
+## Summary
+
+The following table summarizes the different reset requests and which part of each reset tree, along with what power domain is affected.
+
+Reset Request Type                | Example                                                       | POR Reset Tree | LC Reset Tree   | SYS Reset Tree  | Module Specific Reset
+----------------------------------| --------------------------------------------------------------| ---------------| -------------   | --------------- | ----------------------
+POR                               | VCC toggle, POR_N pad toggle                                  | all domains    | all domains     | all domains     | all domains
+HW reset Request                  | `aon_timer` reset request, `alert_handler` escalation request |                | all domains     | all domains     | all domains
+Directed SW system reset request  | `rstmgr` SW_RESET                                             |                | all domains     | all domains     | all domains
+Ndm reset request (PROD/PROD_END) | `rv_dm` non-debug-module reset request in PROD                |                | all domains     | all domains     | all domains
+Ndm reset request (Other states)  | `rv_dm` non-debug-module reset request in DEV                 |                | all domains     |                 | all domains
+SW low power entry                | wait-for-interrupt deep sleep entry                           |                | non-aon domains | non-aon domains | non-aon domains
+SW controlled reset request       | `rstmgr` SW_RST_CTRL_N                                        |                |                 |                 | all domains
+
+
+## Reset Information
+
+The reset information register is a reflection of the reset state from the perspective of the system.
+In OpenTitan, since there is only 1 host, it is thus from the perspective of the processor.
+This also suggests that if the design had multiple processors, there would need to be multiple such registers.
+
+If a reset does not cause the processor to reset, there is no reason for the reset information to change (this is also why there is a strong security link between the reset of the processor and the rest of the system).
+The following are the currently defined reset reasons and their meaning:
+
+Reset Cause             | Description
+------------------------|---------------
+`POR`                   | Cold boot, the system was reset through POR circuitry.
+`LOW_POWER_EXIT`        | Warm boot, the system was reset through low power exit.
+`NDM_RESET`             | Warm boot, the system was reset through `rv_dm` non-debug-module request.
+`SW_REQ`                | Warm boot, the system was reset through [`RESET_REQ`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#reset_req).
+`HW_REQ`                | Warm boot, the system was reset through peripheral requests.  There may be multiple such requests.
+
+
+The reset info register is write 1 clear.
+It is software's responsibility to clear old reset reasons; the reset manager simply records based on the rules below.
+
+Excluding power on reset, which is always recorded when the device POR circuitry is triggered, the other resets are recorded when authorized by the reset manager.
+Reset manager authorization is based on reset categories as indicated by the power manager.
+The power manager has three reset categories that are mutually exclusive:
+*   No reset has been triggered by pwrmgr.
+*   Low power entry reset has been triggered by pwrmgr.
+*   Software or peripheral reset request has been triggered by pwrmgr.
+
+The reset categories are sent to the reset manager so that it can decide which reason to record when the processor reset is observed.
+Non-debug-module resets are allowed only when no resets have been triggered by pwrmgr.
+
+Since a reset could be motivated by multiple reasons (a security escalation during low power transition for example), the reset information register constantly records all reset causes that it is allowed to record.
+The only case where this is not done is `POR`, where active recording is silenced until the first processor reset release.
+
+Even though four reset causes are labeled as warm boot, their effects on the system are not identical.
+
+*  When the reset cause is `LOW_POWER_EXIT`, it means only the non-always-on domains have been reset.
+   *  Always-on domains retain their pre-low power values.
+*  When the reset cause is `NDM_RESET`, it means only the `rst_sys_n` tree has asserted for all power domains.
+*  When the reset cause is `HW_REQ` or `SW_REQ`, it means everything other than the power / clock / reset managers has been reset.
+
+This behavioral difference may be important to software, as it implies the configuration of the system may need to be different.
+
+## Crash Dump Information
+
+The reset manager manages crash dump information for software debugging across unexpected resets and watchdogs.
+When enabled, the latest alert information and latest CPU information are captured in always-on registers.
+
+When the software resumes after the reset, it is then able to examine the last CPU state or the last set of alert information to understand why the system has reset.
+
+The enable for such debug capture can be locked such that it never captures.
+
+### Alert Information
+
+The alert information register contains the value of the alert crash dump prior to a triggered reset.
+Since this information differs in length between system implementations, the alert information register only displays 32 bits at a time.
+The [`ALERT_INFO_ATTR`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info_attr) register indicates how many 32-bit data segments must be read.
+
+To enable alert crash dump capture, set [`ALERT_INFO_CTRL.EN`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info_ctrl) to 1.
+Once the system has reset, check [`ALERT_INFO_ATTR.CNT_AVAIL`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info_attr) for how many reads need to be done.
+Set [`ALERT_INFO_CTRL.INDEX`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info_ctrl) to the desired segment, and then read the output from [`ALERT_INFO`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#alert_info).
+
+### CPU Information
+
+The CPU information register contains the value of the CPU state prior to a triggered reset.
+Since this information differs in length between system implementations, the CPU information register only displays 32 bits at a time.
+
+For more details on the CPU dump, please see [crash dump](../../rv_core_ibex/README.md#crash-dump-collection).
+
+The [`CPU_INFO_ATTR`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#cpu_info_attr) register indicates how many 32-bit data segments must be read.
+Software then simply needs to write in [`CPU_INFO_CTRL.INDEX`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#cpu_info_ctrl) which segment it wishes and then read out the [`CPU_INFO`](../../../top_earlgrey/ip/rstmgr/data/autogen/rstmgr.hjson#cpu_info) register.
diff --git a/hw/ip/rv_timer/README.md b/hw/ip/rv_timer/README.md
index d45272a144b96..803ce668fcccd 100644
--- a/hw/ip/rv_timer/README.md
+++ b/hw/ip/rv_timer/README.md
@@ -34,216 +34,3 @@ The timer IP provides memory-mapped registers `mtime` and `mtimecmp` which can
 be used as the machine-mode timer registers defined in the RISC-V privileged
 spec. Additional features such as prescaler, step, and a configurable number of
 timers and harts have been added.
-
-# Theory of Operations
-
-## Block Diagram
-
-![Timer Block Diagram](./doc/timer_block_diagram.svg)
-
-The timer module is composed of tick generators, counters, and comparators.
-A tick generator creates a tick every time its internal counter hits the
-[`CFG0.prescaler`](../spi_device/data/spi_device.hjson#cfg0) value. The tick is used to increment `mtime` by the [`CFG0.step`](../spi_device/data/spi_device.hjson#cfg0)
-value. The 64-bit `mtime` value is compared with the 64-bit `mtimecmp`. If
-`mtime` is greater than or equal to `mtimecmp`, the timer raises an interrupt.
-
-## Hardware Interfaces
-
-* [Interface Tables](data/rv_timer.hjson#interfaces)
-
-## Design Details
-
-### Tick Generator
-
-The tick module inside the timer IP is used to generate a fixed period of pulse
-signal. This allows creation of a call-clock timer tick such as 1us or 10us
-regardless of the system clock period. It is useful if the system has more than
-one clock as a clock source. The firmware just needs to adjust the
-[`CFG0.prescaler`](../spi_device/data/spi_device.hjson#cfg0) value and the actual timer interrupt handling routine does not
-need a variable clock period to update `mtimecmp`.
-
-For instance, if a system switches between 48MHz and 200MHz clocks, a prescaler
-value of **47** for 48MHz and **199** for 200MHz will generate a 1us tick.
-In this version, the timer only supports a single fixed clock, so the firmware
-should change [`CFG0.prescaler`](../spi_device/data/spi_device.hjson#cfg0) appropriately.
-
-### Configurable number of timers and harts
-
-The timer IP supports more than one HART and/or more than one timer per hart.
-Each hart has a set of tick generator and counter. It means the timer IP has the
-same number of prescalers, steps, and `mtime` registers as the number of harts.
-
-Each hart can have multiple sets of `mtimecmp`, comparator logic, and expire
-interrupt signals. This version of the IP is fixed to have one Hart and one
-Timer per Hart.
-
-Below is an example configuration file for `N_TIMER` 2 and `N_HARTS` 2.
-It has separate interrupts per timer and a set of interrupt enable and state
-registers per Hart.
-
-```hjson
-{
-  // ...
-  interrupt_list: [
-    { name: "timer_expired_hart0_timer0",
-      desc: "raised if hart0's timer0 expired (mtimecmp >= mtime)"
-    },
-    { name: "timer_expired_hart0_timer1",
-      desc: "raised if hart0's timer1 expired (mtimecmp >= mtime)"
-    },
-    { name: "timer_expired_hart1_timer0",
-      desc: "raised if hart1's timer0 expired (mtimecmp >= mtime)"
-    },
-    { name: "timer_expired_hart1_timer1",
-      desc: "raised if hart1's timer1 expired (mtimecmp >= mtime)"
-    },
-  ],
-  //...
-  registers: {
-    // ...
-    { skipto: "0x100" },
-    { name: "CFG0",
-      desc: "Configuration for Hart 0",
-      swaccess: "rw",
-      hwaccess: "hro",
-      fields: [
-        { bits: "11:0", name: "prescale", desc: "Prescaler to generate tick" },
-        { bits: "23:16", name: "step", resval: "0x1", desc: "Incremental value for each tick" },
-      ],
-    },
-    // ...
-    { multireg: {
-        name: "INTR_ENABLE0",
-        desc: "Interrupt Enable",
-        count: 2,
-        cname: "TIMER",
-        swaccess: "rw",
-        hwaccess: "hro",
-        fields: [
-          { bits: "0", name: "IE", desc: "Interrupt Enable for timer" }
-        ]
-      }
-    },
-    { multireg: {
-        name: "INTR_STATE0",
-        desc: "Interrupt Status",
-        count: 2,
-        cname: "TIMER",
-        swaccess: "ro",
-        hwaccess: "hrw",
-        fields: [
-          { bits: "0", name: "IS", desc: "Interrupt status for timer" }
-        ],
-      }
-    },
-    // ...
-    { skipto: "0x200" },
-    { name: "CFG1",
-      desc: "Configuration for Hart 1",
-      swaccess: "rw",
-      hwaccess: "hro",
-      fields: [
-        { bits: "11:0", name: "prescale", desc: "Prescaler to generate tick" },
-        { bits: "23:16", name: "step", resval: "0x1", desc: "Incremental value for each tick" },
-      ],
-    },
-    // ...
-    { name: "TIMER_V_UPPER1",
-      desc: "Timer value Upper",
-      swaccess: "rw",
-      hwaccess: "hrw",
-      fields: [
-        { bits: "31:0", name: "v", desc: "Timer value [63:32]" },
-      ],
-    },
-    // ...
-}
-```
-
-
-# Programmers Guide
-
-## Initialization
-
-Software is expected to configure `prescaler` and `step` before activating the
-timer. These two fields need to be stable to correctly increment the timer
-value. If software wants to change these fields, it should de-activate the
-timer and then proceed.
-
-## Register Access
-
-The timer IP has 64-bit timer value registers and 64-bit compare registers. The
-register interface, however, is set to 32-bit data width. The CPU cannot access
-64-bit data in a single request. However, when split into two reads, it is
-possible the timer value can increment between the two requests.
-
-The IP doesn't have a latching or blocking mechanism to avoid this issue. It is
-the programmer's responsibility to ensure the correct value is read. For
-instance, if the CPU reads `0xFFFF_FFFF` from lower 32-bit timer value (`mtime`)
-and `0x0000_0001` from upper 32-bit timer value (`mtimeh`), there is a chance
-that rather than having the value `0x1_FFFF_FFFF` the timer value has changed
-from `0x0_FFFF_FFFF` to `0x1_0000_0000` between the two reads. If there is the
-possibility of an interrupt between the two reads then the counter could have
-advanced even more.
-
-This condition can be detected in a standard way using a third read. Figure 10.1
-in the RISC-V unprivileged specification explains how to avoid this.
-
-```asm
-again:
-    rdcycleh  x3
-    rdcycle   x2
-    rdcycleh  x4
-    bne       x3, x4, again
-```
-
-Updating `mtimecmp` register also follows a similar approach to avoid a spurious
-interrupt during the register update. Please refer to the `mtimecmp` section in
-the RISC-V privileged specification.
-
-```asm
-# New comparand is in a1:a0.
-li t0, -1
-sw t0, mtimecmp   # No smaller than old value.
-sw a1, mtimecmp+4 # No smaller than new value.
-sw a0, mtimecmp   # New value.
-```
-
-## Timer behaviour close to 2^64
-
-There are some peculiarities when `mtime` and `mtimecmp` get close to the end of
-the 64-bit integer range. In particular, because an unsigned comparison is done
-between `mtime` and `mtimecmp` care is needed. A couple of cases are:
-
-1. `mtimecmp` close to 0xFFFF_FFFF_FFFF_FFFF. In this case the time-out event
-   will be signaled when `mtime` passes the comparison value, the interrupt will
-   be raised and the source indicated in the corresponding bit of the interrupt
-   status register. However, if there is a delay in servicing the interrupt the
-   `mtime` value could wrap to zero (and continue to increment) so the value
-   read by the interrupt service routine will be less than the comparison value.
-
-2. When the timer is setup to trigger a `timeout` some number of timer ticks
-   into the future, the computation of the comparison value `mtime + timeout`
-   may overflow. If this value is set in `mtimecmp` it would make `mtime`
-   greater than `mtimecmp` and immediately signal an interrupt.
-   A possible solution is to have an intermediate interrupt by setting the
-   `mtimecmp` to 64-bit all-ones, `0xFFFF_FFFF_FFFF_FFFF`. Then the service
-   routine for that interrupt will need to poll `mtime` until it wraps (which
-   could take up to a timer clock tick) before scheduling the required interrupt
-   using the originally computed `mtimecmp` value.
-
-## Interrupt Handling
-
-If `mtime` is greater than or equal to the value of `mtimecmp`, the interrupt is generated from the RV_TIMER module.
-If the core enables the timer interrupt in `MIE` CSR, it jumps into the timer interrupt service routine.
-Clearing the interrupt can be done by writing 1 into the Interrupt Status register [`INTR_STATE0`](../spi_device/data/spi_device.hjson#intr_state0).
-The RV_TIMER module also follows RISC-V Privileged spec that requires the interrupt to be cleared by updating `mtimecmp` memory-mapped CSRs.
-In this case both [`COMPARE_LOWER0_0`](../spi_device/data/spi_device.hjson#compare_lower0_0) and [`COMPARE_UPPER0_0`](../spi_device/data/spi_device.hjson#compare_upper0_0) can clear the interrupt source.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_rv_timer.h)
-
-## Register Table
-
-* [Register Table](data/rv_timer.hjson#registers)
diff --git a/hw/ip/rv_timer/doc/programmers_guide.md b/hw/ip/rv_timer/doc/programmers_guide.md
new file mode 100644
index 0000000000000..e38bddfca85f0
--- /dev/null
+++ b/hw/ip/rv_timer/doc/programmers_guide.md
@@ -0,0 +1,86 @@
+# Programmer's Guide
+
+## Initialization
+
+Software is expected to configure `prescaler` and `step` before activating the
+timer. These two fields need to be stable to correctly increment the timer
+value. If software wants to change these fields, it should de-activate the
+timer and then proceed.
+
+## Register Access
+
+The timer IP has 64-bit timer value registers and 64-bit compare registers. The
+register interface, however, is set to 32-bit data width. The CPU cannot access
+64-bit data in a single request. However, when split into two reads, it is
+possible the timer value can increment between the two requests.
+
+The IP doesn't have a latching or blocking mechanism to avoid this issue. It is
+the programmer's responsibility to ensure the correct value is read. For
+instance, if the CPU reads `0xFFFF_FFFF` from lower 32-bit timer value (`mtime`)
+and `0x0000_0001` from upper 32-bit timer value (`mtimeh`), there is a chance
+that rather than having the value `0x1_FFFF_FFFF` the timer value has changed
+from `0x0_FFFF_FFFF` to `0x1_0000_0000` between the two reads. If there is the
+possibility of an interrupt between the two reads then the counter could have
+advanced even more.
+
+This condition can be detected in a standard way using a third read. Figure 10.1
+in the RISC-V unprivileged specification explains how to avoid this.
+
+```asm
+again:
+    rdcycleh  x3
+    rdcycle   x2
+    rdcycleh  x4
+    bne       x3, x4, again
+```
+
+Updating `mtimecmp` register also follows a similar approach to avoid a spurious
+interrupt during the register update. Please refer to the `mtimecmp` section in
+the RISC-V privileged specification.
+
+```asm
+# New comparand is in a1:a0.
+li t0, -1
+sw t0, mtimecmp   # No smaller than old value.
+sw a1, mtimecmp+4 # No smaller than new value.
+sw a0, mtimecmp   # New value.
+```
+
+## Timer behaviour close to 2^64
+
+There are some peculiarities when `mtime` and `mtimecmp` get close to the end of
+the 64-bit integer range. In particular, because an unsigned comparison is done
+between `mtime` and `mtimecmp` care is needed. A couple of cases are:
+
+1. `mtimecmp` close to 0xFFFF_FFFF_FFFF_FFFF. In this case the time-out event
+   will be signaled when `mtime` passes the comparison value, the interrupt will
+   be raised and the source indicated in the corresponding bit of the interrupt
+   status register. However, if there is a delay in servicing the interrupt the
+   `mtime` value could wrap to zero (and continue to increment) so the value
+   read by the interrupt service routine will be less than the comparison value.
+
+2. When the timer is set up to trigger a `timeout` some number of timer ticks
+   into the future, the computation of the comparison value `mtime + timeout`
+   may overflow. If this value is set in `mtimecmp` it would make `mtime`
+   greater than `mtimecmp` and immediately signal an interrupt.
+   A possible solution is to have an intermediate interrupt by setting the
+   `mtimecmp` to 64-bit all-ones, `0xFFFF_FFFF_FFFF_FFFF`. Then the service
+   routine for that interrupt will need to poll `mtime` until it wraps (which
+   could take up to a timer clock tick) before scheduling the required interrupt
+   using the originally computed `mtimecmp` value.
+
+## Interrupt Handling
+
+If `mtime` is greater than or equal to the value of `mtimecmp`, the interrupt is generated from the RV_TIMER module.
+If the core enables the timer interrupt in `MIE` CSR, it jumps into the timer interrupt service routine.
+Clearing the interrupt can be done by writing 1 into the Interrupt Status register [`INTR_STATE0`](../data/rv_timer.hjson#intr_state0).
+The RV_TIMER module also follows the RISC-V Privileged spec, which requires the interrupt to be cleared by updating the `mtimecmp` memory-mapped CSRs.
+In this case both [`COMPARE_LOWER0_0`](../data/rv_timer.hjson#compare_lower0_0) and [`COMPARE_UPPER0_0`](../data/rv_timer.hjson#compare_upper0_0) can clear the interrupt source.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_rv_timer.h)
+
+## Register Table
+
+* [Register Table](../data/rv_timer.hjson#registers)
diff --git a/hw/ip/rv_timer/doc/theory_of_operation.md b/hw/ip/rv_timer/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..aaae5c6b42a9f
--- /dev/null
+++ b/hw/ip/rv_timer/doc/theory_of_operation.md
@@ -0,0 +1,124 @@
+# Theory of Operation
+
+## Block Diagram
+
+![Timer Block Diagram](../doc/timer_block_diagram.svg)
+
+The timer module is composed of tick generators, counters, and comparators.
+A tick generator creates a tick every time its internal counter hits the
+[`CFG0.prescaler`](../data/rv_timer.hjson#cfg0) value. The tick is used to increment `mtime` by the [`CFG0.step`](../data/rv_timer.hjson#cfg0)
+value. The 64-bit `mtime` value is compared with the 64-bit `mtimecmp`. If
+`mtime` is greater than or equal to `mtimecmp`, the timer raises an interrupt.
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/rv_timer.hjson#interfaces)
+
+## Design Details
+
+### Tick Generator
+
+The tick module inside the timer IP is used to generate a fixed period of pulse
+signal. This allows creation of a wall-clock timer tick such as 1us or 10us
+regardless of the system clock period. It is useful if the system has more than
+one clock as a clock source. The firmware just needs to adjust the
+[`CFG0.prescaler`](../data/rv_timer.hjson#cfg0) value and the actual timer interrupt handling routine does not
+need a variable clock period to update `mtimecmp`.
+
+For instance, if a system switches between 48MHz and 200MHz clocks, a prescaler
+value of **47** for 48MHz and **199** for 200MHz will generate a 1us tick.
+In this version, the timer only supports a single fixed clock, so the firmware
+should change [`CFG0.prescaler`](../data/rv_timer.hjson#cfg0) appropriately.
+
+### Configurable number of timers and harts
+
+The timer IP supports more than one hart and/or more than one timer per hart.
+Each hart has its own tick generator and counter. This means the timer IP has
+as many prescalers, steps, and `mtime` registers as it has harts.
+
+Each hart can have multiple sets of `mtimecmp`, comparator logic, and expire
+interrupt signals. This version of the IP is fixed to have one Hart and one
+Timer per Hart.
+
+Below is an example configuration file for `N_TIMER` 2 and `N_HARTS` 2.
+It has separate interrupts per timer and a set of interrupt enable and state
+registers per Hart.
+
+```hjson
+{
+  // ...
+  interrupt_list: [
+    { name: "timer_expired_hart0_timer0",
+      desc: "raised if hart0's timer0 expired (mtime >= mtimecmp)"
+    },
+    { name: "timer_expired_hart0_timer1",
+      desc: "raised if hart0's timer1 expired (mtime >= mtimecmp)"
+    },
+    { name: "timer_expired_hart1_timer0",
+      desc: "raised if hart1's timer0 expired (mtime >= mtimecmp)"
+    },
+    { name: "timer_expired_hart1_timer1",
+      desc: "raised if hart1's timer1 expired (mtime >= mtimecmp)"
+    },
+  ],
+  //...
+  registers: {
+    // ...
+    { skipto: "0x100" },
+    { name: "CFG0",
+      desc: "Configuration for Hart 0",
+      swaccess: "rw",
+      hwaccess: "hro",
+      fields: [
+        { bits: "11:0", name: "prescale", desc: "Prescaler to generate tick" },
+        { bits: "23:16", name: "step", resval: "0x1", desc: "Incremental value for each tick" },
+      ],
+    },
+    // ...
+    { multireg: {
+        name: "INTR_ENABLE0",
+        desc: "Interrupt Enable",
+        count: 2,
+        cname: "TIMER",
+        swaccess: "rw",
+        hwaccess: "hro",
+        fields: [
+          { bits: "0", name: "IE", desc: "Interrupt Enable for timer" }
+        ]
+      }
+    },
+    { multireg: {
+        name: "INTR_STATE0",
+        desc: "Interrupt Status",
+        count: 2,
+        cname: "TIMER",
+        swaccess: "ro",
+        hwaccess: "hrw",
+        fields: [
+          { bits: "0", name: "IS", desc: "Interrupt status for timer" }
+        ],
+      }
+    },
+    // ...
+    { skipto: "0x200" },
+    { name: "CFG1",
+      desc: "Configuration for Hart 1",
+      swaccess: "rw",
+      hwaccess: "hro",
+      fields: [
+        { bits: "11:0", name: "prescale", desc: "Prescaler to generate tick" },
+        { bits: "23:16", name: "step", resval: "0x1", desc: "Incremental value for each tick" },
+      ],
+    },
+    // ...
+    { name: "TIMER_V_UPPER1",
+      desc: "Timer value Upper",
+      swaccess: "rw",
+      hwaccess: "hrw",
+      fields: [
+        { bits: "31:0", name: "v", desc: "Timer value [63:32]" },
+      ],
+    },
+    // ...
+}
+```
diff --git a/hw/ip/spi_device/README.md b/hw/ip/spi_device/README.md
index 75f79dfdae606..4285c14233f48 100644
--- a/hw/ip/spi_device/README.md
+++ b/hw/ip/spi_device/README.md
@@ -109,849 +109,3 @@ The TPM submodule conforms to the [TPM over SPI 2.0][] specification. The TPM op
 
 [TPM over SPI 2.0]: https://trustedcomputinggroup.org/wp-content/uploads/Trusted-Platform-Module-Library-Family-2.0-Level-00-Revision-1.59_pub.zip
 [TPM PCCP]: https://trustedcomputinggroup.org/resource/pc-client-platform-tpm-profile-ptp-specification/
-
-# Theory of Operations
-
-## Block Diagram
-
-![Block Diagram](./doc/block_diagram.svg)
-
-In Generic mode, the incoming data is stored byte-based into an asynchronous FIFO.
-The logic inside the generic mode then updates the DPSRAM RX space.
-The logic also reads data from the DPSRAM then pushes out to the SPI MISO line.
-
-The Generic mode uses the entire DPSRAM space exclusively.
-The TX/RX size in the DPSRAM can be changed by compile-time parameters.
-
-When Flash mode is selected, the command parser accepts the first byte of the SPI MOSI line then activates the flash submodules, such as Status, JEDEC, Read command, and Upload function.
-The Status logic processes the three Read Status commands.
-The SW may configure three bytes of the Flash Status CSR then the Status submodule returns the CSR data into the SPI MISO line.
-The SW may configure the Read Status commands' opcodes.
-
-The JEDEC submodule returns the JEDEC Manufacturer ID followed by the additional information.
-The Manufacturer ID may vary depending on the company.
-For example, lowRISC JEDEC ID `EFh` follows twelve bytes of 7Fh Continuous Codes, requiring a total thirteen bytes for the manufacturer ID.
-The SW may configure how many Continuous Codes is needed and the actual manufacturer ID.
-
-The Read submodule processes the Read SFDP (Serial Flash Discoverable Parameters) command, and up to six different types of the read commands.
-The read submodule receives address information from the SPI transaction, fetches the data from the read buffer in the DPSRAM, and returns the data on SPI lines (single, dual, quad lines).
-If the received address falls into the SW programmable mailbox address space, the logic fetches data not from the read buffer but from the mailbox buffer in the DPSRAM.
-
-SW may configure command information slots to upload the command into the FIFOs and the payload buffer in the DPSRAM.
-SW may additionally let HW to set the BUSY bit in the Status register when the HW uploads the command.
-
-In Passthrough mode, the logic filters the incoming transaction if the transaction is not permitted.
-The SW may configure the logic to change a portion of the address or first 4 bytes of the payload.
-
-## Hardware Interfaces
-
-* [Interface Tables](data/spi_device.hjson#interfaces)
-
-The TPM submodule requires a separate input port for CS#.
-The TPM submodule and other SPI Device modes are able to be active together.
-The host system distinguishes between the TPM transactions and the other SPI transactions using separate CS# ports.
-Even though both submodules are able to be active, the host system cannot issue a TPM command and a SPI transaction at the same time due to the SPI IO lines being shared.
-
-The TPM has no write FIFO interrupt.
-As TPM transactions are not bigger than 4B in current usage case, the waiting time of the core is not a concern.
-The core takes multiple cycles to pop a byte from the write FIFO due to the slower peripheral clock and multiple CDC paths.
-The gain of having write FIFO interrupt is not great.
-
-## SPI Device Generic mode
-
-![Generic Mode Block Diagram](./doc/generic-blockdiagram.svg)
-
-The block diagram above shows how the SPI Device generic mode converts incoming
-bit-serialized SDI data into a valid byte, where the data bit is valid when the
-chip select signal (CSB) is 0 (active low) and SCK is at positive or negative
-edge (configurable, henceforth called the "active edge"). The bit order within
-the byte is determined by [`CFG.rx_order`](data/spi_device.hjson#cfg) configuration register field. After a
-byte is gathered, the interface module writes the byte data into a small FIFO
-("RXFIFO") using SCK. It is read out of the FIFO and written into to the
-buffer SRAM ("DP_SRAM") using the system bus clock. If RXFIFO is full, this is
-an error condition and the interface module discards the byte.
-
-The interface module also serializes data from the small transmit FIFO
-("TXFIFO") and shifts it out on the SDO pin when CSB is 0 and SCK is at the
-active edge. The bit order within the byte can be configured with configuration
-register field [`CFG.tx_order`](data/spi_device.hjson#cfg). It is expected that software has prepared TX data
-based on the description in the "Defining
-Firmware Operation Mode" section below. Since SCK is not under the control of
-software or the device (it is driven by the external SPI host), it is possible
-that there is no data ready in the TXFIFO when chip select becomes active and
-the interface needs to send data on the SDO pin. Either software has not
-prepared TX data or software does not care about the contents of the TX data -
-then the hardware will send whatever lingering data is in the empty TXFIFO. If
-this is a functional issue, then software should at least soft-reset the contents
-of the TXFIFO using the [`CONTROL.rst_txfifo`](data/spi_device.hjson#control) register. The soft-reset signal
-is not synchronized to the SCK clock, so software should drive the reset
-signal when the SPI interface is idle.
-
-### General Data Transfer on Pins
-
-Data transfers with the SPI device module involve four peripheral SPI pins: SCK,
-CSB, SDI, SDO. SCK is the SPI clock driven by an external SPI host. CSB (chip
-select bar) is an active low enable signal that frames a transfer, driven by the
-external host. Transfers with active SCK edges but inactive (high) CSB are
-ignored. Data is driven into the SPI device on the SDI pin ("Serial Data
-In", though we're otherwise using host/device terminology) and driven out on
-SDO. Any transfer length is legal, though higher level protocols typically
-assume word width boundaries. See details on protocols and transfers that
-follow. The diagram below shows a typical transfer, here for 8 bytes (64 cycles,
-showing the beginning and end of the transfer). Configurability for active
-edges, polarities, and bit orders are described later.
-
-```wavejson
-{ signal: [
-  { name: 'CSB',  wave: '10.........|....1.'},
-  { name: 'SCK',  wave: '0.p........|....l.'},
-  { name: 'SDI',  wave: 'z.=..=.=.=.=.=.=.=.=.=|=.=.=.=.z....',
-    data:['R07','R06','R05','R04','R03','R02','R01','R00','R17',
-          '','R73','R72','R71','R70'], period:0.5, },
-  { name: 'SDO',  wave: 'z.=..=.=.=.=.=.=.=.=.=|=.=.=.=.z....',
-    data:['T07','T06','T05','T04','T03','T02','T01','T00','T17',
-          '','T73','T72','T71','T70'], period:0.5}],
-  head:{
-    text: 'Data Transfer',
-    tick: ['-2 -1 0 1 2 3 4 5 6 7 8 9 60 61 62 63     ']
-  }
-}
-```
-
-
-### Defining "Firmware Operation Mode"
-
-Firmware operation mode, as implemented by this SPI device, is used to bulk copy data in
-and out of the chip using the pins as shown above. In general, it is used to
-load firmware into the chip, but can be used for any data transfer into or out
-of the chip. The transfers are "generic" in the sense that there is no
-addressing or overarching protocol involved. Data transferred into the chip goes into a SPI Device
-circular buffer implemented in an SRAM, and firmware decides what to do with the
-data. Data transferred out of the chip comes out of a circular buffer in an
-SRAM. Software can build any number of higher level protocols on top of this
-basic mechanism. All transfers are by definition full duplex: whenever an active
-SCK edge is received, a bit of RX data is latched into the peripheral, and a bit
-of TX data is sent out of the peripheral. If transfers only require
-unidirectional movement of data, the other direction can be ignored but will
-still be active. For instance, if only receive data is needed in the transfer,
-the device will still be transmitting data out on the TX ("SDO") pin.
-
-### SPI Generic Protocol
-
-The primary protocol considered is one used by an external SPI host to send
-chunks of firmware data into the device in the receive direction, confirming the
-contents with an echo back of a hash of the received data in the transmit
-direction. This is generally termed the 'SPI Generic' protocol, since SPI is used to
-send firmware into device memory, brokered by software confirming integrity
-of the received firmware data. This special case will be described first, and
-then a generic understanding of how firmware mode operates will follow.
-
-The following diagram shows the expected data transfer in SPI Generic mode.
-
-![data transfer in SPI Device](./doc/data_transfer.svg)
-
-In this diagram, bursts of data transfer are shown as "pages" of firmware
-content being driven into the device. The size of the page is not relevant,
-though it must be less than the size of the internal SPI Device SRAM. Typically
-the SRAM is divided in half for RX and TX buffers, but the boundary is
-configurable. The total size of RX and TX buffer must fit in the SPI device
-SRAM. Since the external SPI Host is in charge of the clock (SCK), it controls
-all aspects of the transfer, including the size of the page. But it is done in
-coordination with software running on the device that manages the higher level
-protocol.
-
-The protocol assumes that for each page written into the device, a response will
-be prepared for the next page. But since the SPI Device is always transmitting
-during every received page, the first transmitted page can be ignored. After the
-first page is received, software will get alerted as to its completion (via an
-RX interrupt), and will execute whatever integrity check is required on that
-data. It can then prepare its response to page zero by writing into the SPI
-Device TX buffer. What it writes into the TX buffer the concern of the
-higher level protocol. It could be a "good" indication, a full echo of the RX
-data, or a hash of the received contents. The decision is not in scope for this
-specification.
-
-Clearly there is a potential race condition here as a new page could begin to be
-received before software has prepared the transmit response to page zero
-(including the time to read data out of the SRAM), but that is a condition that
-the higher level protocol must prepare for. That protocol is not in scope for
-this document, but some hints to its implementation are given in the
-programmers guide section below.
-
-The transfer continues until all received data is taken in, and responded back.
-In this protocol the last "received" page of data is a "don't care" as long
-as the response is transmitted successfully.
-
-### Firmware Operation Mode
-
-Taking this example as a guide, we can see the general method of the SPI
-Firmware Operation Mode. On every active SCK clock edge, data is received from the SDI
-pin into the SPI device, and data is transmitted on the SDO pin. Received data
-is gathered into bytes and written into the RX circular buffer in the SPI Device
-SRAM as it is accumulated. Whatever data exists in the TX circular buffer is
-serialized and transmitted. Transfers are framed using the active low chip
-select pin SCB. What happens when data arrives and the RX circular buffer is
-full, or when the transmitter encounters an empty TX circular buffer are
-error conditions discussed in the Design Details section that follows.
-
-### RXFIFO, TXFIFO, and DP_SRAM
-
-The relationship between the Dual Port SRAM (DP_SRAM) and the RX and TXFIFOs
-should be explained. The SRAM is divided into a section for the transmit
-direction, named TXF, and a section for the receive direction, named RXF. Each
-section has its own read and write pointer. The SRAM may be read and written by
-software at any time, but for correct normal operation it will only write the
-empty area of the TXF (between the write pointer and read pointer) and only read
-the full area of the RXF (between the read pointer and write pointer) with the
-other areas used by the hardware. It is first worth noting that the hardware
-implications of the asynchronous nature of SCK and the fact it may not be free
-running, complicate some of the logic. The full feature set of that interface
-logic (clocked by SCK) includes the serial to parallel converter for RX data,
-the parallel-to-serial converter for TX data, and the interfaces to RXFIFO and
-TXFIFO. Before the first bit transfer and after the last SCK is stopped,
-there is no clock for any of this logic.  So for instance there is no guarantee
-of the two-clock-edges normally required for asynchronous handshaking protocols.
-The RXFIFO and TXFIFO exist to facilitate this situation.
-
-In the receive direction, data gathered from the SDI pin is written into the
-RXFIFO (see details below) at appropriate size boundaries. This data is
-handshake-received on the core clock side, gathered into byte or word quantity,
-and written into the RX circular buffer of the dual-port SRAM. On each write,
-the RXF write pointer ([`RXF_PTR.wptr`](data/spi_device.hjson#rxf_ptr)) is incremented by hardware, wrapping at
-the size of the circular buffer. Software can watch (via polling or interrupts)
-the incrementing of this write pointer to determine how much valid data has been
-received, and determine when and what data to act upon. Once it has acted upon
-data, the software should update the RXF read pointer to indicate that space in
-the SRAM is available for future writes by the hardware. If incrementing the
-write pointer would result in it becoming equal to the read pointer then the RXF
-is full and any subsequently received data will be discarded. Thus in normal
-operation, the RXF write pointer is updated automatically by hardware and the RXF
-read pointer is managed by software. As an optimization the hardware will
-normally only write to the 32-bit wide SRAM when an entire word can be written.
-Since the end of the received data may not be aligned, there is a timer that
-forces sub-word writes if data has been staged for too long. The timer value
-([`CFG.timer_v`](data/spi_device.hjson#cfg)) represents the number of core clock cycles. For instance, if
-the timer value is configured to 0xFF, the RXF control logic will write the gathered
-sub-word data after 255 cycles if no further bits are received from SPI.
-
-In the transmit direction, things are a little more tricky. Since the pin
-interface logic begins transmitting data on its very first SCK edge, there are
-no previous clock edges in the interface side of the fifo to allow an empty flag
-to be updated. The interface  must *blindly* take whatever data is at the
-read pointer of the TXFIFO (in a typical asynchronous FIFO with free-running
-clocks the pointers can always be sent across the asynchronous boundary to
-determine if the FIFO is truly empty or not). Hence the need to potentially send
-out garbage data if software has not prepared the TXFIFO in time.
-
-The software writes data that it wants to transmit into the TXF circular buffer
-of the DP_SRAM buffer. It then passes the data to the hardware by moving the TXF
-write pointer to point to the next location after the data (this is the location
-it will use to start the data for the next transmission). Hardware that manages
-the TXFIFO detects the change in TXF write pointer and begins reading from the
-SRAM and prefilling the TXFIFO until it is full or until all valid TXF data has
-been read. This prepares the TXFIFO with the desired data for when the next SCK
-data arrives. As the SCK domain logic pulls data out of the TXFIFO to transmit
-on the SDO pin, that TXFIFO read is detected (after synchronization to the core
-clock domain) and potentially another word of data is read from the SRAM and
-written into the TXFIFO. Each time the SRAM is read the hardware increments the
-TXF read pointer making the space available to software. Like above, though
-conversely, in normal operation the TXF write pointer is managed completely by
-software and the TXF read pointer is incremented by hardware.
-
-All reads and writes to/from the SRAM for RXF and TXF activity are managed by
-direct reads and writes through the TLUL bus interface, managed by the
-auto-generated register file control logic.
-
-## SPI Flash and Passthrough Modes
-
-### Command Information List
-
-The SW may configure the map from the received opcode to the command process module by programming *cmd_info* list.
-Current SPI_DEVICE provides 24 command information entries.
-Each entry represents a command.
-Details of the fields are explained in the [`CMD_INFO_0`](data/spi_device.hjson#cmd_info_0) register description.
-
-First 11 commands are assigned to specific submodules.
-
-Index  | Assigned Submodule
--------|--------------------
-[2:0]  | Read Status
-[3]    | Read JEDEC ID
-[4]    | Read SFDP
-[10:5] | Read commands
-
-If the IP is in flash mode or in passthrough mode with [`INTERCEPT_EN`](data/spi_device.hjson#intercept_en) set, fields other than *opcode* and *valid* in the command information entries are ignored for Read Status and Read JEDEC ID commands.
-The submodules directly return data on the MISO line (SD[1]).
-In Passthrough mode, if Read Status and Read JEDEC ID commands are intercepted by the internal HW, the other fields in the command information entries are ignored also.
-
-The main use of the fields other than *opcode* and *valid* is to control the output enable in the passthrough logic.
-See [Output Enable Control](#output-enable-control) section for more.
-
-*upload* and *busy* fields are used in the SPI Flash/ Passthrough modes.
-See [Command Upload](#command-upload) section for details.
-
-### Command Parser
-
-![Command Parser block](./doc/cmdparse.svg)
-
-Command parser (*cmdparse*) processes the first byte of the SPI and activates the processing submodules depending on the received opcode and the *cmd_info* list described in the previous section.
-
-The cmdparse compares the received opcode with the *cmd_info.opcode* data structure.
-If any entry matches to the received opcode, the cmdparse hands over the matched command information entry with the index to the corresponding submodule.
-As explained in the [previous section](#command-information-list), the command parser checks the index to activate Read Status / Read JEDEC ID/ Read Command / Address 4B modules.
-Other than the first 11 slots and last two slots (the last two slots are not visible to SW), the cmdparse checks the *upload* field and activates the upload module if the field is set.
-
-SW can configure whether a submodule should process the command while in the passthrough mode by setting the [`INTERCEPT_EN`](data/spi_device.hjson#intercept_en) CSR.
-
-### Status Control
-
-If the received command is one of the three read status commands, STATUS control module takes over the SPI interface after the opcode.
-The 3-byte status register is not reset by CSb.
-Except for the BUSY and WEL bits, all bits are controlled by SW.
-
-BUSY bit is set by HW when it receives any commands that are uploaded to the FIFOs and their `busy` fields are 1 in the command information entry.
-SW may clear the BUSY bit when it completes the received commands (e.g., Erase/Program).
-
-If BUSY is set, SPI_DEVICE IP blocks the passthrough interface in Passthrough mode.
-The blocking of the interface occurs in SPI transaction idle state (CSb == 1).
-When SW clears the BUSY bit, it is applied to the STATUS register in the SPI clock domain when SPI clock toggles.
-It means the update happens when the next SPI transaction is received.
-The BUSY bit in the CSR is the synchronized value of the STATUS BUSY bit in the SPI clock domain.
-Due to the CDC latency, SW may see the updated value (BUSY clear) only after a long delay.
-
-WEL bit can be controlled by SW and also by HW.
-HW updates WEL bit when it receives WREN(06h) or WRDI(04h) commands.
-The opcode can be configured via [`CMD_INFO_WREN`](data/spi_device.hjson#cmd_info_wren) and [`CMD_INFO_WRDI`](data/spi_device.hjson#cmd_info_wrdi).
-
-The SW update of the STATUS register via [`FLASH_STATUS`](data/spi_device.hjson#flash_status) is not instantaneous.
-The IP stores the SW request into the asynchronous FIFO then the request is processed in the SPI clock domain.
-The request updates a temporary status register, which is referred to as the staged register in the design.
-The staged registers are latched into the committed registers when CSb is released.
-SW sees the committed registers when reading the [`FLASH_STATUS`](data/spi_device.hjson#flash_status) CSR.
-
-The attached host system also reads back the committed registers via Read Status commands.
-This scheme is to guarantee the atomicity of the STATUS register.
-
-If the host sends the Write Status commands, the commands are not processed in this module.
-SW must configure the remaining command information entries to upload the Write Status commands to the FIFOs.
-
-### JEDEC ID Control
-
-JEDEC module returns JEDEC Device ID and Manufacturer ID following the Continuation Code (CC).
-SW may configure [`JEDEC_CC`](data/spi_device.hjson#jedec_cc) CSR for HW to return proper CC.
-The *cc* field in [`JEDEC_CC`](data/spi_device.hjson#jedec_cc) defines the return value, which is `0x7F` by default.
-*num_cc* defines how many times the HW sends the CC byte before sending the JEDEC ID.
-
-The actual JEDEC ID consists of one byte manufacturer ID and two bytes device ID.
-The HW sends the manufacturer ID first, then `[7:0]` of the device ID then `[15:8]` byte.
-
-### Serial Flash Discoverable Parameters (SFDP) Control
-
-HW parses the SFDP command, then fetches the data from the SFDP space in the DPSRAM.
-HW provides 256B SFDP space.
-HW uses lower 8bit of the received 24 bit address to access the DPSRAM.
-Upper 16 bits are ignored (aliased).
-SW should prepare proper SFDP contents before the host system issues SFDP commands.
-
-HW fetches from the DPSRAM in 4B and returns the data to the SPI line.
-HW repeats the operation until CSb is de-asserted.
-
-### Read Command Processor
-
-The read command block has multiple sub-blocks to process normal Read, Fast Read, Fast Read Dual/ Quad from the internal DPSRAM.
-The DPSRAM has a 2kB region for the read command access.
-The read command region has two 1kB buffers.
-If the HW receives a read access to the other half of the space for the first time, then the HW reports to the SW to refill the current 1kB region with new content.
-
-The double buffering scheme aids the SW to prepare the next chunk of data.
-SW copies a portion of data (1kB) from the internal flash memory into SPI_DEVICE DPSRAM.
-From the host system's point of view, the double buffering scheme makes the emulated SPI device appear as a storage device larger than 2kB.
-The assumption is that the host system reads mostly sequentially.
-
-#### Address Handling
-
-For read commands such as Normal Read, Fast Read {Single/ Dual/ Quad} Output commands, the address comes through IO[0] only.
-The state machine in this block shifts the address one-by-one and decrements the address counter register by 1.
-
-When it reaches the 4B address (`addr[2]`), the module triggers the DPSRAM state machine to fetch data from the DPSRAM.
-When the module receives `addr[0]`, at the positive edge of SCK, the module moves to appropriate command state based on the given CMD_INFO data.
-
-If the received address falls into mailbox address range and mailbox feature is enabled, the module turns on the mailbox selection bit.
-Then all out-going requests to the DPSRAM are forwarded to the mailbox section, not the read buffer section.
-
-#### Dummy Cycle
-
-The SW may configure the dummy cycle field for each individual read command.
-The default dummy cycle value for those commands is 7 (0-based).
-The value is the number of dummy cycles minus one.
-For example, if SW programs the dummy cycle for Fast Read Quad to `3h`, the module waits 4 cycles then returns data.
-
-#### Buffer Management
-
-![Read Buffer Management](./doc/buffer-management.svg)
-
-The SPI Device IP uses the first half of the DPSRAM as a read buffer when the SPI mode is flash or passthrough mode.
-The IP returns data from the read buffer based on the given address in the received read command.
-In the current version, the read buffer size is 2kB.
-The IP only uses lower 11 bits of the received read command address (`addr[10:0]`) to issue the read requests to the DPSRAM.
-
-SW is responsible for updating the read buffer contents.
-The HW notifies the SW to update the buffer contents when needed.
-The HW provides a SW configurable read watermark CSR and read-only [`LAST_READ_ADDR`](data/spi_device.hjson#last_read_addr) CSR.
-The **LAST_READ_ADDR** shows the last read address of the recent read command.
-For instance, if the host system issues a read command to address `0xABCD_E000` and reads 128 (or 0x80) bytes, the **LAST_READ_ADDR** after the transaction will show `0xABCD_E07F`.
-It does not show the commands falling into the mailbox region or Read SFDP command's address.
-
-The read watermark address width is 1 bit smaller than the read buffer address.
-In the current version, the register has 10-bit width.
-The HW assumes the SW maintains the read buffer as a double buffer scheme.
-When the host system accesses one buffer (1kB), the SW prepares another 1kB by copying data from the internal non-volatile memory.
-If the received read address crosses the SW configured watermark address, the HW informs the SW.
-SW may configure the watermark CSR low enough so that the SW has enough time to copy over the data.
-
-If a new read command crosses the current buffer boundary, the SW flips the internal buffer index bit and clears the cross event for the HW to detect the address cross event again.
-
-### 4B Address Management (EN4B/ EX4B)
-
-SW may configure the HW to receive EN4B and EX4B commands and change the read command address size between 3 bytes and 4 bytes.
-For the IP to recognize EN4B/ EX4B commands, SW should configure [`CMD_INFO_EN4B`](data/spi_device.hjson#cmd_info_en4b) and [`CMD_INFO_EX4B`](data/spi_device.hjson#cmd_info_ex4b).
-
-The two CSRs omit unnecessary fields from the **CMD_INFO** data structure.
-The HW logic creates the default **CMD_INFO** structures for the two commands.
-The command parser module uses the generated structures to process and trigger the 4B management module.
-
-When the HW receives one of the commands, the HW changes the broadcast signal *cfg_addr_4b_en*.
-Also the HW updates [`CFG.addr_4b_en`](data/spi_device.hjson#cfg) after passing through CDC.
-It takes at most three SYS_CLK cycles to update the value in the *CFG* register after the completion of the SPI transaction (CSb de-assertion).
-
-_Note: The HW changes the broadcasting signal and the CSR even though the SPI host system sends more than 8 beats of the SPI S[0].
-After the logic matches the received command byte with EN4B/ EX4B, the logic ignores the rest of the SPI data._
-
-The broadcasted `cfg_addr_4b_en` signal affects the read commands whose `addr_mode` is *AddrCfg* in their command information entries.
-
-### Command Upload
-
-If the received command meets following conditions, the HW stores the command into the command/ address FIFOs and the payload buffer:
-
-- The command matches neither the first 11 command information entries nor EN4B/ EX4B.
-- The command matches any of the remaining command information entries.
-- The matched entry has the `upload` field set.
-
-The upload module checks the command information entry to determine whether the address/ payload fields are to be uploaded or not.
-The `addr_mode` is used to determine the address size in the command.
-
-If `busy` field in the command information entry is set, the upload module also sets *BUSY* bit in the *STATUS* register.
-SW may clear the *BUSY* bit after processing the command.
-
-The upload module provides [`UPLOAD_STATUS`](data/spi_device.hjson#upload_status) and [`UPLOAD_STATUS2`](data/spi_device.hjson#upload_status2) CSRs for SW to parse the command, address, and payload.
-If a received command has payload, SW may read the payload from the Payload buffer starting from `payload_start_idx` address.
-In normal case, `payload_start_idx` in [`UPLOAD_STATUS2`](data/spi_device.hjson#upload_status2) shows **0**.
-In error case of the host sending more than the maximum allowed payload size (256B in the current version), the `payload_start_idx` may not be 0.
-It is expected that the `payload_depth` is maximum payload size, 256B if `payload_start_idx` is non-zero.
-In this scenario, SW should read from `payload_start_idx` to the end of the payload buffer, then do a second read from the beginning of the buffer for the remaining bytes.
-
-If the error case above happens, the IP reports the event through the `payload_overflow` interrupt.
-
-### Passthrough
-
-The passthrough module controls the data between a host system and the attached downstream SPI flash device.
-It snoops the SPI transactions and intervenes if the transaction is not permitted.
-The module also manipulates the data if needed.
-
-#### Command Filtering
-
-Filtering the incoming command is the key role of the Passthrough module.
-
-![Command Filtering logic in Passthrough mode](./doc/passthrough-filter.svg)
-
-```wavejson
-{ signal: [
-  { name: 'CSb_in',  wave: '10.........|....1.'},
-  { name: 'SCK_in',  wave: '0.p........|....l.'},
-  { name: 'IO[0]_i',  wave: 'z.=..=.=.=.=.=.=.=.=|=.=.=.=.z......',
-   data:["C[7]", "C[6]", "C[5]", "C[4]", "C[3]", "C[2]", "C[1]", "C[0]"],
-    period:0.5, },
-  { name: 'filter',  wave: '0................10.................',
-    period:0.5},
-  { name: 'filtered', wave: '0.................1.................',
-    period:0.5},
-  { name: 'SCK_out', wave: '0.p......0........'},
-  { name: 'CSb_out', wave: '10................1.................', period:0.5}
-  ],
-  head:{
-    text: 'Command Filtering',
-    tick: ['-2 -1 0 n-1 n+' ]
-  }
-}
-```
-
-The passthrough logic filters the command based on the 256 bits of the [`CMD_FILTER_0`](data/spi_device.hjson#cmd_filter_0) CSR.
-Each bit corresponds to each opcode.
-For example, if bit 5 of [`CMD_FILTER_0`](data/spi_device.hjson#cmd_filter_0) is set, the passthrough de-asserts the downstream **CSb** (drives `CSb_out` high) when it receives the `05h` SPI command.
-
-The SW does not know whether a SPI transaction is filtered or not.
-If the SW wants to check, it needs to set the _upload_ field with the opcode in the command information list.
-Then, the HW uploads the command into the command/ address FIFOs and the payload buffer.
-
-#### Address Manipulation
-
-SW may configure the passthrough logic to swap certain address bits to desired values by configuring [`ADDR_SWAP_MASK`](data/spi_device.hjson#addr_swap_mask) and [`ADDR_SWAP_DATA`](data/spi_device.hjson#addr_swap_data) CSRs.
-The address translation takes effect only when the received command is in the command information list and the *addr_swap_en* field in the entry is set.
-
-For instance, the passthrough logic sets bit 20 of the address to 1 if [`ADDR_SWAP_MASK`](data/spi_device.hjson#addr_swap_mask) is `0x0010_0000` and [`ADDR_SWAP_DATA`](data/spi_device.hjson#addr_swap_data) is `0x0010_0000`.
-
-#### Write Status Data Manipulation
-
-The passthrough logic also provides a way to change the first 4 bytes of the payload to the downstream SPI flash device on the fly, in the same way as the address.
-The main use of this feature is to protect the Status register.
-
-SW may configure the [`PAYLOAD_SWAP_MASK`](data/spi_device.hjson#payload_swap_mask) and [`PAYLOAD_SWAP_DATA`](data/spi_device.hjson#payload_swap_data) CSRs to change specific bits of the first 4 bytes of the write payload.
-For example, [`PAYLOAD_SWAP_MASK`](data/spi_device.hjson#payload_swap_mask) as `32'h 0000_0023` and [`PAYLOAD_SWAP_DATA`](data/spi_device.hjson#payload_swap_data) as `32'h 0000_0022` change bit 0 to 0, bit 1 to 1, bit 5 to 1 in the first byte payload.
-
-The CSRs are little-endian (LE).
-The passthrough module consumes the lower byte first as SPI flash writes byte 0 first followed by byte 1.
-For example, bit `[7:0]` is processed then `[15:8]`, `[23:16]`, and `[31:24]` at last.
-
-The CSRs affect the commands that have *payload_swap_en* as 1 in their command list entries.
-SW may use additional command information slots for the passthrough (index 11 to 23).
-SW must configure *payload_dir* to **PayloadIn** and *payload_en* to `4'b 0001` in order for the payload translation feature to work correctly.
-
-#### Output Enable Control
-
-Passthrough module controls the output enable signals on both host and downstream sides.
-Controlling the output enable ports is critical to not overdrive the PAD directions.
-The information of the pad enable and direction is given by SW.
-SW configures the address size, payload lanes, dummy size in **CMD_INFO** slots.
-
-If passthrough logic does not find valid command information entry based on the received opcode, it assumes the command is **PayloadIn** Single IO command.
-SW is recommended to set the filter bit for Passthrough to not deliver the unmatched command to the downstream flash device.
-
-#### Internally processed Commands
-
-As described in [SPI Device Modes](#spi-device-modes-and-active-submodules), SPI_DEVICE may return the data from the IP even if the passthrough mode is set.
-The HW can process Read Status, Read JEDEC ID, Read SFDP, Read commands accessing the mailbox region, and EN4B/EX4B.
-
-SW configures [`INTERCEPT_EN`](data/spi_device.hjson#intercept_en) CSR to enable the feature.
-SW may selectively enable/disable commands.
-For example, HW returns only Read Status data internally if [`INTERCEPT_EN`](data/spi_device.hjson#intercept_en) is `{status: 1'b 1, default: 1'b 0}`.
-
-Other than Read command accessing mailbox space, it is recommended to filter the intercepted commands.
-
-## TPM over SPI
-
-![TPM over SPI block diagram](./doc/tpm-blockdiagram.svg)
-
-The TPM over SPI submodule processes the low level data only.
-The TPM submodule parses the incoming SPI MOSI line and stacks the stream up to the SW accessible registers, such as TPM_CMD_ADDR, and TPM_WRITE_FIFO.
-The SW must decode the command and the address.
-Then the SW reads the data from the write FIFO or pushes data into the read FIFO depending on the command.
-
-The TPM submodule returns appropriate data for read commands depending on the current read FIFO status, the received address, and the Locality.
-The module sends bytes from the return-by-HW registers to the parallel-to-serial logic right after the address phase when the received address falls into the HW managed registers.
-
-The TPM specification mandates the TPM module to return the data right after the address phase or send the WAIT at the last bit of the address phase.
-The address of the return-by-HW registers has a 4B boundary.
-The TPM submodule has enough time to determine if the incoming address falls into the return-by-HW registers or not.
-As the logic decides if the HW returns data or waits for the SW response at the address[2] bit phase, the logic always sends `WAIT(0x00)` at the last byte of the incoming address phase.
-The module sends `START(0x01)` at the next byte followed by the actual return-by-HW value if the received address falls into the list of the return-by-HW registers.
-
-The module, by default, returns `WAIT` when the address does not fall into the return-by-HW register address.
-In the wait state, the TPM submodule watches the read FIFO status.
-The module stays in the wait state until the read FIFO has the data >= requested transfer size.
-The module sends `START` at the next byte when the read FIFO has enough data.
-Then the module pops data from the read FIFO and sends the data over SPI.
-
-The TPM submodule accepts the payload for the TPM write command without the `WAIT` state if the write FIFO is empty.
-Otherwise, the TPM submodule sends `WAIT` until the write FIFO becomes available (empty).
-
-### Configuring Return-by-HW registers
-
-The return-by-HW register values come from the SW read-writable CSRs.
-The module latches the CSRs from the SYS_CLK domain into the SPI SCK domain when CSb is asserted.
-The SW is allowed to modify the return-by-HW registers when CSb is not active.
-
-The [TCG PC Client Platform TPM Profile][TPM PCCP] spec describes in the section 6 that the TPM device returns registers values based on the received locality (address[15:12]) and the `TPM_ACCESS_x.activeLocality`.
-The HW uses `TPM_ACCESS_x.activeLocality` and the address bits 15:12 to determine what value the logic should return.
-If the `invalid_locality` configuration is set, the logic returns the `INVALID` value to the host system when the host system sends a read request to a locality greater than 4.
-If the request is in the supported locality (0-4), the logic checks `TPM_ACCESS_x.activeLocality` then returns data based on the table 39 in the spec for Return-by-HW registers.
-Other registers in the table should be processed by SW.
-
-## Detecting Reliability Errors
-
-This version of the SPI_DEVICE IP implements the parity to detect bit flip errors on the internal SRAM.
-The HW checks the parity error when the SW reads data from the SRAM.
-The error is reported to the SW via TL D channel error signal.
-SW is recommended to discard the current context if any transaction is ongoing then to reset the IP.
-
-# Design Details
-
-## Clock and Phase
-
-The SPI device module has two programmable register bits to control the SPI clock, [`CFG.CPOL`](data/spi_device.hjson#cfg) and [`CFG.CPHA`](data/spi_device.hjson#cfg).
-CPOL controls clock polarity and CPHA controls the clock phase.
-For further details, please refer to this diagram from Wikipedia:
-[File:SPI_timing_diagram2.svg](https://en.wikipedia.org/wiki/Serial_Peripheral_Interface#/media/File:SPI_timing_diagram2.svg)
-
-This version of SPI_DEVICE HWIP supports mode 0 (CPHA and CPOL as 0) for Generic, Flash, and Passthrough modes. Mode 3 (CPHA and CPOL as 1) is not supported in the current version.
-SW should configure the SPI_DEVICE to mode 0 to enable TPM mode along with other modes.
-
-## SPI Device Firmware Operation Mode
-
-As described in the Theory of Operations above, in this mode, the SPI device
-writes incoming data directly into the SRAM (through RXFIFO) and updates the SPI
-device SRAM write pointer ([`RXF_PTR.wptr`](data/spi_device.hjson#rxf_ptr)). It does not parse a command byte nor
-address bytes; analyzing incoming data relies on a firmware implementation of a
-higher level protocol. Data is sent from the TXF SRAM contents via TXFIFO.
-
-It is important that the data path inside the block should meet the timing that
-is a half cycle of SCK. As SCK clock is shut off right after the last bit of the
-last byte is received, the hardware module cannot register the SDI signal. The
-module registers bits [7:1] and combines them with the SDI signal directly to
-form the input to RXFIFO. This is detailed in the waveform below.
-
-```wavejson
-{ signal: [
-  { name: 'CSB', wave: '10.||...|..1'},
-  { name: 'SCK', wave: '0.p||...|..l', node:'......b' },
-  { name: 'SDI', wave: '0.=..=|=|=.=.=.=|=.=.z..', data:['7','6','5','1','0','7','6','1','0'], period:0.5, },
-  { name: 'BitCount', wave: '=...=.=|=|=.=.=.=|=.=...', data:['7','6','5','1','0','7','6','1','0','7'], period:0.5},
-  { name: 'RX_WEN', wave: '0....|....1.0.|...1.0...' , period:0.5},
-  { name: 'RXFIFO_D', wave:'x.=.=================.x.', node: '...........a',period:0.5},
-  ],
-  head:{
-    text: 'Read Data to FIFO',
-    tick: ['-2 -1 0 1 . 30 31 32 33 n-1 n n+1 n+2 '],
-  },
-}
-```
-
-As shown above, the RXFIFO write request signal (`RX_WEN`) is asserted when
-BitCount reaches 0h. BitCount is reset by CSB asynchronously, returning to 7h
-for the next round. RXFIFO input data changes on the half clock cycle. RXFIFO
-latches WEN at the positive edge of SCK. When BitCount is 0h, bit 0 of FIFO data
-shows the bit 1 value for the first half clock cycle then shows correct value
-once the incoming SDI value is updated.
-
-TXFIFO is similar. TX_REN is asserted when Tx BitCount reaches 1, and the
-current entry of TXFIFO is popped at the negative edge of SCK. It results in a
-change of SDO value at the negative edge of SCK. SDO_OE is controlled by the
-CSB signal. If CSB goes to high, SDO is returned to High-Z state.
-
-```wavejson
-{ signal: [
-  { name: 'CSB',      wave:'10.||...|..1'},
-  { name: 'SCK',      wave:'0...p.|.|...|l' , node:'.............a', period:0.5},
-  { name: 'SDO',     wave:'x.=..=|=|=.=.=.=|=.=.x..', data:['7','6','5','1','0','7','6','1','0'], period:0.5, },
-  { name: 'SDO_OE',  wave:'0.1...................0.', period:0.5},
-  { name: 'BitCount', wave:'=....=.=|=|=.=.=.=|=.=..', data:['7','6','5','1','0','7','6','1','0','7'], period:0.5},
-  { name: 'TX_REN',   wave:'0.....|..1.0...|.1.0....' , node:'..........c',period:0.5},
-  { name: 'TX_DATA_i',wave:'=.....|....=.......=....',data:['D0','Dn','Dn+1'], node:'...........b', period:0.5},
-  ],
-  edge: ['a~b', 'c~b t1'],
-  head:{
-    text: 'Write Data from FIFO',
-    tick: ['-2 -1 0 1 . 30 31 32 33 n-1 n n+1 n+2 '],
-  },
-}
-```
-
-Note that in the SPI mode 3 configuration ([`CFG.CPOL`](data/spi_device.hjson#cfg)=1, [`CFG.CPHA`](data/spi_device.hjson#cfg)=1), the
-logic isn't able to pop the entry from the TX async FIFO after the last bit
-in the last byte of a transaction. In mode 3, no further SCK edge is given
-after sending the last bit before the CSB de-assertion. The design is chosen to
-pop the entry at the 7th bit position. This introduces unavoidable behavior of
-dropping the last byte if CSB is de-asserted before a byte transfer is
-completed. If CSB is de-asserted in bit 1 to 6 position, the FIFO entry isn't
-popped. TX logic will re-send the byte in next transaction. If CSB is
-de-asserted in the 7th or 8th bit position, the data is dropped and will
-re-commence with the next byte in the next transaction.
-
-### RXFIFO control
-
-![RXF CTRL State Machine](./doc/rxf_ctrl_fsm.svg)
-
-The RXFIFO Control module controls data flow from RXFIFO to SRAM. It connects
-two FIFOs having different data widths. RXFIFO is one byte wide, while the SRAM
-that stores incoming data for FW is as wide as the TL-UL interface.
-
-To reduce traffic to SRAM, the control logic gathers FIFO entries up to full
-SRAM data width, then does a full-word SRAM write. A programmable timer exists
-in the case when partial bytes are received at the end of a transfer. If the
-timer expires while bytes are still in the RXFIFO, the logic writes partial
-words to SRAM. A read-modify-write operation is triggered to perform the partial
-update.
-
-![State Machine](./doc/rxf_ctrl_fsm_table.png)
-
-### TXFIFO control
-
-The TXFIFO control module reads data from SRAM then pushes to TXFIFO whenever
-there is space in TXFIFO and when the TXF wptr and rptr indicate there is data
-to transmit. Data is written into the TXF SRAM by software which also controls
-the TXF write pointer.
-
-![TXF CTRL Data Path](./doc/txf_ctrl_dp.svg)
-
-The TXFIFO control module latches the write pointer then uses it internally.
-This prevents HW from using incorrect data from SRAM if the write pointer
-and read pointer are pointing at the same location. It is
-recommended for the software to update the write pointer at the SRAM data width
-granularity if it has more than 1 DWord data to send out. If software updates
-write pointer every byte, HW tries to fetch data from SRAM every time it hits
-the write pointer leading to inefficiency of SRAM access.
-
-If TXFIFO is empty, HW module repeatedly sends current entry of TXFIFO output as
-explained in "Theory of Operations" section. It cannot use an empty signal from
-TXFIFO due to asynchronous timing constraints.
-
-So, if software wants to send specific dummy data, it should prepare the amount
-of data with that value. As shown in the Theory Of Operations figure, for
-example, internal software could prepare FFh values for first page.
-
-![State Machine](./doc/txf_ctrl_fsm_table.png)
-
-## Data Storage Sizes
-
-SPI Device IP uses a 2kB internal Dual-Port SRAM. Firmware can resize RX / TX
-circular buffers within the SRAM size. For example, the firmware is able to set
-RX circular buffer to be 1.5kB and 512B for TX circular buffer.
-
-To increase SRAM size, the `SramAw` local parameter in `spi_device.sv`
-should be changed. It cannot exceed 13 (32kB) due to the read and write
-pointers' widths.
-
-# Programmers Guide
-
-## Initialization
-
-By default, RX SRAM FIFO base and limit address (via [`RXF_ADDR`](data/spi_device.hjson#rxf_addr) register) are
-set to 0x0 and 0x1FC, 512 bytes. And TX SRAM FIFO base and limit addresses (in
-the [`TXF_ADDR`](data/spi_device.hjson#txf_addr) register)  are 0x200 and 0x3FC. If FW wants bigger spaces, it can
-change the values of the above registers [`RXF_ADDR`](data/spi_device.hjson#rxf_addr) and [`TXF_ADDR`](data/spi_device.hjson#txf_addr).
-
-Software can configure the timer value [`CFG.timer_v`](data/spi_device.hjson#cfg) to change the delay between
-partial DATA received from SPI interface being written into the SRAM. The value
-of the field is the number of the core clock cycles that the logic waits for.
-
-## Pointers
-
-RX / TX SRAM FIFO has read and write pointers, [`RXF_PTR`](data/spi_device.hjson#rxf_ptr) and [`TXF_PTR`](data/spi_device.hjson#txf_ptr) . Those
-pointers are used to manage circular FIFOs inside the SRAM. The pointer width in
-the register description is 16 bit but the number of valid bits in the pointers
-depends on the size of the SRAM.
-
-The current SRAM size is 2kB and the pointer width is 12 bits: 11 bits
-representing a byte offset and 1 most-significant bit for indicating phase of
-the FIFO. Since they represent bytes, the low 2 bits indicate the offset within
-the 32-bit wide SRAM word. The pointers indicate the offset into the area
-described by the base and limit values, so the lower bits (11 bits in this case)
-of a pointer should not exceed the size in bytes (4 * (limit address - base
-address)) reserved for the region (RXF or TXF) that the pointer is in. For
-instance, if FW sets RXFIFO depth to 128 (default value), it should not update
-the read pointer outside the range 0x000 -  0x1FF (128*4 = 512Bytes ignoring
-the phase bit, bit 11).
-
-## Dual-port SRAM Layout
-
-The figure below shows the SRAM layout in the Flash and Passthrough modes.
-In generic mode, the whole DPSRAM is used as RX/TX buffers as described in the generic mode section.
-The SRAM begins at `0x1000`, which in the figure is `0x000`.
-
-![SPI Device Dual-port SRAM Layout](./doc/spid_sram_layout.svg)
-
-The regions starting from `0xF00` to `0xFFF` are assigned to TPM Read/Write FIFOs.
-They are not used in this version of IP.
-
-## TPM over SPI
-
-### Initialization
-
-The SW should enable the TPM submodule by writing 1 to the TPM_CFG.en CSR field.
-Other SPI_DEVICE features (Generic, Flash, Passthrough) CSRs do not affect the TPM feature.
-
-Update TPM_ACCESS_0, TPM_ACCESS_1 CSRs.
-The TPM submodule uses TPM_ACCESS_x.activeLocality to determine if the TPM_STS is returned to the host system.
-The SW may configure TPM_CFG.hw_reg_dis and/or TPM_CFG.invalid_locality to fully control the TPM transactions.
-
-### TPM mode: FIFO and CRB
-
-The TPM protocol supports two protocol interfaces: FIFO and CRB (Command Response Buffer).
-In terms of hardware design, these two interfaces differ in how return-by-HW registers are handled.
-
-In FIFO mode, when [`TPM_CFG.tpm_mode`](data/spi_device.hjson#tpm_cfg) is set to 0, HW registers reads must be returned after a maximum of 1 wait state.
-In CRB mode, when [`TPM_CFG.tpm_mode`](data/spi_device.hjson#tpm_cfg) is set to 1, there are no such restrictions.
-The logic always uploads both the command and address to the SW and waits for the return data in CRB mode.
-
-### Return-by-HW register update
-
-The SW manages the return-by-HW registers.
-The contents are placed inside the SPI_DEVICE CSRs.
-The SW must maintain the other TPM registers outside of the SPI_DEVICE HWIP and use write/read FIFOs to receive the content from/ send the register value to the host system.
-
-When the SW updates the return-by-HW registers, the SW is recommended to read back the register to confirm the value is written.
-Due to the CDC issue, the SW is only permitted to update the registers when the TPM CS# is de-asserted.
-
-### TPM Read
-
-1. The host system sends the TPM read command with the address.
-1. The SW reads a word from TPM_CMD_ADDR CSR (optional cmdaddr_notempty interrupt).
-  1. If the address falls into the return-by-HW registers and TPM_CFG.hw_reg_dis is not set, the HW does not push the command and address bytes into the TPM_CMD_ADDR CSR.
-1. The SW prepares the register value and writes the value into the read FIFO.
-1. The TPM submodule sends `WAIT` until the read FIFO is available.
-   When available, the TPM submodule sends `START` followed by the register value.
-
-### TPM Write
-
-1. The host system sends the TPM write command with the address.
-1. The TPM submodule pushes the command and the address to the TPM_CMD_ADDR CSR.
-1. The TPM submodule checks the write FIFO status.
-1. If not empty, the TPM submodule sends `WAIT` to the host system.
-1. When the FIFO is empty, the TPM sends `START` to the host system, receives the payload, and stores the data into the write FIFO.
-1. The SW, in the meantime, reads TPM_CMD_ADDR then reads the write FIFO data when the FIFO is available.
-
-### TPM_CMDADDR_NOTEMPTY Interrupt
-
-`TPM_CMDADDR_NOTEMPTY` interrupt remains high even SW clears the interrupt unless the cause is disappeared.
-SW should mask the interrupt if SW wants to process the event in a deferred way.
-
-```c
-void spi_tpm_isr() {
-  uint32_t irq_deferred = 0;
-  uint32_t irq_status = spi_tpm_get_irq_status();
-  if (irq_status & kSpiTpmFifoIrq) {
-    irq_deferred |= kSpiTpmFifoIrq;
-    schedule_deferred_work(spi_tpm_deferred_work);
-  }
-  // ...
-  spi_tpm_mask_irq(irq_deferred);
-}
-
-void spi_tpm_deferred_work() {
-  uint32_t irq_handled = 0;
-  uint32_t irq_status = spi_tpm_get_irq_status();
-  if (irq_status & kSpiTpmFifoIrq) {
-    spi_tpm_handle_fifo_irq();
-    irq_handled |= kSpiTpmFifoIrq;
-  }
-  // ...
-  // Now that we think the FIFO has been emptied, clear the latched status.
-  spi_tpm_clear_irq_status(irq_handled);
-  spi_tpm_unmask_irq(irq_handled);
-  // If the FIFO received more data after handling, the interrupt would assert
-  // again here.
-}
-```
-
-
-### TPM Interrupt
-
-The TPM submodule does not process the TPM over SPI interrupt.
-The SW must check TPM_INT_ENABLE, TPM_INT_STATUS and control the GPIO pin that is designated to the TPM over SPI interrupt.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_spi_device.h)
-
-## Register Table
-
-* [Register Table](data/spi_device.hjson#registers)
diff --git a/hw/ip/spi_device/doc/programmers_guide.md b/hw/ip/spi_device/doc/programmers_guide.md
new file mode 100644
index 0000000000000..bbdc29319902e
--- /dev/null
+++ b/hw/ip/spi_device/doc/programmers_guide.md
@@ -0,0 +1,135 @@
+# Programmer's Guide
+
+## Initialization
+
+By default, RX SRAM FIFO base and limit address (via [`RXF_ADDR`](../data/spi_device.hjson#rxf_addr) register) are
+set to 0x0 and 0x1FC, 512 bytes. And TX SRAM FIFO base and limit addresses (in
+the [`TXF_ADDR`](../data/spi_device.hjson#txf_addr) register)  are 0x200 and 0x3FC. If FW wants bigger spaces, it can
+change the values of the above registers [`RXF_ADDR`](../data/spi_device.hjson#rxf_addr) and [`TXF_ADDR`](../data/spi_device.hjson#txf_addr).
+
+Software can configure the timer value [`CFG.timer_v`](../data/spi_device.hjson#cfg) to change the delay between
+partial DATA received from SPI interface being written into the SRAM. The value
+of the field is the number of the core clock cycles that the logic waits for.
+
+## Pointers
+
+RX / TX SRAM FIFO has read and write pointers, [`RXF_PTR`](../data/spi_device.hjson#rxf_ptr) and [`TXF_PTR`](../data/spi_device.hjson#txf_ptr). Those
+pointers are used to manage circular FIFOs inside the SRAM. The pointer width in
+the register description is 16 bit but the number of valid bits in the pointers
+depends on the size of the SRAM.
+
+The current SRAM size is 2kB and the pointer width is 12 bits, 11 bits
+representing a byte offset and 1 most-significant bit for indicating phase of
+the FIFO. Since they represent bytes, the low 2 bits indicate the offset within
+the 32-bit wide SRAM word. The pointers indicate the offset into the area
+described by the base and limit values, so the lower bits (11 bits in this case)
+of a pointer should not exceed the size in bytes (4 * (limit address - base
+address)) reserved for the region (RXF or TXF) that the pointer is in. For
+instance, if FW sets RXFIFO depth to 128 (default value), it should not update
+the read pointer outside the range 0x000 - 0x1FF (128*4 = 512 bytes, ignoring
+the phase bit, bit 11).
+
+## Dual-port SRAM Layout
+
+The figure below shows the SRAM layout in the Flash and Passthrough modes.
+In generic mode, the whole DPSRAM is used as RX/TX buffers as described in the generic mode section.
+The SRAM begins at `0x1000`, which in the figure is `0x000`.
+
+![SPI Device Dual-port SRAM Layout](../doc/spid_sram_layout.svg)
+
+The regions starting from `0xF00` to `0xFFF` are assigned to TPM Read/Write FIFOs.
+They are not used in this version of IP.
+
+## TPM over SPI
+
+### Initialization
+
+The SW should enable the TPM submodule by writing 1 to the TPM_CFG.en CSR field.
+Other SPI_DEVICE features (Generic, Flash, Passthrough) CSRs do not affect the TPM feature.
+
+Update TPM_ACCESS_0, TPM_ACCESS_1 CSRs.
+The TPM submodule uses TPM_ACCESS_x.activeLocality to determine if the TPM_STS is returned to the host system.
+The SW may configure TPM_CFG.hw_reg_dis and/or TPM_CFG.invalid_locality to fully control the TPM transactions.
+
+### TPM mode: FIFO and CRB
+
+The TPM protocol supports two protocol interfaces: FIFO and CRB (Command Response Buffer).
+In terms of hardware design, these two interfaces differ in how return-by-HW registers are handled.
+
+In FIFO mode, when [`TPM_CFG.tpm_mode`](../data/spi_device.hjson#tpm_cfg) is set to 0, HW register reads must be returned after a maximum of 1 wait state.
+In CRB mode, when [`TPM_CFG.tpm_mode`](../data/spi_device.hjson#tpm_cfg) is set to 1, there are no such restrictions.
+The logic always uploads both the command and address to the SW and waits for the return data in CRB mode.
+
+### Return-by-HW register update
+
+The SW manages the return-by-HW registers.
+The contents are placed inside the SPI_DEVICE CSRs.
+The SW must maintain the other TPM registers outside of the SPI_DEVICE HWIP and use write/read FIFOs to receive the content from/ send the register value to the host system.
+
+When the SW updates the return-by-HW registers, the SW is recommended to read back the register to confirm the value is written.
+Due to the CDC issue, the SW is only permitted to update the registers when the TPM CS# is de-asserted.
+
+### TPM Read
+
+1. The host system sends the TPM read command with the address.
+1. The SW reads a word from TPM_CMD_ADDR CSR (optional cmdaddr_notempty interrupt).
+   1. If the address falls into the return-by-HW registers and TPM_CFG.hw_reg_dis is not set, the HW does not push the command and address bytes into the TPM_CMD_ADDR CSR.
+1. The SW prepares the register value and writes the value into the read FIFO.
+1. The TPM submodule sends `WAIT` until the read FIFO is available.
+   When available, the TPM submodule sends `START` followed by the register value.
+
+### TPM Write
+
+1. The host system sends the TPM write command with the address.
+1. The TPM submodule pushes the command and the address to the TPM_CMD_ADDR CSR.
+1. The TPM submodule checks the write FIFO status.
+1. If not empty, the TPM submodule sends `WAIT` to the host system.
+1. When the FIFO is empty, the TPM sends `START` to the host system, receives the payload, and stores the data into the write FIFO.
+1. The SW, in the meantime, reads TPM_CMD_ADDR then reads the write FIFO data when the FIFO is available.
+
+### TPM_CMDADDR_NOTEMPTY Interrupt
+
+The `TPM_CMDADDR_NOTEMPTY` interrupt remains high even if SW clears the interrupt, unless the cause has disappeared.
+SW should mask the interrupt if SW wants to process the event in a deferred way.
+
+```c
+void spi_tpm_isr() {
+  uint32_t irq_deferred = 0;
+  uint32_t irq_status = spi_tpm_get_irq_status();
+  if (irq_status & kSpiTpmFifoIrq) {
+    irq_deferred |= kSpiTpmFifoIrq;
+    schedule_deferred_work(spi_tpm_deferred_work);
+  }
+  // ...
+  spi_tpm_mask_irq(irq_deferred);
+}
+
+void spi_tpm_deferred_work() {
+  uint32_t irq_handled = 0;
+  uint32_t irq_status = spi_tpm_get_irq_status();
+  if (irq_status & kSpiTpmFifoIrq) {
+    spi_tpm_handle_fifo_irq();
+    irq_handled |= kSpiTpmFifoIrq;
+  }
+  // ...
+  // Now that we think the FIFO has been emptied, clear the latched status.
+  spi_tpm_clear_irq_status(irq_handled);
+  spi_tpm_unmask_irq(irq_handled);
+  // If the FIFO received more data after handling, the interrupt would assert
+  // again here.
+}
+```
+
+
+### TPM Interrupt
+
+The TPM submodule does not process the TPM over SPI interrupt.
+The SW must check TPM_INT_ENABLE, TPM_INT_STATUS and control the GPIO pin that is designated to the TPM over SPI interrupt.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_spi_device.h)
+
+## Register Table
+
+* [Register Table](../data/spi_device.hjson#registers)
diff --git a/hw/ip/spi_device/doc/theory_of_operation.md b/hw/ip/spi_device/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..cce39ccad1c55
--- /dev/null
+++ b/hw/ip/spi_device/doc/theory_of_operation.md
@@ -0,0 +1,709 @@
+# Theory of Operation
+
+## Block Diagram
+
+![Block Diagram](../doc/block_diagram.svg)
+
+In Generic mode, the incoming data is stored byte-based into an asynchronous FIFO.
+The logic inside the generic mode then updates the DPSRAM RX space.
+The logic also reads data from the DPSRAM then pushes out to the SPI MISO line.
+
+The Generic mode uses the entire DPSRAM space exclusively.
+The TX/RX size in the DPSRAM can be changed by compile-time parameters.
+
+When Flash mode is selected, the command parser accepts the first byte of the SPI MOSI line then activates the flash submodules, such as Status, JEDEC, Read command, and Upload function.
+The Status logic processes the three Read Status commands.
+The SW may configure three bytes of the Flash Status CSR then the Status submodule returns the CSR data into the SPI MISO line.
+The SW may configure the Read Status commands' opcodes.
+
+The JEDEC submodule returns the JEDEC Manufacturer ID followed by the additional information.
+The Manufacturer ID may vary depending on the company.
+For example, lowRISC JEDEC ID `EFh` follows twelve bytes of 7Fh Continuation Codes, requiring a total of thirteen bytes for the manufacturer ID.
+The SW may configure how many Continuation Codes are needed and the actual manufacturer ID.
+
+The Read submodule processes the Read SFDP (Serial Flash Discoverable Parameters) command, and up to six different types of the read commands.
+The read submodule receives address information from the SPI transaction, fetches the data from the read buffer in the DPSRAM, and returns the data on SPI lines (single, dual, quad lines).
+If the received address falls into the SW programmable mailbox address space, the logic fetches data not from the read buffer but from the mailbox buffer in the DPSRAM.
+
+SW may configure command information slots to upload the command into the FIFOs and the payload buffer in the DPSRAM.
+SW may additionally let HW set the BUSY bit in the Status register when the HW uploads the command.
+
+In Passthrough mode, the logic filters the incoming transaction if the transaction is not permitted.
+The SW may configure the logic to change a portion of the address or first 4 bytes of the payload.
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/spi_device.hjson#interfaces)
+
+The TPM submodule requires a separate input port for CS#.
+The TPM submodule and other SPI Device modes are able to be active together.
+The host system distinguishes between the TPM transactions and the other SPI transactions using separate CS# ports.
+Even though both submodules are able to be active, the host system cannot issue a TPM command and a SPI transaction at the same time due to the SPI IO lines being shared.
+
+The TPM has no write FIFO interrupt.
+As TPM transactions are not bigger than 4B in the current use case, the waiting time of the core is not a concern.
+The core takes multiple cycles to pop a byte from the write FIFO due to the slower peripheral clock and multiple CDC paths.
+The gain of having write FIFO interrupt is not great.
+
+## SPI Device Generic mode
+
+![Generic Mode Block Diagram](../doc/generic-blockdiagram.svg)
+
+The block diagram above shows how the SPI Device generic mode converts incoming
+bit-serialized SDI data into a valid byte, where the data bit is valid when the
+chip select signal (CSB) is 0 (active low) and SCK is at positive or negative
+edge (configurable, henceforth called the "active edge"). The bit order within
+the byte is determined by [`CFG.rx_order`](../data/spi_device.hjson#cfg) configuration register field. After a
+byte is gathered, the interface module writes the byte data into a small FIFO
+("RXFIFO") using SCK. It is read out of the FIFO and written into the
+buffer SRAM ("DP_SRAM") using the system bus clock. If RXFIFO is full, this is
+an error condition and the interface module discards the byte.
+
+The interface module also serializes data from the small transmit FIFO
+("TXFIFO") and shifts it out on the SDO pin when CSB is 0 and SCK is at the
+active edge. The bit order within the byte can be configured with configuration
+register field [`CFG.tx_order`](../data/spi_device.hjson#cfg). It is expected that software has prepared TX data
+based on the description in the "Defining
+Firmware Operation Mode" section below. Since SCK is not under the control of
+software or the device (it is driven by the external SPI host), it is possible
+that there is no data ready in the TXFIFO when chip select becomes active and
+the interface needs to send data on the SDO pin. Either software has not
+prepared TX data or software does not care about the contents of the TX data -
+then the hardware will send whatever lingering data is in the empty TXFIFO. If
+this is a functional issue, then software should at least soft-reset the contents
+of the TXFIFO using the [`CONTROL.rst_txfifo`](../data/spi_device.hjson#control) register. The soft-reset signal
+is not synchronized to the SCK clock, so software should drive the reset
+signal when the SPI interface is idle.
+
+### General Data Transfer on Pins
+
+Data transfers with the SPI device module involve four peripheral SPI pins: SCK,
+CSB, SDI, SDO. SCK is the SPI clock driven by an external SPI host. CSB (chip
+select bar) is an active low enable signal that frames a transfer, driven by the
+external host. Transfers with active SCK edges but inactive (high) CSB are
+ignored. Data is driven into the SPI device on the SDI pin ("Serial Data
+In", though we're otherwise using host/device terminology) and driven out on
+SDO. Any transfer length is legal, though higher level protocols typically
+assume word width boundaries. See details on protocols and transfers that
+follow. The diagram below shows a typical transfer, here for 8 bytes (64 cycles,
+showing the beginning and end of the transfer). Configurability for active
+edges, polarities, and bit orders are described later.
+
+```wavejson
+{ signal: [
+  { name: 'CSB',  wave: '10.........|....1.'},
+  { name: 'SCK',  wave: '0.p........|....l.'},
+  { name: 'SDI',  wave: 'z.=..=.=.=.=.=.=.=.=.=|=.=.=.=.z....',
+    data:['R07','R06','R05','R04','R03','R02','R01','R00','R17',
+          '','R73','R72','R71','R70'], period:0.5, },
+  { name: 'SDO',  wave: 'z.=..=.=.=.=.=.=.=.=.=|=.=.=.=.z....',
+    data:['T07','T06','T05','T04','T03','T02','T01','T00','T17',
+          '','T73','T72','T71','T70'], period:0.5}],
+  head:{
+    text: 'Data Transfer',
+    tick: ['-2 -1 0 1 2 3 4 5 6 7 8 9 60 61 62 63     ']
+  }
+}
+```
+
+
+### Defining "Firmware Operation Mode"
+
+Firmware operation mode, as implemented by this SPI device, is used to bulk copy data in
+and out of the chip using the pins as shown above. In general, it is used to
+load firmware into the chip, but can be used for any data transfer into or out
+of the chip. The transfers are "generic" in the sense that there is no
+addressing or overarching protocol involved. Data transferred into the chip goes into a SPI Device
+circular buffer implemented in an SRAM, and firmware decides what to do with the
+data. Data transferred out of the chip comes out of a circular buffer in an
+SRAM. Software can build any number of higher level protocols on top of this
+basic mechanism. All transfers are by definition full duplex: whenever an active
+SCK edge is received, a bit of RX data is latched into the peripheral, and a bit
+of TX data is sent out of the peripheral. If transfers only require
+unidirectional movement of data, the other direction can be ignored but will
+still be active. For instance, if only receive data is needed in the transfer,
+the device will still be transmitting data out on the TX ("SDO") pin.
+
+### SPI Generic Protocol
+
+The primary protocol considered is one used by an external SPI host to send
+chunks of firmware data into the device in the receive direction, confirming the
+contents with an echo back of a hash of the received data in the transmit
+direction. This is generally termed the 'SPI Generic' protocol, since SPI is used to
+send firmware into device memory, brokered by software confirming integrity
+of the received firmware data. This special case will be described first, and
+then a generic understanding of how firmware mode operates will follow.
+
+The following diagram shows the expected data transfer in SPI Generic mode.
+
+![data transfer in SPI Device](../doc/data_transfer.svg)
+
+In this diagram, bursts of data transfer are shown as "pages" of firmware
+content being driven into the device. The size of the page is not relevant,
+though it must be less than the size of the internal SPI Device SRAM. Typically
+the SRAM is divided in half for RX and TX buffers, but the boundary is
+configurable. The total size of RX and TX buffer must fit in the SPI device
+SRAM. Since the external SPI Host is in charge of the clock (SCK), it controls
+all aspects of the transfer, including the size of the page. But it is done in
+coordination with software running on the device that manages the higher level
+protocol.
+
+The protocol assumes that for each page written into the device, a response will
+be prepared for the next page. But since the SPI Device is always transmitting
+during every received page, the first transmitted page can be ignored. After the
+first page is received, software will get alerted as to its completion (via an
+RX interrupt), and will execute whatever integrity check is required on that
+data. It can then prepare its response to page zero by writing into the SPI
+Device TX buffer. What it writes into the TX buffer is the concern of the
+higher level protocol. It could be a "good" indication, a full echo of the RX
+data, or a hash of the received contents. The decision is not in scope for this
+specification.
+
+Clearly there is a potential race condition here as a new page could begin to be
+received before software has prepared the transmit response to page zero
+(including the time to read data out of the SRAM), but that is a condition that
+the higher level protocol must prepare for. That protocol is not in scope for
+this document, but some hints to its implementation are given in the
+[Programmer's Guide](programmers_guide.md).
+
+The transfer continues until all received data is taken in, and responded back.
+In this protocol the last "received" page of data is a "don't care" as long
+as the response is transmitted successfully.
+
+### Firmware Operation Mode
+
+Taking this example as a guide, we can see the general method of the SPI
+Firmware Operation Mode. On every active SCK clock edge, data is received from the SDI
+pin into the SPI device, and data is transmitted on the SDO pin. Received data
+is gathered into bytes and written into the RX circular buffer in the SPI Device
+SRAM as it is accumulated. Whatever data exists in the TX circular buffer is
+serialized and transmitted. Transfers are framed using the active low chip
+select pin CSB. What happens when data arrives and the RX circular buffer is
+full, or when the transmitter encounters an empty TX circular buffer are
+error conditions discussed in the Design Details section that follows.
+
+### RXFIFO, TXFIFO, and DP_SRAM
+
+The relationship between the Dual Port SRAM (DP_SRAM) and the RX and TXFIFOs
+should be explained. The SRAM is divided into a section for the transmit
+direction, named TXF, and a section for the receive direction, named RXF. Each
+section has its own read and write pointer. The SRAM may be read and written by
+software at any time, but for correct normal operation it will only write the
+empty area of the TXF (between the write pointer and read pointer) and only read
+the full area of the RXF (between the read pointer and write pointer) with the
+other areas used by the hardware. It is first worth noting that the hardware
+implications of the asynchronous nature of SCK and the fact it may not be free
+running, complicate some of the logic. The full feature set of that interface
+logic (clocked by SCK) includes the serial to parallel converter for RX data,
+the parallel-to-serial converter for TX data, and the interfaces to RXFIFO and
+TXFIFO. Before the first bit transfer and after the last SCK is stopped,
+there is no clock for any of this logic.  So for instance there is no guarantee
+of the two-clock-edges normally required for asynchronous handshaking protocols.
+The RXFIFO and TXFIFO exist to facilitate this situation.
+
+In the receive direction, data gathered from the SDI pin is written into the
+RXFIFO (see details below) at appropriate size boundaries. This data is
+handshake-received on the core clock side, gathered into byte or word quantity,
+and written into the RX circular buffer of the dual-port SRAM. On each write,
+the RXF write pointer ([`RXF_PTR.wptr`](../data/spi_device.hjson#rxf_ptr)) is incremented by hardware, wrapping at
+the size of the circular buffer. Software can watch (via polling or interrupts)
+the incrementing of this write pointer to determine how much valid data has been
+received, and determine when and what data to act upon. Once it has acted upon
+data, the software should update the RXF read pointer to indicate that space in
+the SRAM is available for future writes by the hardware. If incrementing the
+write pointer would result in it becoming equal to the read pointer then the RXF
+is full and any subsequently received data will be discarded. Thus in normal
+operation, the RXF write pointer is updated automatically by hardware and the RXF
+read pointer is managed by software. As an optimization the hardware will
+normally only write to the 32-bit wide SRAM when an entire word can be written.
+Since the end of the received data may not be aligned, there is a timer that
+forces sub-word writes if data has been staged for too long. The timer value
+([`CFG.timer_v`](../data/spi_device.hjson#cfg)) represents the number of core clock cycles. For instance, if
+timer value is configured to 0xFF, the RXF control logic will write gathered
+sub-word data in 255 cycles if no further bit stream from SPI is received.
+
+In the transmit direction, things are a little more tricky. Since the pin
+interface logic begins transmitting data on its very first SCK edge, there are
+no previous clock edges in the interface side of the fifo to allow an empty flag
+to be updated. The interface must *blindly* take whatever data is at the
+read pointer of the TXFIFO (in a typical asynchronous FIFO with free-running
+clocks the pointers can always be sent across the asynchronous boundary to
+determine if the FIFO is truly empty or not). Hence the need to potentially send
+out garbage data if software has not prepared the TXFIFO in time.
+
+The software writes data that it wants to transmit into the TXF circular buffer
+of the DP_SRAM buffer. It then passes the data to the hardware by moving the TXF
+write pointer to point to the next location after the data (this is the location
+it will use to start the data for the next transmission). Hardware that manages
+the TXFIFO detects the change in TXF write pointer and begins reading from the
+SRAM and prefilling the TXFIFO until it is full or until all valid TXF data has
+been read. This prepares the TXFIFO with the desired data for when the next SCK
+data arrives. As the SCK domain logic pulls data out of the TXFIFO to transmit
+on the SDO pin, that TXFIFO read is detected (after synchronization to the core
+clock domain) and potentially another word of data is read from the SRAM and
+written into the TXFIFO. Each time the SRAM is read the hardware increments the
+TXF read pointer making the space available to software. Like above, though
+conversely, in normal operation the TXF write pointer is managed completely by
+software and the TXF read pointer is incremented by hardware.
+
+All reads and writes to/from the SRAM for RXF and TXF activity are managed by
+direct reads and writes through the TLUL bus interface, managed by the
+auto-generated register file control logic.
+
+## SPI Flash and Passthrough Modes
+
+### Command Information List
+
+The SW may configure the map from the received opcode to the command process module by programming *cmd_info* list.
+Current SPI_DEVICE provides 24 command information entries.
+Each entry represents a command.
+Details of the fields are explained in [`CMD_INFO_0`](../data/spi_device.hjson#cmd_info_0).
+
+First 11 commands are assigned to specific submodules.
+
+Index  | Assigned Submodule
+-------|--------------------
+[2:0]  | Read Status
+[3]    | Read JEDEC ID
+[4]    | Read SFDP
+[10:5] | Read commands
+
+If the IP is in flash mode or in passthrough mode with [`INTERCEPT_EN`](../data/spi_device.hjson#intercept_en) set, the fields other than *opcode* and *valid* in the command information entries are ignored for Read Status and Read JEDEC ID commands.
+The submodules directly return data on the MISO line (SD[1]).
+In Passthrough mode, if Read Status and Read JEDEC ID commands are intercepted by the internal HW, the other fields in the command information entries are ignored also.
+
+The main use of the fields other than *opcode* and *valid* is to control the output enable in the passthrough logic.
+See [Output Enable Control](#output-enable-control) section for more.
+
+*upload* and *busy* fields are used in the SPI Flash/ Passthrough modes.
+See [Command Upload](#command-upload) section for details.
+
+### Command Parser
+
+![Command Parser block](../doc/cmdparse.svg)
+
+Command parser (*cmdparse*) processes the first byte of the SPI and activates the processing submodules depending on the received opcode and the *cmd_info* list described in the previous section.
+
+The cmdparse compares the received opcode with the *cmd_info.opcode* data structure.
+If any entry matches the received opcode, the cmdparse hands over the matched command information entry with the index to the corresponding submodule.
+As explained in the [previous section](#command-information-list), the command parser checks the index to activate Read Status / Read JEDEC ID/ Read Command / Address 4B modules.
+Other than the first 11 slots and last two slots (the last two slots are not visible to SW), the cmdparse checks the *upload* field and activates the upload module if the field is set.
+
+SW can configure whether a submodule should process the command while in the passthrough mode by setting the [`INTERCEPT_EN`](../data/spi_device.hjson#intercept_en) CSR.
+
+### Status Control
+
+If the received command is one of the three read status commands, STATUS control module takes over the SPI interface after the opcode.
+The 3 bytes status register is not reset by CSb.
+Except BUSY bit and WEL bit, other bits are controlled by SW.
+
+BUSY bit is set by HW when it receives any commands that are uploaded to the FIFOs and their `busy` fields are 1 in the command information entry.
+SW may clear BUSY bit when it completes the received commands (e.g. Erase/Program).
+
+If BUSY is set, SPI_DEVICE IP blocks the passthrough interface in Passthrough mode.
+The blocking of the interface occurs in SPI transaction idle state (CSb == 1).
+When SW clears the BUSY bit, it is applied to the STATUS register in the SPI clock domain when SPI clock toggles.
+It means the update happens when the next SPI transaction is received.
+The BUSY bit in the CSR is the synchronized value of the STATUS BUSY bit in the SPI clock domain.
+Due to the CDC latency, SW may see the updated value (BUSY clear) with long delay.
+
+WEL bit can be controlled by SW and also by HW.
+HW updates WEL bit when it receives WREN(06h) or WRDI(04h) commands.
+The opcode can be configured via [`CMD_INFO_WREN`](../data/spi_device.hjson#cmd_info_wren) and [`CMD_INFO_WRDI`](../data/spi_device.hjson#cmd_info_wrdi).
+
+The SW update of the STATUS register via [`FLASH_STATUS`](../data/spi_device.hjson#flash_status) is not instantaneous.
+The IP stores the SW request into the asynchronous FIFO then the request is processed in the SPI clock domain.
+The request updates the temporary status registers, which are called staged registers in the design.
+The staged registers are latched into the committed registers when CSb is released.
+SW sees the committed registers when reading the [`FLASH_STATUS`](../data/spi_device.hjson#flash_status) CSR.
+
+The attached host system also reads back the committed registers via Read Status commands.
+This scheme is to guarantee the atomicity of the STATUS register.
+
+If the host sends the Write Status commands, the commands are not processed in this module.
+SW must configure the remaining command information entries to upload the Write Status commands to the FIFOs.
+
+### JEDEC ID Control
+
+JEDEC module returns JEDEC Device ID and Manufacturer ID following the Continuation Code (CC).
+SW may configure [`JEDEC_CC`](../data/spi_device.hjson#jedec_cc) CSR for HW to return proper CC.
+The *cc* field in [`JEDEC_CC`](../data/spi_device.hjson#jedec_cc) defines the return value, which is `0x7F` by default.
+*num_cc* defines how many times the HW sends the CC byte before sending the JEDEC ID.
+
+The actual JEDEC ID consists of one byte manufacturer ID and two bytes device ID.
+The HW sends the manufacturer ID first, then `[7:0]` of the device ID then `[15:8]` byte.
+
+### Serial Flash Discoverable Parameters (SFDP) Control
+
+HW parses the SFDP command then fetches the data from the SFDP space in the DPSRAM.
+HW provides 256B SFDP space.
+HW uses the lower 8 bits of the received 24-bit address to access the DPSRAM.
+Upper 16 bits are ignored (aliased).
+SW should prepare proper SFDP contents before the host system issues SFDP commands.
+
+HW fetches from the DPSRAM in 4B and returns the data to the SPI line.
+HW repeats the operation until CSb is de-asserted.
+
+### Read Command Processor
+
+The read command block has multiple sub-blocks to process normal Read, Fast Read, Fast Read Dual/ Quad from the internal DPSRAM.
+The DPSRAM has a 2kB region for the read command access.
+The read command region has two 1kB buffers.
+If HW receives a read access to the other half of the space for the first time, then the HW reports to the SW to refill the current 1kB region with new content.
+
+The double buffering scheme aids the SW to prepare the next chunk of data.
+SW copies a portion of data (1kB) from the internal flash memory into SPI_DEVICE DPSRAM.
+To the host system, the emulated SPI Device appears as a storage device larger than 2kB thanks to the double buffering scheme.
+The assumption is that the host system reads mostly sequentially.
+
+#### Address Handling
+
+For read commands such as Normal Read, Fast Read {Single/ Dual/ Quad} Output commands, the address comes through IO[0] only.
+The state machine in this block shifts the address one-by-one and decrements the address counter register by 1.
+
+When it reaches the 4B address (`addr[2]`), the module triggers the DPSRAM state machine to fetch data from the DPSRAM.
+When the module receives `addr[0]`, at the positive edge of SCK, the module moves to appropriate command state based on the given CMD_INFO data.
+
+If the received address falls into mailbox address range and mailbox feature is enabled, the module turns on the mailbox selection bit.
+Then all out-going requests to the DPSRAM are forwarded to the mailbox section, not the read buffer section.
+
+#### Dummy Cycle
+
+The SW may configure the dummy cycle field for each individual read command.
+The default dummy cycle value for those commands is 7 (0-based).
+The programmed value is the number of dummy cycles minus one.
+For example, if SW programs the dummy cycle for Fast Read Quad to `3h`, the module waits 4 cycles then returns data.
+
+#### Buffer Management
+
+![Read Buffer Management](../doc/buffer-management.svg)
+
+The SPI Device IP uses the first half of the DPSRAM as a read buffer when the SPI mode is flash or passthrough mode.
+The IP returns data from the read buffer based on the given address in the received read command.
+In the current version, the read buffer size is 2kB.
+The IP only uses lower 11 bits of the received read command address (`addr[10:0]`) to issue the read requests to the DPSRAM.
+
+SW is responsible for updating the read buffer contents.
+The HW notifies the SW to update the buffer contents when needed.
+The HW provides a SW configurable read watermark CSR and read-only [`LAST_READ_ADDR`](../data/spi_device.hjson#last_read_addr) CSR.
+The **LAST_READ_ADDR** shows the last read address of the recent read command.
+For instance, if the host system issues `0xABCD_E000` and reads 128 (or 0x80) bytes, the **LAST_READ_ADDR** after the transaction will show `0xABCD_E07F`.
+It does not show the commands falling into the mailbox region or Read SFDP command's address.
+
+The read watermark address width is 1 bit smaller than the read buffer address.
+In the current version, the register has 10-bit width.
+The HW assumes the SW maintains the read buffer as a double buffer scheme.
+When the host system accesses one buffer (1kB), the SW prepares another 1kB by copying data from the internal non-volatile memory.
+If the received read address crosses the SW configured watermark address, the HW informs the SW.
+SW may configure the watermark CSR low enough so that the SW has enough time to copy over the data.
+
+If a new read command crosses the current buffer boundary, the SW flips the internal buffer index bit and clears the cross event for the HW to detect the address cross event again.
+
+### 4B Address Management (EN4B/ EX4B)
+
+SW may configure the HW to receive EN4B and EX4B commands and change the read command address size between 3 bytes and 4 bytes.
+For the IP to recognize EN4B/ EX4B commands, SW should configure [`CMD_INFO_EN4B`](../data/spi_device.hjson#cmd_info_en4b) and [`CMD_INFO_EX4B`](../data/spi_device.hjson#cmd_info_ex4b).
+
+The two CSRs omit unnecessary fields from the **CMD_INFO** data structure.
+The HW logic creates the default **CMD_INFO** structures for the two commands.
+The command parser module uses the generated structures to process and trigger the 4B management module.
+
+When the HW receives one of the commands, the HW changes the broadcast signal *cfg_addr_4b_en*.
+Also the HW updates [`CFG.addr_4b_en`](../data/spi_device.hjson#cfg) after passing through CDC.
+It takes at most three SYS_CLK cycles to update the value in the *CFG* register after the completion of the SPI transaction (CSb de-assertion).
+
+_Note: The HW changes the broadcasting signal and the CSR even if the SPI host system sends more than 8 beats of the SPI S[0].
+After the logic matches the received command byte with EN4B/ EX4B, the logic ignores the rest of the SPI data._
+
+The broadcasted `cfg_addr_4b_en` signal affects the read commands whose `addr_mode` is *AddrCfg* in their command information entries.
+
+### Command Upload
+
+If the received command meets following conditions, the HW stores the command into the command/ address FIFOs and the payload buffer:
+
+- The command does not match any of the first 11 command information entries or EN4B/ EX4B.
+- The command matches any of the remaining command information entries.
+- The matched entry has the `upload` field set.
+
+The upload module checks the command information entry to determine whether the address/ payload fields are to be uploaded or not.
+The `addr_mode` is used to determine the address size in the command.
+
+If `busy` field in the command information entry is set, the upload module also sets *BUSY* bit in the *STATUS* register.
+SW may clear the *BUSY* bit after processing the command.
+
+The upload module provides [`UPLOAD_STATUS`](../data/spi_device.hjson#upload_status) and [`UPLOAD_STATUS2`](../data/spi_device.hjson#upload_status2) CSRs for SW to parse the command, address, and payload.
+If a received command has payload, SW may read the payload from the Payload buffer starting from `payload_start_idx` address.
+In normal case, `payload_start_idx` in [`UPLOAD_STATUS2`](../data/spi_device.hjson#upload_status2) shows **0**.
+In error case of the host sending more than the maximum allowed payload size (256B in the current version), the `payload_start_idx` may not be 0.
+It is expected that the `payload_depth` is maximum payload size, 256B if `payload_start_idx` is non-zero.
+In this scenario, SW should read from `payload_start_idx` to the end of the payload buffer then do a second read from the beginning of the buffer for the remaining bytes.
+
+If the error case above happens, the IP reports the event through the `payload_overflow` interrupt.
+
+### Passthrough
+
+The passthrough module controls the data between a host system and the attached downstream SPI flash device.
+It snoops the SPI transactions and intervenes if the transaction is not permitted.
+The module also manipulates the data if needed.
+
+#### Command Filtering
+
+Filtering the incoming command is the key role of the Passthrough module.
+
+![Command Filtering logic in Passthrough mode](../doc/passthrough-filter.svg)
+
+```wavejson
+{ signal: [
+  { name: 'CSb_in',  wave: '10.........|....1.'},
+  { name: 'SCK_in',  wave: '0.p........|....l.'},
+  { name: 'IO[0]_i',  wave: 'z.=..=.=.=.=.=.=.=.=|=.=.=.=.z......',
+   data:["C[7]", "C[6]", "C[5]", "C[4]", "C[3]", "C[2]", "C[1]", "C[0]"],
+    period:0.5, },
+  { name: 'filter',  wave: '0................10.................',
+    period:0.5},
+  { name: 'filtered', wave: '0.................1.................',
+    period:0.5},
+  { name: 'SCK_out', wave: '0.p......0........'},
+  { name: 'CSb_out', wave: '10................1.................', period:0.5}
+  ],
+  head:{
+    text: 'Command Filtering',
+    tick: ['-2 -1 0 n-1 n+' ]
+  }
+}
+```
+
+The passthrough logic filters the command based on the 256 bits of the [`CMD_FILTER_0`](../data/spi_device.hjson#cmd_filter_0) CSR.
+Each bit corresponds to each opcode.
+For example, if bit 5 of [`CMD_FILTER_0`](../data/spi_device.hjson#cmd_filter_0) is set, the passthrough de-asserts the downstream **CSb** (drives `CSb_out` high) when it receives the `05h` SPI command.
+
+The SW does not know whether a SPI transaction is filtered or not.
+If the SW wants to check, it needs to set the _upload_ field with the opcode in the command information list.
+Then, the HW uploads the command into the command/ address FIFOs and the payload buffer.
+
+#### Address Manipulation
+
+SW may configure the passthrough logic to swap certain address bits to desired values by configuring [`ADDR_SWAP_MASK`](../data/spi_device.hjson#addr_swap_mask) and [`ADDR_SWAP_DATA`](../data/spi_device.hjson#addr_swap_data) CSRs.
+The address translation takes effect only when the received command is in the command information list and the *addr_swap_en* field in the entry is set.
+
+For instance, the passthrough logic sets bit 20 of the address to 1 if [`ADDR_SWAP_MASK`](../data/spi_device.hjson#addr_swap_mask) is `0x0010_0000` and [`ADDR_SWAP_DATA`](../data/spi_device.hjson#addr_swap_data) is `0x0010_0000`.
+
+#### Write Status Data Manipulation
+
+The passthrough logic also provides a way to change the first 4 bytes of the payload to the downstream SPI flash device on-the-fly, in the same way as the address.
+The main use of this feature is to protect the Status register.
+
+SW may configure the [`PAYLOAD_SWAP_MASK`](../data/spi_device.hjson#payload_swap_mask) and [`PAYLOAD_SWAP_DATA`](../data/spi_device.hjson#payload_swap_data) CSRs to change specific bits of the first 4 bytes of the write payload.
+For example, [`PAYLOAD_SWAP_MASK`](../data/spi_device.hjson#payload_swap_mask) as `32'h 0000_0023` and [`PAYLOAD_SWAP_DATA`](../data/spi_device.hjson#payload_swap_data) as `32'h 0000_0022` change bit 0 to 0, bit 1 to 1, bit 5 to 1 in the first byte payload.
+
+The CSRs are Little Endian (LE).
+The passthrough module consumes the lower byte first as SPI flash writes byte 0 first followed by byte 1.
+For example, bit `[7:0]` is processed then `[15:8]`, `[23:16]`, and `[31:24]` at last.
+
+The CSRs affect the commands that have *payload_swap_en* as 1 in their command list entries.
+SW may use additional command information slots for the passthrough (index 11 to 23).
+SW must configure *payload_dir* to **PayloadIn** and *payload_en* to `4'b 0001` in order for the payload translation feature to work correctly.
+
+#### Output Enable Control
+
+Passthrough module controls the output enable signals on both host and downstream sides.
+Controlling the output enable ports is critical to not overdrive the PAD directions.
+The information of the pad enable and direction is given by SW.
+SW configures the address size, payload lanes, dummy size in **CMD_INFO** slots.
+
+If passthrough logic does not find valid command information entry based on the received opcode, it assumes the command is **PayloadIn** Single IO command.
+SW is recommended to set the filter bit for Passthrough to not deliver the unmatched command to the downstream flash device.
+
+#### Internally processed Commands
+
+As described in [SPI Device Modes](#spi-device-modes-and-active-submodules), SPI_DEVICE may return the data from the IP even if the passthrough mode is set.
+The HW can process Read Status, Read JEDEC ID, Read SFDP, Read commands accessing the mailbox region, and EN4B/EX4B.
+
+SW configures [`INTERCEPT_EN`](../data/spi_device.hjson#intercept_en) CSR to enable the feature.
+SW may selectively enable/disable commands.
+For example, HW returns only Read Status data internally if [`INTERCEPT_EN`](../data/spi_device.hjson#intercept_en) is `{status: 1'b 1, default: 1'b 0}`.
+
+Other than Read command accessing mailbox space, it is recommended to filter the intercepted commands.
+
+## TPM over SPI
+
+![TPM over SPI block diagram](../doc/tpm-blockdiagram.svg)
+
+The TPM over SPI submodule processes the low level data only.
+The TPM submodule parses the incoming SPI MOSI line and stacks the stream up to the SW accessible registers, such as TPM_CMD_ADDR, and TPM_WRITE_FIFO.
+The SW must decode the command and the address.
+Then the SW reads the data from the write FIFO or pushes data into the read FIFO depending on the command.
+
+The TPM submodule returns appropriate data for read commands depending on the current read FIFO status, the received address, and the Locality.
+The module sends bytes from the return-by-HW registers to the parallel-to-serial logic right after the address phase when the received address falls into the HW managed registers.
+
+The TPM specification mandates the TPM module to return the data right after the address phase or send the WAIT at the last bit of the address phase.
+The address of the return-by-HW registers has a 4B boundary.
+The TPM submodule has enough time to determine if the incoming address falls into the return-by-HW registers or not.
+As the logic decides if the HW returns data or waits for the SW response at the address[2] bit phase, the logic always sends `WAIT(0x00)` at the last byte of the incoming address phase.
+The module sends `START(0x01)` at the next byte followed by the actual return-by-HW value if the received address falls into the list of the return-by-HW registers.
+
+The module, by default, returns `WAIT` when the address does not fall into the return-by-HW register address.
+In the wait state, the TPM submodule watches the read FIFO status.
+The module stays in the wait state until the read FIFO has the data >= requested transfer size.
+The module sends `START` at the next byte when the read FIFO has enough data.
+Then the module pops data from the read FIFO and sends the data over SPI.
+
+The TPM submodule accepts the payload for the TPM write command without the `WAIT` state if the write FIFO is empty.
+Otherwise, the TPM submodule sends `WAIT` until the write FIFO becomes available (empty).
+
+### Configuring Return-by-HW registers
+
+The return-by-HW register values come from the SW read-writable CSRs.
+The module latches the CSRs from the SYS_CLK domain into the SPI SCK domain when CSb is asserted.
+The SW is allowed to modify the return-by-HW registers when CSb is not active.
+
+The [TCG PC Client Platform TPM Profile][TPM PCCP] spec describes in the section 6 that the TPM device returns registers values based on the received locality (address[15:12]) and the `TPM_ACCESS_x.activeLocality`.
+The HW uses `TPM_ACCESS_x.activeLocality` and the address bit 15:12 to determine what value the logic should return.
+If `invalid_locality` configuration is set, the logic returns `INVALID` value to the host system, when the host system sends a read request to the Locality greater than 4.
+If the request is in the supported locality (0-4), the logic checks `TPM_ACCESS_x.activeLocality` then returns data based on the table 39 in the spec for Return-by-HW registers.
+Other registers in the table should be processed by SW.
+
+## Detecting Reliability Errors
+
+This version of the SPI_DEVICE IP implements the parity to detect bit flip errors on the internal SRAM.
+The HW checks the parity error when the SW reads data from the SRAM.
+The error is reported to the SW via TL D channel error signal.
+SW is recommended to discard the current context if any transaction is ongoing then to reset the IP.
+
+# Design Details
+
+## Clock and Phase
+
+The SPI device module has two programmable register bits to control the SPI clock, [`CFG.CPOL`](../data/spi_device.hjson#cfg) and [`CFG.CPHA`](../data/spi_device.hjson#cfg).
+CPOL controls clock polarity and CPHA controls the clock phase.
+For further details, please refer to this diagram from Wikipedia:
+[File:SPI_timing_diagram2.svg](https://en.wikipedia.org/wiki/Serial_Peripheral_Interface#/media/File:SPI_timing_diagram2.svg)
+
+This version of SPI_DEVICE HWIP supports mode 0 (CPHA and CPOL as 0) for Generic, Flash, and Passthrough modes. Mode 3 (CPHA and CPOL as 1) is not supported in the current version.
+SW should configure the SPI_DEVICE to mode 0 to enable TPM mode along with other modes.
+
+## SPI Device Firmware Operation Mode
+
+As described in the Theory of Operations above, in this mode, the SPI device
+writes incoming data directly into the SRAM (through RXFIFO) and updates the SPI
+device SRAM write pointer ([`RXF_PTR.wptr`](../data/spi_device.hjson#rxf_ptr)). It does not parse a command byte nor
+address bytes; analyzing incoming data relies on firmware implementation of a
+higher level protocol. Data is sent from the TXF SRAM contents via TXFIFO.
+
+It is important that the data path inside the block should meet the timing that
+is a half cycle of SCK. As SCK clock is shut off right after the last bit of the
+last byte is received, the hardware module cannot register the SDI signal. The
+module registers bits [7:1] and combines them with the SDI signal directly to
+form the input to RXFIFO. This is detailed in the waveform below.
+
+```wavejson
+{ signal: [
+  { name: 'CSB', wave: '10.||...|..1'},
+  { name: 'SCK', wave: '0.p||...|..l', node:'......b' },
+  { name: 'SDI', wave: '0.=..=|=|=.=.=.=|=.=.z..', data:['7','6','5','1','0','7','6','1','0'], period:0.5, },
+  { name: 'BitCount', wave: '=...=.=|=|=.=.=.=|=.=...', data:['7','6','5','1','0','7','6','1','0','7'], period:0.5},
+  { name: 'RX_WEN', wave: '0....|....1.0.|...1.0...' , period:0.5},
+  { name: 'RXFIFO_D', wave:'x.=.=================.x.', node: '...........a',period:0.5},
+  ],
+  head:{
+    text: 'Read Data to FIFO',
+    tick: ['-2 -1 0 1 . 30 31 32 33 n-1 n n+1 n+2 '],
+  },
+}
+```
+
+As shown above, the RXFIFO write request signal (`RX_WEN`) is asserted when
+BitCount reaches 0h. BitCount is reset by CSB asynchronously, returning to 7h
+for the next round. RXFIFO input data changes on the half clock cycle. RXFIFO
+latches WEN at the positive edge of SCK. When BitCount is 0h, bit 0 of FIFO data
+shows the bit 1 value for the first half clock cycle then shows correct value
+once the incoming SDI value is updated.
+
+TXFIFO is similar. TX_REN is asserted when Tx BitCount reaches 1, and the
+current entry of TXFIFO is popped at the negative edge of SCK. It results in a
+change of SDO value at the negative edge of SCK. SDO_OE is controlled by the
+CSB signal. If CSB goes to high, SDO is returned to High-Z state.
+
+```wavejson
+{ signal: [
+  { name: 'CSB',      wave:'10.||...|..1'},
+  { name: 'SCK',      wave:'0...p.|.|...|l' , node:'.............a', period:0.5},
+  { name: 'SDO',     wave:'x.=..=|=|=.=.=.=|=.=.x..', data:['7','6','5','1','0','7','6','1','0'], period:0.5, },
+  { name: 'SDO_OE',  wave:'0.1...................0.', period:0.5},
+  { name: 'BitCount', wave:'=....=.=|=|=.=.=.=|=.=..', data:['7','6','5','1','0','7','6','1','0','7'], period:0.5},
+  { name: 'TX_REN',   wave:'0.....|..1.0...|.1.0....' , node:'..........c',period:0.5},
+  { name: 'TX_DATA_i',wave:'=.....|....=.......=....',data:['D0','Dn','Dn+1'], node:'...........b', period:0.5},
+  ],
+  edge: ['a~b', 'c~b t1'],
+  head:{
+    text: 'Write Data from FIFO',
+    tick: ['-2 -1 0 1 . 30 31 32 33 n-1 n n+1 n+2 '],
+  },
+}
+```
+
+Note that in the SPI mode 3 configuration ([`CFG.CPOL`](../data/spi_device.hjson#cfg)=1, [`CFG.CPHA`](../data/spi_device.hjson#cfg)=1), the
+logic isn't able to pop the entry from the TX async FIFO after the last bit
+in the last byte of a transaction. In mode 3, no further SCK edge is given
+after sending the last bit before the CSB de-assertion. The design is chosen to
+pop the entry at the 7th bit position. This introduces unavoidable behavior of
+dropping the last byte if CSB is de-asserted before a byte transfer is
+completed. If CSB is de-asserted in bit 1 to 6 position, the FIFO entry isn't
+popped. TX logic will re-send the byte in next transaction. If CSB is
+de-asserted in the 7th or 8th bit position, the data is dropped and will
+re-commence with the next byte in the next transaction.
+
+### RXFIFO control
+
+![RXF CTRL State Machine](../doc/rxf_ctrl_fsm.svg)
+
+The RXFIFO Control module controls data flow from RXFIFO to SRAM. It connects
+two FIFOs having different data widths. RXFIFO is byte width, whereas the SRAM
+storing incoming data to serve FW is TL-UL interface width.
+
+To reduce traffic to SRAM, the control logic gathers FIFO entries up to full
+SRAM data width, then does a full-word SRAM write. A programmable timer exists
+in the case when partial bytes are received at the end of a transfer. If the
+timer expires while bytes are still in the RXFIFO, the logic writes partial
+words to SRAM. A read-modify-write operation is triggered to perform the partial
+update.
+
+![State Machine](../doc/rxf_ctrl_fsm_table.png)
+
+### TXFIFO control
+
+The TXFIFO control module reads data from SRAM then pushes to TXFIFO whenever
+there is space in TXFIFO and when the TXF wptr and rptr indicate there is data
+to transmit. Data is written into the TXF SRAM by software which also controls
+the TXF write pointer.
+
+![TXF CTRL Data Path](../doc/txf_ctrl_dp.svg)
+
+The TXFIFO control module latches the write pointer then uses it internally.
+This prevents HW from using incorrect data from SRAM if the write pointer
+and read pointer are pointing at the same location. It is
+recommended for the software to update the write pointer at the SRAM data width
+granularity if it has more than 1 DWord data to send out. If software updates
+write pointer every byte, HW tries to fetch data from SRAM every time it hits
+the write pointer leading to inefficiency of SRAM access.
+
+If TXFIFO is empty, HW module repeatedly sends current entry of TXFIFO output as
+explained in "Theory of Operations" section. It cannot use an empty signal from
+TXFIFO due to asynchronous timing constraints.
+
+So, if software wants to send specific dummy data, it should prepare the amount
+of data with that value. As shown in the Theory Of Operations figure, for
+example, internal software could prepare FFh values for first page.
+
+![State Machine](../doc/txf_ctrl_fsm_table.png)
+
+## Data Storage Sizes
+
+SPI Device IP uses a 2kB internal Dual-Port SRAM. Firmware can resize RX / TX
+circular buffers within the SRAM size. For example, the firmware is able to set
+RX circular buffer to be 1.5kB and 512B for TX circular buffer.
+
+To increase SRAM size, the `SramAw` local parameter in `spi_device.sv`
+should be changed. It cannot exceed 13 (32kB) due to the read and write
+pointers' widths.
diff --git a/hw/ip/spi_host/README.md b/hw/ip/spi_host/README.md
index f258c87bea928..48d92e7b9f554 100644
--- a/hw/ip/spi_host/README.md
+++ b/hw/ip/spi_host/README.md
@@ -149,1375 +149,3 @@ The SPI_HOST command interface allows the user to specify any number of command
 For even faster transfer rates, some flash chips support double transfer rate (DTR) variations to the SPI protocol wherein the device receives and transmits fresh data on *both* the leading and trailing edge.
 This IP only supports single transfer rate (STR), *not* DTR.
 A preliminary investigation of DTR transfer mode suggests that proper support for setup and hold times in this mode may require a level of sub-cycle timing control which is not currently planned for this IP.
-
-# Theory of Operations
-
-## SPI_HOST IP Command Interface
-
-A SPI command consists of at least one segment. Each segment has a different speed (number of active SD lines), direction and length.
-For example a Quad SPI read transaction consists of 4 segments:
-1. A single byte instruction transmitted at *standard* data rate
-2. A three or four byte address transmitted at *Quad* data rate
-3. A number of dummy cycles (no data transmitted or received)
-4. The desired data, received by SPI_HOST at *Quad* data rate
-
-During a transaction, software can issue multiple segment descriptions to the SPI_HOST IP to control for changes in speed or direction.
-
-Issuing a command then consists of the following steps:
-1. Configure the IP to be compatible with each attached peripheral.
-The [`CONFIGOPTS`](data/spi_host.hjson#configopts) multi-register holds separate sets of configuration settings, one for each CSB line.
-In principle, the configuration of these device-specific options only needs to be done/performed once at initialization.
-
-2. Load the TX FIFO with the instructions and data to be transmitted to the remote device by writing to the [`TXDATA`](data/spi_host.hjson#txdata) memory window.
-3. Specify which device should receive the next command using the [`CSID`](data/spi_host.hjson#csid) register.
-4. Wait for [`STATUS.READY`](data/spi_host.hjson#status) before continuing.
-5. Issue speed, direction, and length details for the next command segment using the [`COMMAND`](data/spi_host.hjson#command) register.
-If a command consists of multiple segments, then set [`COMMAND.CSAAT`](data/spi_host.hjson#command) (Chip-select active after transaction) to one for all segments except the last one.
-Setting [`COMMAND.CSAAT`](data/spi_host.hjson#command) to zero indicates the end of a transaction, prompting the IP to raise CSB at the end of the segment.
-
-6. Repeat steps 4 and 5 until all segments have been described.
-7. Read any peripheral response data from the RX FIFO by reading from the [`RXDATA`](data/spi_host.hjson#rxdata) memory window.
-
-### About Command Segments
-
-The structure of a SPI command depends on the device and the command itself.
-
-To support a variety of different I/O sequences the SPI_HOST FSM treats each command as a sequence of segments, each with a defined length, direction and speed.
-
-In case of a standard SPI device the commands are very consistent in structure: the host transmits data on SD[0], and always receives data on SD[1].
-For such devices, all commands can in principle be treated as bidirectional, as both the host and device are always transmitting on their respective lines.
-For bidirectional commands, the SPI_HOST IP will store one byte in the RX FIFO for each byte transmitted from the TX FIFO.
-
-However, even for these standard SPI commands, software may be uninterested in some or all of the device's response data.
-For example, for SPI flash devices, standard-mode write commands contain no useful data in the device response, even though the device may be actively asserting signals to SD[1] throughout the transaction.
-Therefore, for such commands software may choose to specify the entire command as "TX Only", in which case data placed in the TX FIFO will be transmitted throughout the write command, but signals received from the device will be ignored and will not fill the RX FIFO.
-
-Meanwhile for other flash commands, such as standard-mode read, the device only transmits useful information during some portions of the transaction.
-In the case of a basic read (with a 3-byte address), the instruction starts with a 1-byte instruction code (0x3) followed by the three address bytes, during which time the flash device outputs may be high impedance (depending on the device).
-The device then immediately responds with the requested data in the next SCK cycle, and continues to output data bytes until the CSB line is deasserted.
-Though such a command could also be treated as entirely bidirectional, the device response can be safely ignored during the instruction and address phase, especially if the SD[1] line is high impedance during this time.
-Likewise it is not necessary for software to specify any data to transmit while the device is responding.
-Therefore such a command can be thought of as consisting of two separate segments, the first segment being TX Only and the second segment being RX only, as shown in the following figure.
-Breaking the command up this way potentially simplifies the job of writing software for this type of command.
-
-```wavejson
-{signal: [
-  {name: "clk_i",          wave: "p....................|.............."},
-  {name: "SCK (CPOL=0)",   wave: "0.1010101010101010101|01010101010101"},
-  {name: "CSB",            wave: "10...................|.............."},
-  {name: "SD[0]",          wave: "00.0.0.0.0.0.1.1.2.2.|2.2.x.........", data: ["a[23]", "a[22]", "a[1]", "a[0]"]},
-  {name: "SD[1]",          wave: "z....................|....2.2.2.2.2.", data: ["d[7]", "d[6]", "d[5]", "d[4]", "..."]},
-  {name: "Segment number", wave: "x2...................|....2.........", data: ['1', '2', '3','4'] },
-  {name: "Segment speed",  wave: "x2...................|....2.........", data: ['Standard', 'Standard'] },
-  {name: "Segment direction", wave: "x2...................|....2.........", data: ['TX', 'RX', 'None', 'RX'] },
-  ],
- foot: {text: "Standard SPI example: Flash Read command with 24-bit address, consisting of one TX and one RX segment"}
-}
-```
-
-In addition to the TX, RX or Bidirectional modes, many SPI commands require periods where neither the host or device are transmitting data.
-For instance, many flash devices define a Fast Read command in which the host must insert a number of "dummy clocks" between the last address byte and the first data byte from the device.
-These extra cycles are required for operation at higher clock frequencies, to give the address time to propagate through the flash core.
-A standard-mode Fast Read (with 3 byte addressing) command then requires *three* SPI_HOST command segments:
-- 4 bytes TX Only: one for the instruction code (i.e., 0xb for Fast Read), and three for the address.
-- 8 dummy clocks
-- N bytes RX Only for read data response
-
-```wavejson
-{signal: [
-  {name: "clk_i",          wave: "p....................|.............................."},
-  {name: "SCK (CPOL=0)",   wave: "0.1010101010101010101|010101010101010101010101010101"},
-  {name: "CSB",            wave: "10...................|.............................."},
-  {name: "SD[0]",          wave: "00.0.0.0.1.0.1.1.2.2.|2.2.x.........................", data: ["a[23]", "a[22]", "a[1]", "a[0]"]},
-  {name: "SD[1]",          wave: "z....................|....z.z.z.z.z.z.z.z.2.2.2.2.2.", data: ["d[7]", "d[6]", "d[5]", "d[4]", "..."]},
-  {name: "Segment number", wave: "x3...................|....4...............5.........", data: ['1', '2', '3'] },
-  {name: "Segment speed",  wave: "x3...................|....4...............5.........", data: ['Standard', 'X', 'Standard'] },
-  {name: "Segment direction", wave: "x3...................|....4...............5.........", data: ['TX', 'Dummy', 'RX'] },
-  ],
- foot: {text: "Standard SPI example: Fast read command (instruction code 0xb) with 24-bit address, consisting of three segments, one TX, 8 dummy clocks and one RX segment"}
-}
-```
-
-For standard mode-commands, segments simplify the IO process by identifying which bus cycles have useful RX or TX data.
-In such cases it is not strictly necessary to the manage the impedance of the SD[0] and SD[1] lines.
-For Dual- and Quad-mode commands, however, impedance control necessary.
-The impedance of all data lines (SD[3:0]) must switch between TX and RX segments.
-
-Bidirectional data transfers are not applicable for Dual- or Quad-mode segments.
-
-In addition, the speed-mode changes how data is distributed across the four data lines, and many commands require that some segments are transmitted in standard mode (only on SD[0]), while the bulk of the data is transmitted in Dual- or Quad-mode.
-For this reason the speed-mode is also adjustable on a segment-by-segment basis.
-
-#### Specifying Command Segments
-
-The SPI host supports all four possible modes for command segments, and they are controlled writing one of the following values to the 2-bit [`COMMAND.DIRECTION`](data/spi_host.hjson#command) register:
-- 2'b00: Dummy cycles only (neither side transmits)
-- 2'b01: RX Only
-- 2'b10: TX Only
-- 2'b11: Bidirectional
-
-### CSID Register
-
-The [`CSID`](data/spi_host.hjson#csid) register is used to identify the target device for the next command segment.
-Whenever a command segment descriptor is written to [`COMMAND`](data/spi_host.hjson#command), [`CSID`](data/spi_host.hjson#csid) is passed into the FSM along with the command segment descriptor and the corresponding configurations options (taken from the CSID'th element of the `CONFIGOPTS` multi-register).
-
-This register still exists when instantiated with only one CSB line (i.e. when NumCS=1).
-However in this case the [`CSID`](data/spi_host.hjson#csid) value is ignored.
-
-Changes in [`CSID`](data/spi_host.hjson#csid) also affect the CSB lines, because a change in CSID can also implicitly end a command, overriding [`COMMAND.CSAAT`](data/spi_host.hjson#command).
-If a change is detected in [`CSID`](data/spi_host.hjson#csid), but the previous segment was submitted with the `CSAAT` bit asserted, the FSM terminates the previous command before moving on to the next segment.
-The previous CSB line is held low for *at least* `CSNTRAIL` cycles (as defined by the previous value of [`CONFIGOPTS.CSNTRAIL`](data/spi_host.hjson#configopts)) and then brought high.
-All CSB lines are held high for `CSNIDLE` cycles (using the new value of [`CONFIGOPTS.CSNIDLE`](data/spi_host.hjson#configopts)).
-The new CSB line is asserted low, and SCK begins toggling after the usual `CSNLEAD` cycle delay.
-
-### Configuration Options
-
-The [`CONFIGOPTS`](data/spi_host.hjson#configopts) multi-register has one entry per CSB line and holds clock configuration and timing settings which are specific to each peripheral.
-Once the [`CONFIGOPTS`](data/spi_host.hjson#configopts) multi-register has been programmed for each SPI peripheral device, the values can be left unchanged.
-
-The following sections give details on how the SPI_HOST can be used to control a specific peripheral.
-For simplicity, this section describes how to interact one device, attached to CSB[0], and as such references are made to the multi-registers [`CONFIGOPTS`](data/spi_host.hjson#configopts) and [`COMMAND`](data/spi_host.hjson#command).
-To configure timing and send commands to devices on other CSB lines, instead use the `CONFIGOPTS` multi-register corresponding to desired CSB line.
-
-The most common differences between target devices are the requirements for a specific SPI clock phase or polarity, CPOL and CPHA, which were described in the previous section [SPI Protocol Basics](#spi-protocol-basics).
-These clock parameters can be set via the [`CONFIGOPTS.CPOL`](data/spi_host.hjson#configopts) or [`CONFIGOPTS.CPHA`](data/spi_host.hjson#configopts) register fields.
-Likewise, as also described in the previous section, if device setup times require a full clock cycle before sampling the output, Full-Cycle Mode can be enabled by asserting the [`CONFIGOPTS.FULLCYC`](data/spi_host.hjson#configopts) bit.
-
-#### Clock rate selection
-
-The SPI clock rate for each peripheral is set by two factors:
-- The SPI_HOST input clock
-- A 16-bit clock divider
-
-The SPI protocol usually requires activity (either sampling or asserting data) on either edge of the SCK clock.
-For this reason the maximum SCK frequency is at most one half the SPI_HOST core frequency.
-
-Since some peripheral devices attached to the same SPI_HOST may require different clock frequencies, there is also the option to divide the core clock by an additional factor when dealing with slower peripherals.
-
-$$T_{\textrm{SCK},0}=\frac{1}{2}\frac{T_\textrm{clk}}{\textrm{CONFIGOPTS.CLKDIV}+1}$$
-
-#### Chip-select Timing Control
-
-Typically the CSB line is automatically deasserted after the last edge of SCK.
-However, by asserting [`COMMAND.CSAAT`](data/spi_host.hjson#command) when issuing a particular command, one can instruct the core to hold CSB low indefinitely after the last clock edge.
-This is useful for merging two adjacent command segments together, to create more complex commands, such as flash Quad read commands which require a mix of segments with different speeds and directions.
-The CSB line can then be deasserted by either issuing another command without the [`COMMAND.CSAAT`](data/spi_host.hjson#command) field, issuing a command to a different device (after changing the [`CSID`](data/spi_host.hjson#csid) register), or simply resetting the core FSM via the [`CONTROL.RST`](data/spi_host.hjson#control) register.
-
-To avoid spurious clock signals, changes to the [`CONFIGOPTS`](data/spi_host.hjson#configopts) parameters take effect only at the end of a command segment and only when all `csb` lines are deasserted.
-There are two cases to consider:
-1. Configuration changes detected and CSAAT=0 for the previous segment:
-This is when configuration changes are typically expected, and in this case, the SPI_HOST waits for the previous segment to complete before moving changing the configuration.
-The SPI_HOST ensures that all `csb` lines are held idle long enough to satisfy the configuration requirements both *before* and *after* the change.
-2. CSAAT = 1 for the previous segment:
-Configuration changes are not typically expected after CSAAT segments, and require special treatment as the IP does not usually return the `csb` lines to the idle/inactive state at this time.
-In such cases, the SPI_HOST IP closes out the ongoing transaction, ignoring CSAAT, and the configuration is then applied once the SPI_HOST has returned to the idle state.
-The next segment can then proceed, even though the remote device will likely see the next segment as the start of a new transaction (as opposed to a continuation of the previous transaction), because of the brief intervening idle pulse.
-
-Most devices require at least one-half SCK clock-cycle between either edge of CSB and the nearest SCK edge.
-However, some devices may require more timing margin and so the SPI_HOST core offers some configuration registers for controlling the timing of the CSB edges when operating under automatic control.
-The relevant parameters are as follows:
-- T<sub>IDLE</sub>: The minimum time between each rising edge of CSB and the following falling edge.
-This time delay is a half SCK cycle by default but can be extended to as long as eight SCK cycles by setting the [`CONFIGOPTS.CSNIDLE`](data/spi_host.hjson#configopts) register.
-- T<sub>LEAD</sub>: The minimum time between each falling edge of CSB and the first leading edge of SCK.
-This time delay is a half SCK cycle by default but can be extended to as long as eight SCK cycles by setting the [`CONFIGOPTS.CSNLEAD`](data/spi_host.hjson#configopts) register.
-- T<sub>TRAIL</sub>: The minimum time between the last trailing edge of SCK and the following rising edge of CSB.
-This time delay is a half SCK cycle by default but can be extended to as long as eight SCK cycles by setting the [`CONFIGOPTS.CSNTRAIL`](data/spi_host.hjson#configopts) register.
-
-```wavejson
-{signal: [
-  {name: "SCK",  wave: "l....1010|10........"},
-  {name: "CSB", wave: "10.......|.....1...0", node: ".A...B.....C...D...E"}
-],
- edge: ["A<->B minimum (CSNLEAD+1)", "C<->D minimum (CSNTRAIL+1)", "D<->E minimum (CSNIDLE+1)"],
-  head: {
-    text: "Impact of CSNLEAD, CSNTRAIL and CSNIDLE CONFIGOPTS register settings",
-    tick: 1
-  },
-  foot: {
-    text: ["tspan", "All ticks are in units of &#xbd;T",
-           ["tspan", {'baseline-shift':'sub'}, "SCK"],
-          "=&#xbd;T",
-           ["tspan", {'baseline-shift':'sub'}, "clk"],
-          "&#xd7;(CLKDIV+1)"]
-  }
-}
-```
-
-These settings are all minimum bounds, and delays in the FSM implementation may create more margin in each of these timing constraints.
-
-### Idle Time Delays When Changing Configurations
-
-It is important that the configuration changes are applied while `csb` is high to avoid sending spurious `sck` events to any devices.
-For example, if two devices have different requirements for `CPOL`, the clock polarity should not toggle except when `csb` is high (inactive) for all devices.
-
-Furthermore, `csb` should be remain high for the minimum idle time both before and after the configuration update.
-For example, consider a SPI_HOST attached to two devices each with different requirements for the clock divider, clock polarity, and idle time.
-Consider a configuration where total idle time (as determined by the [`CONFIGOPTS.CLKDIV`](data/spi_host.hjson#configopts) and [`CONFIGOPTS.CSNIDLE`](data/spi_host.hjson#configopts) multi-registers) works out to 9 idle clocks for the first device, and 4 clocks for the second device.
-In this scenario then, when swapping from the first device to the second, the SPI_HOST IP will only swap the clock polarity once the first `csb` line, `csb[0]`, has been high for at least 9 clocks, and will continue to hold the second `csb` line, `csb[1]`, high for 4 additional clocks before starting the next transaction.
-
-```wavejson
-{signal: [
-  {name: 'clk', wave: 'p..............'},
-  ["Requested Config",
-   {name: 'Configuration ID',  wave: '3.4............', data: ["CSID=0", "CSID=1"]},
-   {name: 'CPOL',              wave: '2.2............', data: ["0", "1"]},
-   {name: 'CLKDIV',            wave: '2.2............', data: ["2", "1"]},
-   {name: 'CSNIDLE',           wave: '2.2............', data: ["2", "1"]},
-   {name: 'Min. Idle cycles', wave: '2.2............', data: ["9", "4"]},
-  ],
-  ["Active Config",
-   {name: 'Configuration ID',  wave: '3.........4....', data: ["CSID=0", "CSID=1"]},
-   {name: 'CPOL',              wave: '2.........2....', data: ["0", "1"]},
-   {name: 'CLKDIV',            wave: '2.........2....', data: ["2", "1"]},
-   {name: 'CSNIDLE',           wave: '2.........2....', data: ["2", "1"]},
-   {name: 'Min. Idle cycles', wave: '2.........2....', data: ["9", "4"]},
-  ],
-   {name: 'csb[0]',                     wave: '01.............',
-                                        node: '.A........B....'},
-   {name: 'csb[1]',                     wave: '1.............0',
-                                        node: '..........C...D'},
-   {name: 'configuration update event', wave: '1.........H....'}
-],
-  edge: ["A<->B min. 9 cycles", "C<->D min. 4 cycles"],
-  head: {text: "Extended Idle Time During Configuration Changes", tock: 1}
-}
-```
-
-This additional idle time applies not only when switching between devices but when making any changes to the configuration for most recently used device.
-For instance, even in a SPI_HOST configured for one device, changes to [`CONFIGOPTS`](data/spi_host.hjson#configopts), will trigger this extended idle time behavior to ensure that the change in configuration only occurs in the middle of a long idle period.
-
-
-### Special Command Fields
-
-The [`COMMAND`](data/spi_host.hjson#command) register must be written once for each command segment.
-Whenever a command segment is written to [`COMMAND`](data/spi_host.hjson#command), the contents of the [`CONFIGOPTS`](data/spi_host.hjson#configopts), [`CSID`](data/spi_host.hjson#csid), and [`COMMAND`](data/spi_host.hjson#command) registers are passed through the Config/Command FIFO to the SPI_HOST core FSM.
-Once the command is issued, the core will immediately deassert [`STATUS.READY`](data/spi_host.hjson#status), and once the command has started [`STATUS.ACTIVE`](data/spi_host.hjson#status) will go high.
-The command is complete when [`STATUS.ACTIVE`](data/spi_host.hjson#status) goes low.
-A `spi_event` interrupt can also be triggered to go off on completion by setting [`EVENT_ENABLE.IDLE`](data/spi_host.hjson#event_enable).
-
-### Chip Select Masks
-
-Each instance of the SPI_HOST IP supports a parametrizable number of chip select lines (CSB[NumCS-1:0]).
-Each CSB line can be routed either to a single peripheral or to a daisy-chain of peripherals.
-Whenever a segment description is written to the [`COMMAND`](data/spi_host.hjson#command) register, the  [`CSID`](data/spi_host.hjson#csid) is sent along with [`COMMAND`](data/spi_host.hjson#command) and the `CONFIGOPTS` multi-register corresponding to [`CSID`](data/spi_host.hjson#csid)  to indicate which device is meant to receive the command.
-The SPI_HOST core typically then manages the details of asserting and deasserting the proper CSB line, subject to the timing parameters expressed in [`CONFIGOPTS.CSNLEAD`](data/spi_host.hjson#configopts), [`CONFIGOPTS.CSNTRAIL`](data/spi_host.hjson#configopts), and [`CONFIGOPTS.CSNIDLE`](data/spi_host.hjson#configopts).
-
-If [Pass-through mode](#pass-through-mode) is enabled then the CSB lines are controlled by *neither* the SPI_HOST hardware nor the firmware register.
-In Pass-though mode, control of the CSB lines passes directly to the inter-module port, `passthrough_i.csb`.
-
-### Back-to-back Segments
-
-The command interface can allows for any number of segments in a given command.
-
-Since most SPI Flash transactions typically consist of 3 or 4 segments, there is a small command FIFO for submitting segments to the SPI_HOST IP, so that firmware can issue the entire transaction at one time.
-
-Writing a segment description to [`COMMAND`](data/spi_host.hjson#command) when [`STATUS.READY`](data/spi_host.hjson#status) is low will trigger an error condition, which must be acknowledged by software.
-When submitting multiple segments to the the command queue, firmware can also check the [`STATUS.CMDQD`](data/spi_host.hjson#status) register to determine how many unprocessed segments are in the FIFO.
-
-## Data Formatting
-
-### Input and Output Byte Ordering
-
-The SPI transactions must be issued with correct bit ordering to properly communicate with a remote device.
-Based on the requirements for our chosen flash devices, this IP follows these conventions:
-- The relative significance of lines on the SD bus: SD[0] is always the least significant, followed by SD[1] though SD[3] with increasing significance.
-- The relative significance of a sequence of bits on the same SD bus: more significant bits are always transmitted before (or at the same time as) less significant bits.
-    - For instance, when transferring a single byte in Quad mode, all four bits of the upper nibble (bits 7 through 3) are transferred in the first clock cycle and the entire lower nibble (bits 3 through 0) is transferred in the second cycle.
-
-The programming model for the IP should meanwhile make it easy to quickly program the peripheral device, with a minimum amount of byte shuffling.
-It should be intuitive to program the specific flash devices we are targeting, while following the conventions above:
-- When transferring data in from the [`RXDATA`](data/spi_host.hjson#rxdata) memory window or out to the [`TXDATA`](data/spi_host.hjson#txdata) window, the IP should fully utilize the TL-UL bus, using 32-bit I/O instructions.
-- The SPI_HOST should make it easy to arrange transaction data in processor memory, meaning that bytes should be sequentially transmitted in order of ascending memory address.
-  - When using 32-bit I/O instructions, this requires some knowledge of the processor byte-order.
-
-Based on these requirements, data read from [`RXDATA`](data/spi_host.hjson#rxdata) or placed in [`TXDATA`](data/spi_host.hjson#txdata) are handled as follows:
-- 32-bit words placed in [`TXDATA`](data/spi_host.hjson#txdata) are transmitted in first-in-first-out order.
-Likewise, words received from the SPI data lines are made available for reading from [`RXDATA`](data/spi_host.hjson#rxdata) in first-in-first-out order.
-- Within a 32-bit word, the `ByteOrder` parameter controls the order in which bytes are transmitted, and also the manner in which received bytes are eventually arranged in the 32-bit [`RXDATA`](data/spi_host.hjson#rxdata) register.
-By default (`ByteOrder` = 1, for Little-Endian processors), the LSB of [`TXDATA`](data/spi_host.hjson#txdata) (i.e bits 7 though 0) is transmitted first, and the other bytes follow in order of increasing significance.
-Similarly, the first byte received is packed into the LSB of [`RXDATA`](data/spi_host.hjson#rxdata), and the subsequent bytes of each [`RXDATA`](data/spi_host.hjson#rxdata) word are packed in order of increasing significance.
-
-On the other hand, if `ByteOrder` is set to 0 (for Big-Endian processors), the MSB is transmitted first from [`TXDATA`](data/spi_host.hjson#txdata), and received data is loaded first into the MSB of [`RXDATA`](data/spi_host.hjson#rxdata).
-   - The default choice of Little-Endian reflects native byte-order of the Ibex processor.
-- Finally *within a given byte*, the most significant bits are transmitted and received first.
-For Dual and Quad transactions the least significant bit in any instantaneous pair or nibble is transmitted or received on SD[0], and the remaining SD bits (1 though 3) are populated in order of increasing significance.
-
-The following figure shows how data appears on the serial data bus when the hardware reads it from [`TXDATA`](data/spi_host.hjson#txdata) or writes it to [`RXDATA`](data/spi_host.hjson#rxdata).
-
-```wavejson
- {signal: [
-  ["ByteOrder=0",
-  {name: "SD[0] (host output)", wave: "x22222222222|2222|222|22x", data: ["t[31]", "t[30]", "t[29]", "t[28]", "t[27]", "t[26]", "t[25]", "t[24]", "t[23]","t[22]",
-                                                                          "t[21]","t[17]","t[16]","t[15]","t[14]","t[8]", "t[7]", "t[6]", "t[1]", "t[0]"]},
-  {name: "SD[1] (host input)", wave: "x22222222222|2222|222|22x", data: ["r[31]", "r[30]", "r[29]", "r[28]", "r[27]", "r[26]", "r[25]", "r[24]", "r[23]","r[22]",
-                                                                         "r[21]","r[17]","r[16]","r[15]","r[14]","r[8]", "r[7]", "r[6]", "r[1]", "r[0]"]},
-  {name: "Which byte?", wave: "x4.......4..|..4.|.4.|..x", data: ["DATA MSB", "","", "          LSB"]}
-],
-  ["ByteOrder=1",
-  {name: "SD[0] (host output)", wave: "x22222222222|2222|222|22x", data: ["t[7]", "t[6]", "t[5]", "t[4]", "t[3]", "t[2]", "t[1]", "t[0]", "t[15]","t[14]",
-                                                                          "t[13]","t[9]","t[8]","t[23]","t[22]","t[16]", "t[31]", "t[30]", "t[25]", "t[24]"]},
-  {name: "SD[1] (host input)", wave: "x22222222222|2222|222|22x", data: ["r[7]", "r[6]", "r[5]", "r[4]", "r[3]", "r[2]", "r[1]", "r[0]", "r[15]","r[14]",
-                                                                         "r[13]","r[9]","r[8]","r[23]","r[22]","r[16]", "r[31]", "r[30]", "r[25]", "r[24]"]},
-  {name: "Which byte?", wave: "x5.......5..|..5.|.5.|..x", data: ["DATA LSB", "","", "          MSB"]}
-],
-  ],
-  head: {
-   text: "Serial bit ordering for 32-bit data words written to DATA (t[31:0]) or read from DATA (r[31:0]) as a Function of the Parameter 'ByteOrder'",
-  },
-  foot: {
-  text: "Standard SPI, bidirectional segment.  Bits are numbered as they appear in the DATA memory window"
-  }
-}
-```
-
-
-As shown in the following figure, a similar time-ordering scheme applies for Dual- and Quad-mode transfers.
-However many bits of similar significance are packed into multiple parallel SD data lines, with the least significant going to SD[0].
-
-```wavejson
-{signal: [
-  ["ByteOrder=0",
-  {name: "SD[0]", wave: "x...22334455x...", data: ["d[28]", "d[24]", "d[20]", "d[16]", "d[12]", "d[8]", "d[4]", "d[0]"]},
-  {name: "SD[1]", wave: "x...22334455x...", data: ["d[29]", "d[25]", "d[21]", "d[17]", "d[13]", "d[9]", "d[5]", "d[1]"]},
-  {name: "SD[2]", wave: "x...22334455x...", data: ["d[30]", "d[26]", "d[22]", "d[18]", "d[14]", "d[10]", "d[6]", "d[2]"]},
-  {name: "SD[3]", wave: "x...22334455x...", data: ["d[31]", "d[27]", "d[23]", "d[19]", "d[15]", "d[11]", "d[7]", "d[3]"]},
-],
-   ["ByteOrder=1",
-  {name: "SD[0]", wave: "x...55443322x...", data: ["d[4]", "d[0]", "d[12]", "d[8]", "d[20]", "d[16]", "d[28]", "d[24]"]},
-  {name: "SD[1]", wave: "x...55443322x...", data: ["d[5]", "d[1]", "d[13]", "d[9]", "d[21]", "d[17]", "d[29]", "d[25]"]},
-  {name: "SD[2]", wave: "x...55443322x...", data: ["d[6]", "d[2]", "d[14]", "d[10]", "d[22]", "d[18]", "d[30]", "d[26]"]},
-  {name: "SD[3]", wave: "x...55443322x...", data: ["d[7]", "d[3]", "d[15]", "d[11]", "d[23]", "d[19]", "d[31]", "d[27]"]},
-  ],
-  ],
-  head: {
-   text: "Serial bit ordering for 32-bit data word (d[31:0]), Quad SPI as a Function of the Parameter 'ByteOrder'",
-  },
-  foot: {
-  text: "(Bits are numbered as they appear when loaded into DATA memory window)"
-  }
-}
-```
-
-### Command Length and Alignment in DATA
-
-Even though the [`TXDATA`](data/spi_host.hjson#txdata) memory window typically accepts 32-bit words, command segments do not need to use all the bytes from every word.
-
-For TX (or Bidirectional) segments, unused bytes from the latest TX FIFO word are simply ignored at the end of a segment.
-For RX (or Bidirectional) segments, if the last few bytes received do not fill an entire DATA word, the partial word will be zero-padded and inserted into the RX FIFO once the segment is completed.
-If ByteOrder=1 (the default, Little-Endian case), this padding will fill the unused most-significant bytes of the final RX DATA word, otherwise the padding will fill the unused least-significant bytes.
-
-The following waveform illustrates an example SPI transaction, where neither the data transmitted nor the data received in each segment fit into an even number of 32-bit words.
-In this example, the values `I[31:0]`, `A[31:0]` and `B[31:0]`, have been previously written into [`TXDATA`](data/spi_host.hjson#txdata) via firmware, and afterwards one word, `X[31:0]`, is available for reading from [`RXDATA`](data/spi_host.hjson#rxdata).
-All data in the waveform is transferred using 32-bit instructions.
-
-```wavejson
-{signal: [
-  {name: "Segment number", wave: "x2.......2.........2.2.x", data: "1 2 3 4"},
-  {name: "Speed", wave: "x2.......2.........2.2.x", data: "Standard Quad X Quad"},
-  {name: "Direction", wave: "x2.......2.........2.2.x", data: "TX TX Dummy RX"},
-  {name: "Length", wave: "x2.......2.........2.2.x", data: "1 5 2 1"},
-  ["ByteOrder=0",
-  {name: "SD[0]", wave: "x222222222233445522z.22x", data: ["I[31]", "I[30]", "I[29]", "I[28]", "I[27]", "I[26]", "I[25]", "I[24]",
-                                                           "A[28]", "A[24]", "A[20]", "A[16]", "A[12]", "A[8]",  "A[4]", "A[0]", "B[28]", "B[24]", "X[28]", "X[24]"]},
-  {name: "SD[1]", wave: "xz.......2233445522z.22x", data: ["A[29]", "A[25]", "A[21]", "A[17]", "A[13]", "A[9]",  "A[5]", "B[1]", "B[29]", "B[25]", "X[29]", "X[25]"]},
-  {name: "SD[2]", wave: "xz.......2233445522z.22x", data: ["A[30]", "A[26]", "A[22]", "A[18]", "A[14]", "A[10]", "A[6]", "B[2]", "B[30]", "B[26]", "X[30]", "X[26]"]},
-  {name: "SD[3]", wave: "xz.......2233445522z.22x", data: ["A[31]", "A[27]", "A[23]", "A[19]", "A[15]", "A[11]", "A[7]", "B[3]", "B[31]", "B[27]", "X[31]", "X[27]"]},
-],
-   {name:""},
-   ["ByteOrder=1",
-  {name: "SD[0]", wave: "x555555555544332255z.55x", data: ["I[7]", "I[6]", "I[5]", "I[4]", "I[3]", "I[2]", "I[1]", "I[0]",
-                                                           "A[4]", "A[0]", "A[12]", "A[8]",  "A[20]", "A[16]", "A[28]", "A[24]", "B[4]", "B[0]", "X[4]", "X[0]"]},
-  {name: "SD[1]", wave: "xz.......5544332255z.55x", data: ["A[5]", "A[1]", "A[13]", "A[9]",  "A[21]", "A[17]", "A[29]", "A[25]", "B[5]", "B[1]", "X[5]", "X[1]"]},
-  {name: "SD[2]", wave: "xz.......5544332255z.55x", data: ["A[6]", "A[2]", "A[14]", "A[10]", "A[22]", "A[18]", "A[30]", "A[26]", "B[6]", "B[2]", "X[6]", "X[2]"]},
-  {name: "SD[3]", wave: "xz.......5544332255z.55x", data: ["A[7]", "A[3]", "A[15]", "A[11]", "A[23]", "A[19]", "A[31]", "A[27]", "B[7]", "B[3]", "X[7]", "X[3]"]},
-  ],
-  ],
-  head: {
-    text: "Serial bit ordering for 6 bytes transmitted from FIFO words 'I[31:0], A[31:0]' and 'B[31:0]', and 1 byte received into word 'X[31:0]'",
-  },
-  foot: {
-    text: "Command consists of 4 segments, all TX data is written to DATA using 32-bit memory instructions (all bytes enabled)"
-  }
-}
-```
-
-When packing data into the TX FIFO, there are also no restrictions on the alignment of the data written to the [`TXDATA`](data/spi_host.hjson#txdata) memory window, as it supports byte-enable signals.
-This means that when copying bytes into [`TXDATA`](data/spi_host.hjson#txdata) from unaligned firmware memory addresses, it is possible to use byte or half-word instructions.
-Full-word instructions should however be used whenever possible, because each write consumes a full word of data in the TX FIFO regardless of the instruction size.
-Smaller writes will thus make inefficient use of the TX FIFO.
-
-Filtering out disabled bytes consumes clock cycles in the data pipeline, and can create bubbles in the transmission of SPI_DATA.
-In the worst case, such bubbles can also be interpreted as transient underflow conditions in the TX FIFO, and could trigger spurious interrupts.
-The longest delays occur whenever a word is loaded into the TX FIFO with only one byte enabled.
-
-When writing to the [`TXDATA`](data/spi_host.hjson#txdata) window, only three types of data are expected: individual bytes, half-words, and full-words.
-Other types of write transactions (i.e., non-contiguous, zero-byte and three-byte writes) are not supported by most processors.
-Therefore it is assumed that if such transactions do appear, it is likely a sign of a system integrity error, and so these other classes of writes are not supported.
-
-If such transactions ever occur, they trigger an "Invalid Access" error event, which suspends the processing of future commands until the error has been cleared by setting the [`ERROR_STATUS.ACCESSINVAL`](data/spi_host.hjson#error_status) bit.
-
-The RX FIFO has no special provisions for packing received data in any unaligned fashion.
-Depending on the `ByteOrder` parameter, the first byte received is always packed into either the most- or least-significant byte read from the [`RXDATA`](data/spi_host.hjson#rxdata) memory window.
-
-
-## Pass-through Mode
-
-The SPI_HOST also supports a special "Pass-through" mode, which allows for the direct control of the serial interface by another block (namely SPI_DEVICE).
-This feature is entirely controlled by intermodule signals `passthrough_i` and `passthrough_o`, which control a set of multiplexers.
-If `passthrough_i.passthrough_en` is asserted the SPI_HOST peripheral bus signals reflect the corresponding signals in the `passthrough_i` structure.
-Otherwise, the peripheral signals are controlled by the SPI_HOST FSM and the internal shift register.
-
-## Interrupt Aggregation
-
-In order to reduce the total number of interrupts in the system, the SPI_HOST has only two interrupt lines: `error` and `spi_event`.
-Within these two interrupt classes, there are a number of conditions which can trigger them.
-
-Each interrupt class has a secondary status and mask register, to control which sub-classes of SPI events will cause an interrupt.
-
-### SPI Events and Event Interrupts
-
-The SPI_HOST supports interrupts for the following SPI events:
-
-- `IDLE`: The SPI_HOST is idle.
-- `READY`: The SPI_HOST is ready to accept a new command.
-- `RXFULL`: The SPI_HOST has run out of room in the RXFIFO.
-- `RXWM`: The number of 32-bit words in the RXFIFO currently exceeds the value set in [`CONTROL.RX_WATERMARK`](data/spi_host.hjson#control).
-- `TXEMPTY`: The SPI_HOST has transmitted all the data in the TX FIFO.
-- `TXWM`: The number of 32-bit words in the TX FIFO is currently less than the value set in [`CONTROL.TX_WATERMARK`](data/spi_host.hjson#control).
-
-Most SPI events signal a particular condition that persists until it is fixed, and these conditions can be detected by polling the corresponding field in the [`STATUS`](data/spi_host.hjson#status) register.
-
-In addition to these events, there are also two additional diagnostic fields in the [`STATUS`](data/spi_host.hjson#status) register:
-- `RXSTALL`: The RX FIFO is full, and the SPI_HOST is stalled and waiting for firmware to remove some data.
-- `TXSTALL`: The TX FIFO is not only empty, but the SPI_HOST is stalled and waiting for firmware to add more data.
-
-These bits can provide diagnostic data for tuning the throughput of the device, but do not themselves generate event interrupts.
-
-By default none of these SPI events trigger an interrupt.
-They need to be enabled by writing to the corresponding field in [`EVENT_ENABLE`](data/spi_host.hjson#event_enable).
-
-The SPI event interrupt is signaled only when the IP enters the corresponding state.
-For example if an interrupt is requested when the TX FIFO is empty, the IP will only generate one interrupt when the last data word is transmitted from the TX FIFO.
-In this case, no new interrupts will be created until more data has been added to the FIFO, and all of it has been transmitted.
-
-#### Stall Conditions
-
-The SPI_HOST IP will temporarily suspend operations if it detects a potential overflow of the RX FIFO or an attempted underflow of the TX FIFO.
-During a stall event, `csb` remains active, and there are no `sck` clock ticks until there is more data to transmit or there is some space to receive more data.
-The `RXSTALL` and `TXSTALL` status bits are meant to inform firmware of such halts.
-Due to implementation details the SPI_HOST IP will also pause, and signal a stall condition, if there are delays related to packing or unpacking the SPI_DATA into 32-bit words.
-The exact conditions for these *transient* stall conditions are implementation dependent, and described in detail in [the Design Details section](#bubbles-in-the-data-pipeline).
-
-### Error Interrupt Conditions
-
-There are six types of error events which each represent a violation of the SPI_HOST programming model:
-- If [`COMMAND`](data/spi_host.hjson#command) is written when [`STATUS.READY`](data/spi_host.hjson#status) is zero, the IP will assert [`ERROR_STATUS.CMDERR`](data/spi_host.hjson#error_status).
-- The IP asserts [`ERROR_STATUS.OVERFLOW`](data/spi_host.hjson#error_status) if it receives a write to [`TXDATA`](data/spi_host.hjson#txdata) when the TX FIFO is full.
-- The IP asserts [`ERROR_STATUS.UNDERFLOW`](data/spi_host.hjson#error_status) if software attempts to read [`RXDATA`](data/spi_host.hjson#rxdata) when the RX FIFO is empty.
-- Specifying a command segment with an invalid width (speed), or making a request for a Bidirectional Dual- or Quad-width segment will trigger an [`ERROR_STATUS.CMDINVAL`](data/spi_host.hjson#error_status) error event.
-- Submitting a command segment to an invalid CSID (one larger or equal to `NumCS`) will trigger a [`ERROR_STATUS.CSIDINVAL`](data/spi_host.hjson#error_status) event.
-- [`ERROR_STATUS.ACCESSINVAL`](data/spi_host.hjson#error_status) is asserted if the IP receives a write event to the [`TXDATA`](data/spi_host.hjson#txdata) window that does not correspond to any known processor data type (byte, half- or full-word).
-
-All of these programming violations will create an error event when they occur.
-They will also halt the IP until the corresponding bit is cleared in the [`ERROR_STATUS`](data/spi_host.hjson#error_status) register.
-Whenever an error event occurs, the error must be acknowledged by clearing (write 1 to clear) the corresponding bit in [`ERROR_STATUS`](data/spi_host.hjson#error_status).
-
-By default all error events will trigger an `error` interrupt.
-Clearing the corresponding bit in the [`ERROR_ENABLE`](data/spi_host.hjson#error_enable) register suppresses interrupts for that class of error event and allows the IP to proceed even if one of these errors has occurred.
-The [`ERROR_STATUS`](data/spi_host.hjson#error_status) register will continue to report all violations even if a particular class of error event has been disabled.
-
-Of the six error event classes, `ACCESSINVAL` error events are the only ones which cannot be disabled.
-This is because `ACCESSINVAL` events are caused by anomalous TLUL byte-enable masks that do not correspond to any known software instructions, and can only occur through a fault in the hardware integration.
-
-When handling SPI_HOST `error` interrupts, the [`ERROR_STATUS`](data/spi_host.hjson#error_status) bit should be cleared *before* clearing the error interrupt in the [`INTR_STATE`](data/spi_host.hjson#intr_state) register.
-Failure to do so may result in a repeated interrupt.
-
-## Status Indicators
-
-The [`STATUS`](data/spi_host.hjson#status) register contains a number of fields that should be queried for successful operation or troubleshooting.
-
-The register [`STATUS.ACTIVE`](data/spi_host.hjson#status) indicates whether a command segment is currently being processed by the FSM.
-Even if [`STATUS.ACTIVE`](data/spi_host.hjson#status) is high it is often still possible to insert another command segment into the command FIFO.
-The register [`STATUS.READY`](data/spi_host.hjson#status) indicates that there is room in the command FIFO.
-
-The [`STATUS.BYTEORDER`](data/spi_host.hjson#status) field indicates the fixed value of the `ByteOrder` parameter, which is presented to software to confirm the byte ordering used in the [`RXDATA`](data/spi_host.hjson#rxdata) and [`TXDATA`](data/spi_host.hjson#txdata) windows.
-
-The 8-bit fields [`STATUS.RXQD`](data/spi_host.hjson#status) and [`STATUS.TXQD`](data/spi_host.hjson#status) respectively indicate the number of words currently stored in the RX and TX FIFOs.
-
-The remaining fields in the [`STATUS`](data/spi_host.hjson#status) register are all flags related to the management of the TX and RX FIFOs, which are described in the [section on SPI Events](#spi-events-and-event-interrupts).
-
-## Other Registers
-
-### SPI_HOST Enable
-
-The SPI_HOST state machine is disabled on reset.
-Before any commands are processed, the block must be enabled by writing one to the [`CONTROL.SPIEN`](data/spi_host.hjson#control) register.
-Writing a zero to this register temporarily suspends any previously submitted transactions.
-If the block is re-enabled by writing a one to [`CONTROL.SPIEN`](data/spi_host.hjson#control), any previously executing commands will continue from wherever they left off.
-
-An unacknowledged error event suspends the core state machine.
-
-### SPI_HOST Output Enable
-
-In addition to enabling the SPI_HOST FSM, the SPI_HOST outputs must also be enabled for successful operation.
-This can be achieved by also setting the [`CONTROL.OUTPUT_EN`](data/spi_host.hjson#control) field when enabling the SPI_HOST FSM.
-
-### Component reset
-
-In addition to the global hardware reset, there is a software reset option which completely resets the SPI host.
-To use this reset, assert [`CONTROL.SW_RST`](data/spi_host.hjson#control), and then wait for the device to reset ([`STATUS.ACTIVE`](data/spi_host.hjson#status), [`STATUS.TXQD`](data/spi_host.hjson#status) and [`STATUS.RXQD`](data/spi_host.hjson#status) to all go to zero), before releasing [`CONTROL.SW_RST`](data/spi_host.hjson#control).
-
-## Block Diagram
-
-![](./doc/spi_host_block_diagram.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](data/spi_host.hjson#interfaces)
-
-# Design Details
-
-## Component Overview
-
-Transaction data words flow through the SPI_HOST IP in a path which starts with the TX FIFOs, shown in the block diagram above.
-At the output of the TX FIFOs each data word is separated into individual bytes by the Byte Select block, which is also responsible for parsing the byte-enable mask and discarding unwanted bytes.
-Selected bytes are then passed into the shift register, where they are played out at Standard, Dual, or Quad speed.
-For receive segments, outputs from the shift register are passed into the Byte Merge block to be packed into 32-bit words.
-Finally the repacked words are inserted into the RX FIFO to be read by firmware.
-
-All of the blocks in the data path use ready-valid handshakes for flow control.
-In addition, the Byte Select block expects a `flush` pulse from the shift register to signify when no further data is needed for the current segment, and so any remaining data in the current word can be discarded.
-Likewise, the Byte Merge block receives a `last` signal from the shift register to identify the end of a command segment so that any partial words can be passed into the RX FIFO (regardless of whether the last byte forms a complete 32-bit word).
-The shift register is then responsible for driving and receiving data on the `cio_sd` lines.
-It coordinates all of the data flow to and from the Byte Select and Byte Merge blocks.
-
-The SPI_HOST FSM parses the software command segments and orchestrates the proper transmission of data through its control of the shift register.
-The FSM directly drives the `cio_sck` and `cio_csb` signals at the commanded speed.
-It also controls the shift register: dictating the correct timing for sending out each beat of data, loading bytes from the Byte Select, and sending bytes on to the Byte Merge block.
-
-## RX and TX FIFOs
-
-The RX and TX FIFOs hold the received and transmitted data respectively, and are both implemented as synchronous FIFOs.
-The RX FIFO is 32 bits wide, matching the width of the TLUL register bus.
-The TX FIFO on the other hand is 36 bits wide, with 32 bits of SPI data (again to match the TLUL bus width) plus 4 byte enable-bits, which are passed into the core to allow the processing of unaligned writes.
-
-The depth of these FIFOs is controlled by two independent parameters for the RX and TX queues.
-
-## Byte Select
-
-The Byte Select unit is responsible for loading words from the FIFO and feeding individual bytes into the shift register.
-This unit takes two data inputs: a data word, `word_i[31:0]`, and a byte enable signal, `word_be_i[3:0]`.
-There is a single output, `byte_o[7:0]`, which feeds the following shift register.
-There are ready/valid signals for managing flow control on all inputs and outputs.
-The shift register asserts ready to request new bytes, based on control inputs from the SPI_HOST FSM.
-
-When the SPI_HOST FSM indicates the final byte for a segment, the shift register asserts the `flush_i` signal with `byte_ready_i` as it requests the last byte from the Byte Select.
-This instructs the Byte Select block to send one more byte from the current word, and then discard any remaining unused bytes, before immediately loading the next available word from the TX FIFO.
-
-It is assumed that the input data-words and byte enables have already been byte-swapped at the IP top level, as needed.
-The bytes are transmitted to the shift register in decreasing significance, starting with `word_i[31:24]`, followed by `word_i[23:16]`, `word_i[15:8]`, and finally `word_i[7:0]`.
-
-Some bytes may be skipped however if the corresponding value of `word_be_i[3:0]` is zero.
-For example if `word_be_i[3:0]` equals `4'b0011`, then the first two input bytes will be skipped, and only `word_i[15:8]` and `word_i[7:0]` will be forwarded, in that order.
-
-The following waveform illustrates the operation of the Byte Select module, highlighting the effect of the `flush_i` signal (in the first input word), as well as the effect of the byte enable signal (shown in the second word).
-
-```wavejson
-{signal: [
-  {name: "clk_i", wave:           "p............."},
-  {name: "word_i[31:0]", wave:    "x2..x2...x....", data: ["32'hBEADCAFE", "32'hDAD5F00D"]},
-  {name: "word_be_i[3:0]",  wave: "x2..x2...x....", data: ["4'b1111", "4'b0011"]},
-  {name: "word_valid_i", wave:    "0..101...0...."},
-  {name: "word_ready_o",wave:     "1...0...10...."},
-  {name: "byte_o[7:0]", wave:     "x...2222.2222x", data: ["BE", "AD", "CA", "0", "DA", "D5", "F0", "0D"]},
-  {name: "byte_valid_o", wave:    "0...1..0...1.0"},
-  {name: "byte_ready_i", wave:    "1............."},
-  {name: "byte_flush_i", wave:    "0.....10......"},
-  ],
-  head: {
-  text: "Byte Select Operation"
-  }
-}
-```
-
-## Byte Merge
-
-The Byte Merge block is responsible for accumulating bytes from the shift register and packing them into words.
-Like the Byte Select block, it is based on the `prim_packer_fifo` primitive.
-
-The Byte Merge block has a data byte input, and a data word output, which are both controlled by their corresponding ready/valid signals.
-There are no byte-enable outputs for the byte merge, as it is assumed that software can infer the relevant bytes based on the length of the relevant read command segment.
-
-There is a `byte_last_i` signal to indicate the final byte in a word.
-If `byte_last_i` is asserted whenever a byte is loaded, the new byte will be added to the output word, and any remaining bytes will be set to zero, before the word is loaded into the RX FIFO.
-
-Input bytes are packed into the output word in decreasing significance.
-The first byte in each segment is loaded into `word_o[31:24]`.
-The following bytes are packed into `word_o[23:16]`, `word_o[15:8]`, and then `word_o[7:0]`.
-For partially filled words, the zero padding goes into the least significant byte positions.
-
-Any ByteOrder swapping is performed at the other end of the RX FIFO.
-
-```wavejson
-{signal: [
-  {name: "clk_i",        wave: "p.............."},
-  {name: "byte_i[7:0]",  wave: "x22222.2....22x", data: ["01", "02", "03", "04", "05", "06", "07", "08"]},
-  {name: "byte_valid_i", wave: "01............."},
-  {name: "byte_last_i",  wave: "0....1.0......."},
-  {name: "byte_ready_o", wave: "1....010...1..."},
-  {name: "word_o[31:0]", wave: "2.2222222222222", data: ["0", "01","0102","010203", "01020304", "0", "05", "0500", "05000", "050000", "0", "06", "0607", "060708"]},
-  {name: "word_valid_o", wave: "0....10...10..."},
-  {name: "word_ready_i", wave: "1.............."}
-  ],
- config: {hscale:2},
-  head: {
-  text: "Byte Merge Operation"
-  }
-}
-```
-
-## Shift Register
-
-The SPI_HOST shift register serially transmits and receives all bytes to the `sd_o[3:0]` and `sd_i[3:0]` signals, based on the following timing-control signals from the FSM:
-- `speed_i`: Controls the speed of the current data segment, ranging from `Standard` or `Dual` to `Quad`
-- `wr_en_i`: Writes a new byte from the Byte Select into the 8-bit shift register
-This is usually the first signal issued to the shift register in command segments with data to transmit (i.e., TX only, or bidirectional segments)
-   - There is also a `wr_ready_o` output to tell the FSM that there is no data currently available.
-     If `wr_ready_o` is deasserted when the FSM asserts `wr_en_i`, the FSM will stall.
-- `last_write_i`: When asserted at the same time as `wr_en_i`, this indicates that the current byte is the last of its command segment, and thus the `tx_flush_o` signal should be asserted when requesting this byte from the Byte Select block.
-- `shift_en_i`: Advances the shift register by 1, 2, or 4 bits, depending on the value of `speed_i`
-- `full_cyc_i`: Indicates full-cycle operation (i.e., input data are sampled from `sd_i` whenever new data is shifted out to `sd_o`)
-- `sample_en_i`: Samples `sd_i[3:0]` into a temporary register, `sd_i_q[3:0]` so it can be loaded into the shift register with the next assertion of `shift_en_i`
-Explicit sampling is particularly required for Standard SPI bidirectional segments, where new input data arrives before the first output shift operation.
-For consistency in timing, the `sd_i_q` buffer is used in all other modes as well, unless `full_cyc_i` is asserted.
-The `sample_en_i` signal is ignored during full-cycle operation, in which case data is copied directly into the shift register during shift operations.
-- `rd_en_i`: Indicates that the current byte from the shift register should be transferred on to the Byte Merge block
-   - The `rd_ready_o` output informs the FSM whenever all data storage (the RX FIFO plus any intervening buffers) is full and no further data can be acquired.
-- `last_read_i`: When asserted at the same time as `rd_en_i`, this indicates that the current byte is the last of its command segment, and thus the `rx_last_o` signal should be asserted when passing this byte to the Byte Merge block.
-
-```wavejson
-{signal: [
-  {name: "clk_i",                   wave: "p.........................."},
- [ "External signals",
-  {name: "TX DATA[31:0] (TX FIFO)", wave: "2..........................", data:"0x123456XX"},
-  {name: "cio_sck_o (FSM)",         wave: "0...1010101010101010101010."},
- ],
-  {name: "cio_csb_o[0] (FSM)",      wave: "1..0......................."},
-  {name: "tx_data_i[7:0]",          wave: "2..2...............2.......", data:["0x12", "0x34", "0x56"]},
-  {name: "tx_valid_i",              wave: "1.........................."},
-  {name: "tx_ready_o/wr_en_i",      wave: "0.10..............10......."},
-  {name: "sample_en_i",             wave: "0..101010101010101010101010"},
-  {name: "shift_en_i",              wave: "0...10101010101010..1010101"},
-  {name: "speed_i[1:0]",            wave: "2..........................", data: ["0 (Standard SPI)"]},
-  {name: "sd_i[1]",                 wave: "x..1.1.0.0.1.1.1.1.0.1.0.1."},
-  {name: "sd_i_q[1]",               wave: "x...1.1.0.0.1.1.1.1.0.1.0.1"},
-  {name: "sr_q[0]",                 wave: "x..0.1.1.0.0.1.1.1.0.1.0.1."},
-  {name: "sr_q[1]",                 wave: "x..1.0.1.1.0.0.1.1.0.0.1.0."},
-  {name: "sr_q[2]",                 wave: "x..0.1.0.1.1.0.0.1.1.0.0.1."},
-  {name: "sr_q[3]",                 wave: "x..0.0.1.0.1.1.0.0.0.1.0.0."},
-  {name: "sr_q[4]",                 wave: "x..1.0.0.1.0.1.1.0.1.0.1.0."},
-  {name: "sr_q[5]",                 wave: "x..0.1.0.0.1.0.1.1.1.1.0.1."},
-  {name: "sr_q[6]",                 wave: "x..0.0.1.0.0.1.0.1.0.1.1.0."},
-  {name: "sr_q[7]",                 wave: "x..0.0.0.1.0.0.1.0.0.0.1.1."},
-  {name: "sr_q[7:0] (hex)",         wave: "x..4.2.2.2.2.2.2.2.4.2.2.2.",
-   data: ["0x12", "0x25", "0x4B", "0x96", "0x2c", "0x59", "0xB3", "0x67", "0x34", "0x69", "0xD2", "0xA5"]},
-  {name: "Load Input Data Event",   wave: "1..H...............H......."},
-  {name: "rx_data_o[7:0]", wave: "x..................2.......", data: ["0xcf"]},
-  {name: "rx_valid_o[7:0]/rd_en_i", wave: "0.................10......."},
-  {name: "sd_o[0] (sr_q[7])", wave: "x..0.0.0.1.0.0.1.0.0.0.1.1."},
-],
-head: {
-  text: "Shift Register During Standard SPI Transaction: Simultaneous Receipt and Transmission of Data."
-},
-}
-```
-
-The connection from the shift register to the `sd` bus depends on the speed of the current segment.
-- In Standard-mode, only the most significant shift register bit, `sr_q[7]` is connected to the outputs using `sd_o[0]`.
-In this mode, each `shift_en_i` pulse induces a shift of only one bit.
-- In Dual-mode, the two most significant bits, `sr_q[7:6]`, are connected to `sd_o[1:0]` and the shift register shifts by two bits with every `shift_en_i` pulse.
-- In Quad-mode, the four most significant bits, `sr_q[7:4]`, are connected to `sd_o[3:0]` and the shift register shifts four bits with every pulse.
-
-The connections to the shift register inputs are similar.
-Depending on the speed, the `sd_i` inputs are routed to the 1, 2, or 4 least significant inputs of the shift register.
-In full-cycle mode, the shift register LSB's are updated directly from the `sd_i` inputs.
-Otherwise the data first passes through an input sampling register, `sd_i_q[3:0]`, which allows the input sampling events to be staggered from the output shift events.
-
-### Bubbles in the Data Pipeline
-
-Temporary delays in the transmission or receipt of data are a performance issue.
-Stall events, which temporarily halt operation of the SPI_HOST IP, often indicate that software is not keeping up with data in the TX and RX FIFOs.
-For this reason the SPI_HOST IP can create interrupts to help monitor the frequency of these stall events, in order to identify correctable performance delays.
-
-There is also the possibility of encountering bubble events, which cause transient stalls in the data pipeline.
-Transient stalls only occur for Quad-mode segments, and only when transmitting or receiving words with only one valid byte.
-
-When transmitting at full clock speed, Quad-mode segments need to process one byte every four clock cycles.
-If a particular Quad TX segment pulls only one byte from a particular data word (for reasons related either to the segment length or odd data alignment), the `prim_packer_fifo` used in the Byte Select block can generate delays of up to four clocks before releasing the next byte.
-This can cause temporary stall conditions either during the Quad segment, or--if there is another TX segment immediately following--just before the following segment.
-
-Similar delays exist when receiving Quad-mode data, which are similarly worst when packing words with just one byte (i.e., when receiving segments of length 4n+1).
-The RX pipeline is however much more robust to such delays, thanks to buffering in the shift register outputs.
-There is some sensitivity to *repeated* 4 clock delays, but it takes at least six of them to cause a temporary stall.
-So transient RX stalls only occur when receiving more than six consecutive one-byte segments.
-As this is an unlikely use case, transient stalls are considered an unlikely occurrence in the RX path.
-
-Dual- and Standard-mode segments can tolerate byte-to-byte delays of 7 or 15 clocks, so there is no known mechanism for transient stalls at these speeds.
-
-Please refer to [the Appendix](#analysis-of-transient-datapath-stalls) for a detailed analysis of transient stall events.
-
-## SPI_HOST Finite State Machine (FSM)
-
-The SPI_HOST FSM is responsible for parsing the input command segments and configuration settings, which it uses to control the timing of the `sck` and `csb` signals.
-It also controls the timing of shift register operations, coordinating I/O on the `sd` bus with the other SPI signals.
-
-This section describes the SPI_HOST FSM and its control of the `sck` and `csb` lines as well as its interactions with the Shift Register and the Command FIFO.
-
-### Clock Divider
-
-The SPI_HOST FSM is driven by the rising edge of the input clock, however the FSM state registers are not *enabled* during every cycle.
-There is an internal clock counter `clk_cntr_q` which repeatedly counts down from [`CONFIGOPTS.CLKDIV`](data/spi_host.hjson#configopts) to 0, and the FSM is only enabled when `clk_cntr_q == 0`.
-
-The exception is when the FSM is in one of the two possible Idle states (`Idle` or `IdleCSBActive`), in which case `clk_cntr_q` is constantly held at zero, making it possible to immediately transition out of the idle state as soon as a new command appears.
-Once the FSM transitions out of the idle state, `clk_cntr_q` resets to [`CONFIGOPTS.CLKDIV`](data/spi_host.hjson#configopts), and FSM transitions are only enabled at the divided clock rate.
-
-As shown in the waveform below, this has the effect of limiting the FSM transitions to only occur at discrete *timeslices* of duration:
-
-$$T_\textrm{timeslice} = (\texttt{clkdiv}+1) \cdot T_{\textrm{clk}}.$$
-
-```wavejson
-{signal: [
-  {name: 'clk',        wave: 'p......................'},
-  {name: 'clkdiv',     wave: '2......................', data: "3"},
-  {name: 'clk_cntr_q', wave: '222222222222......22222', data: "3 2 1 0 3 2 1 0 3 2 1 0 3 2 1 0 3"},
-  {name: 'FSM state',  wave: '2...2.......2.....2...2', data: "WaitTrail WaitIdle Idle WaitLead Hi"              },
-  {name: 'fsm_en',     wave: '0..10......1......0..10'              },
-  {name: 'Timeslice Boundary', wave: "1...H...H...H.....H...H"}
-],
-  edge: ["A<->B min. 9 cycles", "C<->D min. 4 cycles"],
- head: {text: "Use of FSM Enable Pulses to Realize Multi-Clock Timeslices", tock: 1},
- foot: { text: "The fsm_en signal is always high in idle states, to allow exit transitions at any time"}
-}
-```
-
-#### Other Internal Counters
-
-In addition to the FSM state register, the SPI_HOST FSM block also has a number of internal registers to track the progress of a given command segment.
-
-- `wait_cntr_q`: This counter is used to hold the FSM in a particular state for several timeslices, in order to implement the `CSNIDLE`, `CSNLEAD` or `CSNTRAIL` delays required for a particular device.
-
-- `byte_cntr_q`, `bit_cntr_q`: These counters respectively track the number of bytes left to transmit for the current segment and the number of bits left to transmit in the current byte.
-
-- Finally, there are registers corresponding to each configuration field (`csid_q`, `cpol_q`, `cpha_q`, `csnidle_q`, `csnlead_q`, `csntrail_q`, and `full_cyc_q`) and to each command segment field (`csaat`, `cmd_rd_en`, `cmd_wr_en`, and `cmd_speed`).
-These registers are sampled whenever a new command comes in, allowing the command inputs to change.
-
-### Basic Operation
-
-The state machine itself is easiest understood by first considering a simple case, with CSAAT set to zero.
-For this initial discussion it is assumed that every command consists of one single segment.
-Multi-segment commands are considered in the following sections.
-In this case the state machine can be simplified to the following figure.
-
-![](./doc/spi_host_fsm_simplified.svg)
-
-The operation of the state machine is the same regardless of the polarity (CPOL) or phase (CPHA) of the current command.
-Commands with `CPOL==0` or `CPOL==1` are processed nearly identically, since the only difference is in the polarity of the `sck` output.
-The state machine drives an internal `sck` clock signal, which is low except when the FSM is in the `InternalClkHigh` state.
-If `CPOL==0` this clock is registered as is to the external `sck` ports.
-If `CPOL==1` the internal clock is *inverted* before the final `sck` output register.
-
-In the following description of the individual states, it is assumed that `CPOL==0`.
-To understand the IP's behavior for transactions with `CPOL==1`, simply invert the value of `sck`.
-
-1. Idle state: In this initial reset state, the `sck` signal is low, and all `csb` lines are high (i.e., inactive).
-An input command is registered whenever `command_valid_i` and `command_ready_o` are both high (i.e., when the signal `new_command = command_valid_i & command_ready_o` is high), in which case the state machine transitions to the `WaitLead` state.
-
-2. WaitLead state: In this state, `sck` remains low, and the `csb` line corresponding to `csid` is asserted-low.
-The purpose of this state is to hold `sck` low for at least `csnlead` + 1 timeslices, before the first rising edge of `sck`.
-For this reason, the FSM uses the `wait_cntr` to track the number of timeslices spent in this state, and only exits when `wait_cntr` counts down to zero, at which point the FSM transitions to the `InternalClkHigh` state.
-
-3. InternalClkHigh state: Entering this state drives `sck` high.
-This state repeats many times per segment, and usually transitions to the `InternalClkLow` state.
-The FSM transitions to the `WaitTrail` state only when the entire segment has been transmitted/received (as indicated by the signals last_bit and last_byte).
-The state machine usually only stays in this state for one timeslice, except when the FSM is disabled or stalled.
-
-4. InternalClkLow state: This state serves to drive `sck` low between visits to the `InternalClkHigh` state.
-This state always returns back to the `InternalClkHigh` state in the next timeslice.
-
-5. WaitTrail state: Similar to the WaitLead, this state serves to control the timing of the `csb` line.
-The FSM uses the `wait_cntr` register to ensure that it remains in this state for `csntrail+1` timeslices, during which time the active `csb` is still held low.
-The `wait_cntr` register resets to [`CONFIGOPTS.CSNTRAIL`](data/spi_host.hjson#configopts) upon entering this state, and is decremented once per timeslice.
-This state transitions to `WaitIdle` when `wait_cntr` is zero.
-
-6. WaitIdle state: In this timing control state, the FSM uses the `wait_cntr` register to ensure that all `csb` lines are held high for at least `csnidle+1` timeslices.
-The `wait_cntr` register resets to [`CONFIGOPTS.CSNIDLE`](data/spi_host.hjson#configopts) upon entering this state, and is decremented once per timeslice.
-This state transitions to `Idle` when `wait_cntr` reaches zero.
-
-```wavejson
-{signal: [
-  {name: 'clk', wave: 'p...............'},
-  {name: 'rst_n', wave: '01..............'},
-  {name: 'state', wave: 'x22.42424242.2.2', data: ['Idle', 'WaitLead', 'IntClkHigh', 'IntClkLow', 'IntClkHigh', 'IntClkLow', 'IntClkHigh', 'IntClkLow','IntClkHigh', 'WaitTrail', 'WaitIdle', 'Idle']},
-  {name: 'csb (active device)', wave: 'x10..........1..'},
-  {name: 'csb (all others)', wave: '1...............'},
-  {name: 'sck', wave: '0...10101010....'}
-],
- config: {hscale: 2}
-}
-```
-
-### Milestone Signals, Serial Data Lines & Shift Register Control
-
-The FSM manages I/O on the `sd` bus by controlling the timing of the shift register control signals: `shift_en_o`, `sample_en_o`, `rd_en_o`, `last_read_o`, `wr_en_o`, and `last_write_o`.
-
-The shift register control signals are managed through the use of three intermediate signals:
-- `byte_starting`: This signal indicates the start of a new byte on the `sd` bus in the *following* clock cycle.
-For Bidirectional or TX segments this signal would indicate that it is time to load a new byte into the shift register.
-This signal corresponds to the FSM's `wr_en_o` port, though that output is suppressed during RX or dummy segments.
-- `byte_ending`: This signal indicates the end of the current `sd` byte in the *current* clock cycle (i.e., the next clock cycle either marks the beginning of a new byte or the end of the current segment).
-As illustrated in the following waveform, the `byte_starting` and `byte_ending` signals are often asserted at the same time, though there is an extra `byte_starting` pulse at the beginning of each command and an extra `byte_ending` pulse at the end.
-For RX and bidirectional command segments, a `byte_ending` pulse generates a `rd_en_o` pulse to the shift register, which transfers the 8-bit contents of the shift register into the RX FIFO via the Byte Merge block.
-- `bit_shifting`: This signal drives the `shift_en_o` control line to the shift register, ejecting the most-significant bits, and updating the `sd` outputs.
-
-These *milestone signals* mark the progress of each command segment.
-
-The coordination of the milestone signals and the shift register controls is shown in the following waveform.
-Since the milestone signal pulses coincide with *entering* particular FSM states, they are derived from the state register *inputs* (i.e., `state_d`), as opposed to the state register outputs (`state_q`).
-
-```wavejson
-{signal: [
-  {name: 'clk', wave: 'p........................'},
-  {name: 'rst_n', wave: '01.......................'},
-  {name: 'state_q',
-   wave: 'x2.2.42424242424242424242', data: "Idle WL Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo",
-   node: '...W..V.............U'},
-  {name: 'csb', wave: 'x1.0.....................'},
-  {name: 'sck', wave: '0....10101010101010101010'},
-  {name: 'state_d',
-   wave: 'x22.42424242424242424242', data: "Idle WL Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo",
-   node: '..Z..Y.............X'},
-  {name: 'byte_starting / wr_en_o',
-   wave: 'x010...............10....',
-   node: '..A................E'},
-  {name: 'byte_ending / rd_en_o',
-   wave: 'x0.................10....',
-   node: '...................F'},
-  {name: 'bit_shifting / shift_en_o',
-   wave: 'x0...10101010101010..1010',
-   node: '.....C'},
-  {name: 'sample_en_o',
-   wave: 'x0.10.1010101010101010101',
-   node: '...B..D'},
-  {name: 'sample_event',
-   wave: '1...H..H.H.H.H.H.H.H.H.H.'},
-  {name:'sd_o',
-   wave:'x..2..2.2.2.2.2.2.2.2.2.2',
-   node:'',
-   data: "A[7] A[6] A[5] A[4] A[3] A[2] A[1] A[0] B[7] B[6]"},
-  {name: 'bit_cntr_q', wave: 'x2.2..2.2.2.2.2.2.2.2.2.2', data: "0 7 6 5 4 3 2 1 0 7 6 5"},
-  {name: 'byte_cntr_q', wave: 'x2.2................2....', data: "0 N N-1"},
-
-],
-edge: ['A-~>B', 'C-~>D', 'Z-~>A', 'Y-~>C', 'X-~>E', 'X-~>F', 'Z-~>W', 'Y-~>V', 'X-~>U'],
-config: {hscale: 1},
-head: {text: "Timing Relationship between FSM states, Milestone Signals, and Shift Register controls (with CPHA=0)"},
-foot: {text: "Key: WL=\"WaitLead\", Hi=\"InternalClkHigh\", Lo=\"InternalClkLow\" "}
-}
-```
-
-When working from a CPHA=0 configuration, the milestone signals are directly controlled by transitions in the FSM state register, as described in the following table.
-
-<table>
-<thead><tr>
-<th>Milestone Signal</th><th>FSM Triggers</th>
-</tr></thead>
-<tbody>
-<tr><td rowspan=2><tt>byte_starting</tt></td><td>Entering <tt>WaitLead</tt></td></tr>
-<tr><td>Entering <tt>InternalClkLow</tt> and <tt>bit_cntr == 0 </tt> </td></tr>
-<tr><td><tt>bit_shifting</tt></td><td>Entering <tt>InternalClkLow</tt> and <tt>bit_cntr != 0</tt></td></tr>
-<tr><td><tt>byte_ending</tt></td><td>Exiting <tt>InternalClkHigh</tt> and <tt>bit_cntr == 0</tt></td></tr>
-</tbody>
-</table>
-
-When working from a CPHA=1 configuration, the milestone signals exploit the fact that there is usually a unique correspondence between `csb`/`sck` events and FSM transitions.
-There are some exceptions to this pattern since, as discussed below, CSAAT- and multi-csb-support requires the creation of multiple flavors of idle states.
-However, there are no milestone signal pulses in any of the transitions between these various idle states.
-Thus in CPHA=1 mode, the milestone signals are delayed by one state transition.
-For example, in a CPHA=0 configuration the first data burst should be transmitted as the `csb` line is asserted low, that is, when the FSM enters the WaitLead state.
-Thus a `byte_starting` pulse is generated at this transition.
-On the other hand, in CPHA=1 configuration the first data burst should be transmitted after the first edge of `sck`, which happens on the next state transition as illustrated in the following waveform.
-
-That said, there are two copies of each milestone signal:
-- the original FSM-driven copy, for use when operating with CPHA=0, and
-- a delayed copy, for use in CPHA=1 operation.
-
-```wavejson
-{signal: [
-  {name: 'clk', wave: 'p......................'},
-  {name: 'rst_n', wave: '01.....................'},
-  {name: 'state_q',
-   wave: 'x2.2.4242424242424242.2', data: "Idle WL Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi WT WI",
-   node: '...W..V.....U..........'},
-  {name: 'state_d',
-   wave: 'x22.4242424242424242.2', data: "Idle WL Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi WT WI",
-   node: '..Z..Y.....X..........'},
-  {name: 'byte_starting_cpha0',
-   wave: 'x010.......10..........',
-   node: '..A........C...........'},
-  {name: 'byte_starting_cpha1',
-   wave: 'x0..10......10.........',
-   node: '....B.......D..........'},
-  {name: 'byte_ending_cpha0',
-   wave: 'x0.........10......10..',
-   node: '...........E...........'},
-  {name: 'byte_ending_cpha1',
-   wave: 'x0..........10......10.',
-   node: '............F..........'},
-  {name: 'bit_shifting_cpha0',
-   wave: 'x0...101010..101010....',
-   node: '.....G...I...K.........'},
-  {name: 'bit_shifting_cpha1',
-   wave: 'x0....101010..101010...',
-   node: '......H...J...L'},
-  {name: 'csb', wave: 'x1.0..................1'},
-  {name: 'sck', wave: '0....1010101010101010..'},
-  ["CPHA=0",
-   {name: 'byte_starting',
-    wave: 'x010.......10..........'},
-   {name: 'bit_shifting',
-    wave: 'x0...101010..101010....'},
-  {name: 'bit_cntr_q', wave: 'x2.2..2.2.2.2.2.2.2....',
-   data: "0 6 4 2 0 6 4 2 0"},
-  {name: 'byte_cntr_q', wave: 'x2.2........2..........',
-   data: "0 1 0"},
-  {name:'sd_o',
-   wave:'x0.2..2.2.2.2.2.2.2...0',
-   node:'',
-   data: "A[7:6] A[5:4] A[3:2] A[1:0] B[7:6] B[5:4] B[3:2] B[1:0]"}
-  ],
-  ["CPHA=1",
-   {name: 'byte_starting',
-    wave: 'x0..10......10.........'},
-   {name: 'bit_shifting',
-    wave: 'x0....101010..101010...'},
-   {name: 'byte_ending',
-    wave: 'x0..........10......10.'},
-  {name: 'bit_cntr_q', wave: 'x2...2.2.2.2.2.2.2.2...',
-   data: "0 6 4 2 0 6 4 2 0"},
-  {name: 'byte_cntr_q',
-   wave: 'x2.2.........2.........',
-   data: "0 1 0"},
-  {name:'sd_o',
-   wave:'x0...2.2.2.2.2.2.2.2..0',
-   node:'',
-   data: "A[7:6] A[5:4] A[3:2] A[1:0] B[7:6] B[5:4] B[3:2] B[1:0]"}
-  ],
-],
-edge: ['Z-~>A','Y-~>G', 'X-~>C', 'X-~>E','A->B', 'C->D', 'E->F', 'G->H', 'I->J', 'K->L', 'Z->W', 'Y->V', 'X->U'],
-config: {hscale: 1},
-head: {text: "Comparison of Milestone Signals in CPHA=0 vs. CPHA=1 configuration (for a dual speed segment)"},
-foot: {text: "Key: WL=\"WaitLead\", Hi=\"InternalClkHigh\", Lo=\"InternalClkLow\", WT=\"WaitTrail\""}
-}
-```
-
-### Milestone Signals and Control of the Bit and Byte Counters
-
-The previous waveform also highlights the relationship between the milestone signals and the bit and byte counters.
-At the beginning of each byte `bit_cntr_q` is reset to a speed-specific value, to trigger the correct number of shift operations required for each byte:
-- 7 for Standard-mode
-- 6 for Dual-mode
-- 4 for Quad-mode
-
-The reset of the `bit_cntr_q` counter is triggered by the `byte_starting` register.
-Meanwhile the `bit_shifting` signal triggers a decrement of the bit-shifting register.
-The size of the decrement also depends on the speed of the current segment:
-- 1 for Standard-mode
-- 2 for Dual-mode
-- 4 for Quad-mode
-
-The `byte_cntr_q` register is updated from the [`COMMAND.LEN`](data/spi_host.hjson#command) register value, at the beginning of each segment, and decremented after each `byte_ending` pulse until the counter reaches zero.
-
-This relationship between the milestone signals and the bit and byte counters is also illustrated in the previous waveform.
-
-### Implementation of Configuration Change Delays
-
-As described in the [Theory of Operation](#idle-time-delays-when-changing-configurations), changes in configuration only occur when the SPI_HOST is idle.
-The configuration change must be preceded by enough idle time to satisfy the previous configuration, and followed by enough idle time to satisfy the new configuration.
-
-In order to support these idle time requirements, the SPI_HOST FSM has two idle waiting states.
-- The `WaitIdle` state manages the idle time requirements of the *preceding* command segment, and usually transitions to the `Idle` state afterwards.
-- From the `Idle` state the FSM monitors for changes in configuration, and transitions to the `ConfigSwitch` state if any changes are detected in the next incoming command segment.
-This state introduces delays long enough to satisfy the idle time requirements of the *following* command segment.
-From the `ConfigSwitch` state, the state machine directly enters the `WaitLead` state to start the next command segment.
-
-A complete state diagram, including the `ConfigSwitch` state, is shown in the following section.
-
-The following waveform illustrates how a change in a single [`CONFIGOPTS`](data/spi_host.hjson#configopts), here [`CONFIGOPTS.CPOL`](data/spi_host.hjson#configopts), triggers an entry into the `ConfigSwitch` Idle state, and how the new configuration is applied at the transition from `WaitIdle` to `ConfigSwitch` thereby ensuring ample idle time both before and after the configuration update.
-
-```wavejson
-{signal: [
-  {name: 'clk',                       wave: 'p.................'},
-  {name: 'command_i.csid',            wave: '2.................', data: ["0"]},
-  {name: 'command_i.configopts.cpol', wave: '1........x........'},
-  {name: 'cpol_q',                    wave: '0........1........'},
-  {name: 'switch_required',           wave: '1........x........'},
-  {name: 'command_valid_i',           wave: '1........0........'},
-  {name: 'command_ready_i',           wave: '0.......10........'},
-  {name: 'FSM state',                 wave: '2222..2..2..2..222', data: ["Hi", "Lo", "Hi", "WaitTrail", "WaitIdle", "ConfigSwitch", "WaitLead", "Hi", "Lo", "Hi"]},
-  {name: 'csb[0]',                    wave: '0.....1.....0.....'},
-  {name: 'sck',                       wave: '1010.....1.....010'},
-  {name: 'configuration update event', wave: '1........H........'}
-],
-  edge: ["A<->B min. 9 cycles", "C<->D min. 4 cycles"],
-  head: {text: "Extension of CSB Idle Pulse Due to CPOL Configuration Switch", tock: 1},
-  foot: { text: "(Note: Due to the presence of a valid command, the FSM transitions directly from WaitIdle to ConfigSwitch)"}
-}
-```
-
-### CSAAT Support
-
-In addition to omitting the `ConfigSwitch` state, the simplified state machine illustrated above does not take into account commands with multiple segments, where the CSAAT bit is enabled for all but the last segment.
-
-When the CSAAT bit is enabled there is no idle period between the current segment and the next, nor are the two adjoining segments separated by a Trail or Lead period.
-Usually the end of each segment is detected in the `InternalClkHigh` state, at which point, if CSAAT is disabled, the FSM transitions to the `WaitTrail` state to close out the transaction.
-However, if CSAAT is enabled the `WaitTrail` state is skipped, and the next state depends on whether there is another command segment available for processing (i.e., `command_ready_o` and `command_valid_i` are both asserted).
-
-In order to support seamless, back-to-back segments the `ConfigSwitch` state can be skipped if a new segment is already available when the previous ends, in which case the FSM transitions directly to the `InternalClkLow` at the end of the previous segment.
-
-If there is no segment available yet, the FSM must pause and idly wait for the next command in the special `IdleCSBActive` state.
-This state serves a similar purpose to the `Idle` state since in this state the IP is doing nothing but waiting for new commands.
-It is different from the `Idle` state though in that during this state the active `csb` is held low.
-When a command segment is received in the `IdleCSBActive` state, it transitions immediately to the `InternalClkLow` state to generate the next `sck` pulse and process the next segment.
-
-```wavejson
-{signal: [
-  {name: 'clk', wave: 'p...........'},
-  {name: 'command_ready_o', wave: '0.1....0....'},
-  {name: 'command_valid_i', wave: '0.....10....'},
-  {name: 'new_command',     wave: '0.....10....'},
-  {name: 'state',           wave: '2222...22222', data: ["Hi", "Lo", "Hi", "IdleCSBActive", "Lo", "Hi", "Lo", "Hi", "Lo"]},
-  {name: 'sck (CPOL=0)',    wave: '1010....1010'},
-  {name: 'sd (CPHA=0)',     wave: '35.....3.4.5'}
- ],
-  edge: ["A<->B min. 9 cycles", "C<->D min. 4 cycles"],
-  head: {text: "Idling While CS Active", tock: 1}
-}
-```
-
-The following figure shows the complete state transition diagram for the SPI_HOST FSM.
-
-![](./doc/spi_host_fsm_complete.svg)
-
-### Skipped idle states
-
-The `Idle` and `IdleCSBActive` states are unique from the others in that:
-1. In order to respond to an incoming command the FSM can exit these idle states at any time, regardless of the current timeslice definition.
-(In fact, since different commands may use different values for the `CLKDIV` configuration parameter, the concept of a timeslice is poorly defined when idle).
-2. These idle states may be *bypassed* in order to support more efficient transitions from one command segment to the next.
-If an incoming command is detected as the FSM is about to enter an idle state, that idle state is skipped, and the FSM immediately transitions to the next logical state, based on the contents of the new incoming command.
-
-These bypassable states, which are highlighted in the previous diagram, represent a number of possible transitions from one *pre-idle* state to a following *post-idle* state.
-For clarity such transitions are left implicit in the diagram above.
-However they could also be explicitly added to the state diagram.
-For example, the implicit transitions around the `Idle` are shown in the following figure.
-
-![](./doc/spi_host_bypassable_state.svg)
-
-### Stall
-
-Whenever the shift register needs to transfer data in (or out) of the RX (TX) FIFOs, but they are full (or empty), the FSM immediately stalls to wait for new data.
-
-During this stall period none of the FSM internal registers are updated.
-Normal operation proceeds only when the stall condition has been resolved or the SPI_HOST has been reset.
-
-In the SPI_HOST FSM this is realized by disabling all flop updates whenever a stall is detected.
-
-Furthermore, all control signals out of the FSM are suppressed during a stall condition.
-
-From an implementation standpoint, the presence of a stall condition has two effects on the SPI_HOST FSM:
-1. No flops or registers may be updated during a stall condition.
-Thus the FSM may not progress while stalled.
-
-2. All handshaking or control signals to other blocks must be suppressed during a stall condition, placing backpressure on the rest of the blocks within the IP to also stop operations until the stall is resolved.
-
-# Programmer's Guide
-
-The operation of the SPI_HOST IP proceeds in seven general steps.
-
-To initialize the IP:
-1. Program the [`CONFIGOPTS`](data/spi_host.hjson#configopts) multi-register with the appropriate timing and polarity settings for each `csb` line.
-2. Set the desired interrupt parameters
-3. Enable the IP
-
-Then for each command:
-
-4. Load the data to be transmitted into the FIFO using the [`TXDATA`](data/spi_host.hjson#txdata) memory window.
-5. Specify the target device by programming the [`CSID`](data/spi_host.hjson#csid)
-6. Specify the structure of the command by writing each segment into the [`COMMAND`](data/spi_host.hjson#command) register
-   - For multi-segment transactions, be sure to assert [`COMMAND.CSAAT`](data/spi_host.hjson#command) for all but the last command segment
-7. For transactions which expect to receive a reply, the data can then be read back from the [`RXDATA`](data/spi_host.hjson#rxdata) window.
-
-These latter four steps are then repeated for each command.
-Each step is covered in detail in the following sections.
-
-For concreteness, this Programmer's Guide uses examples from one of our primary target devices, the [W25Q01JV flash from Winbond](https://www.winbond.com/resource-files/W25Q01JV%20SPI%20RevB%2011132019.pdf).
-The SPI_HOST IP is however suitable for interacting with any number of SPI devices, and the same mode of operation can be used for any SPI device.
-
-## Initializing the IP
-
-### Per-target Configuration
-
-The [`CONFIGOPTS`](data/spi_host.hjson#configopts) multi-register must be programmed to reflect the requirements of the attached target devices.
-As such these registers can be programmed once at initialization, or whenever a new device is connected (e.g., via changes in the external pin connections, or changes in the pinmux configuration).
-The proper settings for the [`CONFIGOPTS`](data/spi_host.hjson#configopts) fields (e.g., CPOL and CPHA, clock divider, ratios, and other timing or sampling requirements) will all depend on the specific device attached as well as the board level delays.
-
-### Interrupt configuration
-
-The next step is to configure the interrupts for the SPI_HOST.
-This should also be done at initialization using the following register fields:
-- The [`ERROR_ENABLE`](data/spi_host.hjson#error_enable) register should be configured to indicate what types of error conditions (if any) should be ignored to not trigger an interrupt.
-At reset, these fields are all set indicating that all error classes trigger an interrupt.
-
-- For interrupt driven I/O the [`EVENT_ENABLE`](data/spi_host.hjson#event_enable) register must be configured to select the desired event interrupts to signal the desired conditions (e.g. "FIFO empty", "FIFO at the watermark level", or "ready for next command segment").
-By default, this register is all zeros, meaning all event interrupts are disabled, and thus all transactions must be managed by polling the status register.
-   - When using the FIFO watermarks to send interrupts, the watermark levels must be set via the [`CONTROL.RX_WATERMARK`](data/spi_host.hjson#control) and [`CONTROL.TX_WATERMARK`](data/spi_host.hjson#control) fields.
-
-- The event and error interrupts must finally be enabled using the [`INTR_ENABLE`](data/spi_host.hjson#intr_enable) register.
-
-### Enabling the SPI_HOST
-
-The IP must be enabled before sending the first command by asserting the [`CONTROL.SPIEN`](data/spi_host.hjson#control) bit.
-
-## Issuing Transactions
-
-As mentioned above, each command is typically specified in three phases: loading the TX data, specifying the command segments/format, and reading the RX data.
-In principle, the first two steps can be performed in either order.
-If the SPI_HOST does not find any data to transmit it will simply stall until data is inserted.
-Meanwhile, the RX data is only available after the command format has been specified and processed by the state machine.
-
-For longer transactions, with data larger than the capacity of the FIFOs, the command sequence may become more complex.
-For instance, to send 1024 bytes of data in a single transaction, the TX data may need to be loaded several times if using a 256-byte FIFO.
-In this instance, the programming sequence will consist of at least four iterations of entering TX data and waiting for the TX FIFO to drain.
-
-### Loading TX data
-
-SPI transactions expect each command to start with some command sequence from the host, and so usually data will be transmitted at least in the first command segment.
-The [`TXDATA`](data/spi_host.hjson#txdata) window provides a simple interface to the TX FIFO.
-Data can be written to the window using 8-, 16- or 32-bit instructions.
-
-Some attention, however, should be paid to byte-ordering and segmenting conventions.
-
-#### Byte-ordering Conventions
-
-For SPI flash applications, it is generally assumed that most of the *payload* data will be directly copied from embedded SRAM to the flash device.
-
-If this data is to be copied to the [`TXDATA`](data/spi_host.hjson#txdata) window using 32-bit instructions, the SPI_HOST should be parameterized such that the `ByteOrder` parameter matches the byte order of the embedded CPU (i.e., for Ibex, `ByteOrder` should be left set to `1` to indicate a Little-Endian CPU).
-This will ensure that data is transmitted to the flash (and thus also stored in flash) in address-ascending order.
-For example, consider the transfer of four bytes, `D[3:0][7:0]`, to SPI via the [`TXDATA`](data/spi_host.hjson#txdata) window.
-- It is assumed for this example that all four bytes are contiguously stored in SRAM at a word-aligned address, with `D[0]` at the lowest byte-address.
-- When these bytes are loaded into the Ibex CPU they are arranged as the 32-bit word: `W[31:0] = {D[3][7:0], D[2][7:0], D[1][7:0], D[0][7:0]}`.
-- After this word is loaded into the [`TXDATA`](data/spi_host.hjson#txdata) window, the LSB (i.e., `W[7:0] = D[0][7:0]`) is transmitted first, by virtue of the `ByteOrder == 1` configuration.
-
-In this way, configuring `ByteOrder` to match the CPU ensures that data is transmitted in memory-address order.
-
-The value of the `ByteOrder` parameter can be confirmed by firmware by reading the [`STATUS.BYTEORDER`](data/spi_host.hjson#status) register field.
-
-Not all data to the SPI device will come from memory however.
-In many cases the transaction command codes or headers will be constructed or packed on the fly in CPU registers.
-The order in which these register bytes are transmitted on the bus will depend on the value of the `ByteOrder` parameter, as discussed in the Theory of Operation section, and for multi-byte values (such as flash addresses), some byte-swapping may be required to ensure that data is transmitted in the proper order expected by the target device.
-
-For example, SPI flash devices generally expect flash addresses (or any other multi-byte values) to be transmitted MSB-first.
-This is illustrated in the following figure, which depicts a Fast Quad Read I/O command.
-Assuming that `ByteOrder` is set to `1` for Little-Endian devices such as Ibex, byte-swapping will be required for these addresses, otherwise the device will receive the addresses LSB first.
-
-```wavejson
-{ signal: [
-  {name:"csb", wave:"10........................."},
-  {name:"sck", wave:"lnn........................"},
-  {name:"sd[0]", wave:"x1..0101.22222222z.22334455",
-   data:["a[23]", "a[19]", "a[15]", "a[11]", "a[7]", "a[3]", "1", "1"]},
-  {name:"sd[1]", wave:"xz.......22222222z.22334455",
-   data:["a[22]", "a[18]", "a[14]", "a[10]", "a[6]", "a[2]", "1", "1"]},
-  {name:"sd[2]", wave:"xz.......22222222zz22334455",
-   data:["a[21]", "a[17]", "a[13]", "a[9]", "a[5]", "a[1]", "1", "1"]},
-  {name:"sd[3]", wave:"xz.......22222222zz22334455",
-   data:["a[20]", "a[16]", "a[12]", "a[8]", "a[4]", "a[0]", "1", "1"]},
-  {node: ".A.......B.C.D.E.F.G.H.I.J.K"},
-  {node: ".........L.....M...N........O"}
-],
-  edge: ['A<->B Command 0xEB ("Fast Read Quad I/O")',  'B<->C MSB(addr)', 'D<->E LSB(addr)',
-         'G<->H addr[0]', 'H<->I addr[1]', 'I<->J addr[2]', 'J<->K addr[3]',
-         'L<->M Address', 'N<->O Data'],
-
- foot: {text: "Addresses are transmitted MSB first, and data is returned in order of increasing peripheral byte address."}}
-```
-
-Byte ordering on the bus can also be managed by writing [`TXDATA`](data/spi_host.hjson#txdata) as a sequence of discrete bytes using 8-bit transactions, since partially-filled data-words are always sent in the order they are received.
-
-A few examples related to using SPI flash devices on a Little-Endian platform:
-- A 4-byte address can be loaded into the TX FIFO as four individual bytes using 8-bit I/O instructions.
-- The above read command (with 4-byte address) can be loaded into the FIFO by first loading the command code into [`TXDATA`](data/spi_host.hjson#txdata) as a single byte, and the address can be loaded into [`TXDATA`](data/spi_host.hjson#txdata) using 32-bit instructions, provided the byte order is swapped before loading.
-- Flash transactions with 3-byte addressing require some care, as there are no 24-bit I/O instructions, though there are several options:
-    - After the 8-bit command code is sent, the address can either be sent in several I/O operations (e.g., the MSB is sent as an 8-bit command, and the remaining 16-bits can be sent after swapping)
-    - If bandwidth efficiency is a priority, the address, `A[23:0]`, and command code, `C[7:0]`, can all be packed together into a single 4-byte quantity `W[31:0] = {A[7:0], A[15:8], A[23:16], C[7:0]}`, which when loaded into [`TXDATA`](data/spi_host.hjson#txdata) will ensure that the command code is sent first, followed by the address in MSB-first order.
-
-#### Segmenting Considerations
-
-Data words are *not* shared across segments.
-If at the end of each TX (or bidirectional) segment there is a partially transmitted data word then any unsent bytes will be discarded as the SPI_HOST IP closes the segment.
-For the next TX segment, the transmitted data will start with the following *word* from the TX FIFO.
-
-#### Refilling the TX FIFO
-
-For extremely long transactions, the TX FIFO may not have enough capacity to hold all the data being transmitted.
-In this case software can either poll the [`STATUS.TXQD`](data/spi_host.hjson#status) register to determine the number of elements in the TX FIFO, or enable the SPI_HOST IP to send an interrupt when the FIFO drains to a certain level.
-If [`INTR_ENABLE.spi_event`](data/spi_host.hjson#intr_enable) and [`EVENT_ENABLE.TXWM`](data/spi_host.hjson#event_enable) are both asserted, the IP will send an interrupt whenever the number of elements in the TX FIFO falls below [`CONTROL.TX_WATERMARK`](data/spi_host.hjson#control).
-
-### Specifying the Segments
-
-Each write to the [`COMMAND`](data/spi_host.hjson#command) register corresponds to a single command segment.
-The length, CSAAT flag, direction and speed settings for that segment should all be packed into a single 32-bit register and written simultaneously to [`COMMAND`](data/spi_host.hjson#command).
-
-The [`COMMAND`](data/spi_host.hjson#command) should only be written when [`STATUS.READY`](data/spi_host.hjson#status) is asserted.
-
-While each command segment is being processed, the SPI_HOST has room to queue up exactly one additional segment descriptor in the Command Clock Domain Crossing.
-Once a second command segment descriptor has been submitted, software must wait for the state machine to finish processing the current segment before submitting more.
-Software can poll the [`STATUS.READY`](data/spi_host.hjson#status) field to determine when it is safe to insert another segment descriptor.
-Otherwise the [`EVENT_ENABLE.IDLE`](data/spi_host.hjson#event_enable) bit can be enabled (along with [`INTR_ENABLE.spi_event`](data/spi_host.hjson#intr_enable)) to trigger an event interrupt whenever [`STATUS.READY`](data/spi_host.hjson#status) is asserted.
-
-### Reading Back the Device Response
-
-Once an RX segment descriptor has been submitted to the SPI_HOST, the received data will be available in the RX FIFO after the first word has been received.
-
-The number of words in the FIFO can be polled by reading the [`STATUS.RXQD`](data/spi_host.hjson#status) field.
-The SPI_HOST IP can also be configured to generate watermark event interrupts whenever the number of words received reaches (or exceeds) [`CONTROL.RX_WATERMARK`](data/spi_host.hjson#control).
-To enable interrupts whenever the RX FIFO reaches the watermark, assert [`EVENT_ENABLE.RXWM`](data/spi_host.hjson#event_enable) along with [`INTR_ENABLE.spi_event`](data/spi_host.hjson#intr_enable).
-
-## Exception Handling
-
-The SPI_HOST will assert one of the [`ERROR_STATUS`](data/spi_host.hjson#error_status) bits in the event of a firmware programming error, and will become unresponsive until firmware acknowledges the error by clearing the corresponding error bit.
-
-The SPI_HOST interrupt handler should clear any bits in [`ERROR_STATUS`](data/spi_host.hjson#error_status) before clearing [`INTR_STATE.error`](data/spi_host.hjson#intr_state).
-
-In addition to clearing the [`ERROR_STATUS`](data/spi_host.hjson#error_status) register, firmware can also trigger a complete software reset via the [`CONTROL.SW_RST`](data/spi_host.hjson#control) bit, as described in the next section.
-
-Other system-level errors may arise due to improper programming of the target device (e.g., due to violations in the device programming model, or improper configuration of the SPI_HOST timing registers).
-Given that the SPI protocol provides no mechanism for the target device to stall the bus, the SPI_HOST will continue to function even if the remote device becomes unresponsive.
-In case of an unresponsive device, the RX FIFO will still accumulate data from the bus during RX segments, though the data values will be undefined.
-
-## Software Reset Procedure
-
-In the event of an error the SPI_HOST IP can be reset under software control using the following procedure:
-
-1. Set [`CONTROL.SW_RST`](data/spi_host.hjson#control).
-2. Poll IP status registers for confirmation of successful state machine reset:
-   - Wait for [`STATUS.ACTIVE`](data/spi_host.hjson#status) to clear.
-   - Wait for both FIFOs to completely drain by polling [`STATUS.TXQD`](data/spi_host.hjson#status) and [`STATUS.RXQD`](data/spi_host.hjson#status) until they reach zero.
-3. Clear [`CONTROL.SW_RST`](data/spi_host.hjson#control).
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_spi_host.h)
-
-## Register Table
-
-* [Register Table](data/spi_host.hjson#registers)
-
-# Appendices
-
-## Analysis of Transient Datapath Stalls
-
-Even if the RX (or TX) FIFOs have free-space (or data) available, stall events can still happen due to momentary backlogs or bubbles in the data pipeline.
-For instance, the Byte Merge and Byte Select blocks occasionally need some extra cycles to clean out the internal `prim_packer_fifo`.
-These delays are likely to cause transient stalls particularly, when constructing transactions with many short (less than 4-byte) segments.
-Transient stalls could lead to false diagnostics when trying to optimize SPI_HOST throughput.
-Thus it is useful to analyze the shift register's tolerance to bubble events, particularly for the highest bandwidth Quad SPI mode.
-
-### Transient Stalls in TX direction.
-
-The transient analysis stall analysis is simpler for the TX direction.
-There is no buffering on the shift register TX data inputs because it would complicate the `byte_flush` operation on the Byte Select block.
-
-In Quad mode,the shift register will demand one new byte as often as once every four clock cycles.
-(This rate is slowed down if for a non-trivial clock-divide ratio).
-Meanwhile, the Byte Select Block pauses once for every disabled byte, and once more at the end of each word.
-Thus if the Byte Select block is loaded with three-consecutive bytes-disables (either in the same word or across two separate words), this will create a pause of 4-clock cycles between two bytes causing a transient stall event.
-
-Assuming that each TX Word has at least one byte enabled, the longest possible transient delay between two Byte Select outputs is 7 clock cycles (with three byte-disables in two adjacent words, respectively aligned for maximal delay and assuming no delays in the TX FIFOs themselves).
-Dual- and Standard-mode segments can tolerate inter-byte delays of 7 or 15 clocks respectively, and thus transient stalls should not be a problem after Dual- or Standard-mode segments.
-
-### Transient Stalls in the RX direction
-
-Similar to the Byte Select, the Byte Merge block must pause for at least one cycle between each word.
-Also when at the end of a segment the Byte Merge packs less than four bytes into the last word, there is also an additional cycle of delay for each unused byte.
-Thus if the last word in a given segment has only one valid byte, the total delay will be four clock cycles.
-
-Such stalls however are a much smaller concern in the RX direction due to the buffering of the Shift Register outputs.
-As shown in the following waveform, even in Quad-mode, this buffer means the shift register can tolerate as many as six clock cycles of temporary back-pressure before creating a stall.
-
-```wavejson
-{signal: [
-  [ "Shift Register Ports",
-  {name: "clk_core_i",                  wave: "p..........................."},
-  {name: "wr_en_i",                     wave: "010..10..10..10..10..10..1.0"},
-  {name: "shift_en_i",                  wave: "0..10..10..10..10..10..10..."},
-  {name: "rd_en_i",                     wave: "0....10..10..10..10..10..1.0"},
-  {name: "rx_valid_o (to Byte Merge)",   wave: "0.....10..1....0..10..1....."},
-  {name: "rx_ready_i (from Byte Merge)", wave: "1......0.....1.....0......1.",
-                                        node: ".......A.....B.....C......D"},
-  {name: "rd_ready_o (to FSM)",         wave: "1.........0..1........0...1."}],
-  ["FSM Port", {name: "rx_stall_o",     wave: "0........................10."}],
-  {name: ""}
-],
-  edge: ["A<->B 6 clocks: No Stall", "C<->D 7 clocks will stall FSM"],
-  head: {text: "SPI_HOST Shift Register: Tolerance to Gaps in rx_ready_i", tick:1}
-}
-```
-
-Even though such long delays are tolerable, it takes some time for shift register to catch up completely and clear the backlog.
-For example, if after a 6-clock delay the shift-register encounters another 4-clock backlog this can also introduce a stall condition, as shown in the waveform below.
-
-```wavejson
-{signal: [
-  ["Shift Register Ports",
-  {name: "clk_core_i", wave: "p........................"},
-  {name: "wr_en_i",    wave: "010..10..10..10..1.0..10."},
-  {name: "shift_en_i", wave: "0..10..10..10..10...10..1"},
-  {name: "rd_en_i",    wave: "0....10..10..10..1.0..10."},
-  {name: "rx_valid_o", wave: "0.....10..1...........010"},
-  {name: "rx_ready_i (from Byte Merge)", wave: "1......0.....10...10.1...",
-                      node: ".......A.....BC...D"},
-  {name: "rd_ready_o (to FSM)", wave: "1.........0..10...10.1..."}],
-  ["FSM Port", {name: "rx_stall_o", wave: "0................10......"}],
-  {name: "", wave: ""},
-],
-  edge: ["A<->B 1st Gap: 6 clocks", "C<->D 2nd Gap: 4 clocks"],
-  head: {text: "SPI_HOST Shift Register: Back-to-back gaps in rx_ready_i", tick:1}
-}
-```
-
-Delays of 3-clocks or less do not create any internal backlog in the system.
-However, the Byte Merge block can create a 4-clock delay each time it processes a single-byte segment.
-In practice, this is unlikely to cause a problem, as no Quad-SPI Flash transactions require even two back-to-back RX segments.
-However with enough (at least six) consecutive one-byte segments, the accumulated delay can eventually create a stall event on the RX path as well, as seen below.
-
-```wavejson
-{signal: [
- [ "Shift Register Ports",
-  {name: "clk_core_i", wave: "p..........................."},
-  {name: "wr_en_i", wave: "010..10..10..10..10..10..1.0"},
-  {name: "shift_en_i", wave: "0..10..10..10..10..10..10..."},
-  {name: "rd_en_i", wave: "0....10..10..10..10..10..1.0"},
-  {name: "rx_valid_o", wave: "0.....10..1.0.1..01........."},
-  {name: "rx_ready_i (from Byte Merge)", wave: "1......0...10...10...10...10",
-                      node: ".......A...BC...D"},
-  {name: "rd_ready_o (to FSM)", wave: "1.........01..0.1.0..10...10"}],
-  [ "FSM Port",
-  {name: "rx_stall_o", wave: "0........................10."}],
-  {name: ""}
-],
-  edge: ["A<->B 4 clocks", "C<->D 4 clocks"],
-  head: {text: "SPI_HOST Shift Register: Hypothetical RX Congestion Scenario", tick:1},
- foot: {text: "Six back-to-back quad reads 1-byte each, same CSID, CSAAT enabled"}
-}
-```
diff --git a/hw/ip/spi_host/doc/programmers_guide.md b/hw/ip/spi_host/doc/programmers_guide.md
new file mode 100644
index 0000000000000..fbf613d45be17
--- /dev/null
+++ b/hw/ip/spi_host/doc/programmers_guide.md
@@ -0,0 +1,281 @@
+# Programmer's Guide
+
+The operation of the SPI_HOST IP proceeds in seven general steps.
+
+To initialize the IP:
+1. Program the [`CONFIGOPTS`](../data/spi_host.hjson#configopts) multi-register with the appropriate timing and polarity settings for each `csb` line.
+2. Set the desired interrupt parameters
+3. Enable the IP
+
+Then for each command:
+
+4. Load the data to be transmitted into the FIFO using the [`TXDATA`](../data/spi_host.hjson#txdata) memory window.
+5. Specify the target device by programming the [`CSID`](../data/spi_host.hjson#csid)
+6. Specify the structure of the command by writing each segment into the [`COMMAND`](../data/spi_host.hjson#command) register
+   - For multi-segment transactions, be sure to assert [`COMMAND.CSAAT`](../data/spi_host.hjson#command) for all but the last command segment
+7. For transactions which expect to receive a reply, the data can then be read back from the [`RXDATA`](../data/spi_host.hjson#rxdata) window.
+
+These latter four steps are then repeated for each command.
+Each step is covered in detail in the following sections.
+
+For concreteness, this Programmer's Guide uses examples from one of our primary target devices, the [W25Q01JV flash from Winbond](https://www.winbond.com/resource-files/W25Q01JV%20SPI%20RevB%2011132019.pdf).
+The SPI_HOST IP is however suitable for interacting with any number of SPI devices, and the same mode of operation can be used for any SPI device.
+
+## Initializing the IP
+
+### Per-target Configuration
+
+The [`CONFIGOPTS`](../data/spi_host.hjson#configopts) multi-register must be programmed to reflect the requirements of the attached target devices.
+As such these registers can be programmed once at initialization, or whenever a new device is connected (e.g., via changes in the external pin connections, or changes in the pinmux configuration).
+The proper settings for the [`CONFIGOPTS`](../data/spi_host.hjson#configopts) fields (e.g., CPOL and CPHA, clock divider, ratios, and other timing or sampling requirements) will all depend on the specific device attached as well as the board level delays.
+
+### Interrupt configuration
+
+The next step is to configure the interrupts for the SPI_HOST.
+This should also be done at initialization using the following register fields:
+- The [`ERROR_ENABLE`](../data/spi_host.hjson#error_enable) register should be configured to indicate which types of error conditions (if any) should be ignored, i.e., should not trigger an interrupt.
+At reset, these fields are all set indicating that all error classes trigger an interrupt.
+
+- For interrupt driven I/O the [`EVENT_ENABLE`](../data/spi_host.hjson#event_enable) register must be configured to select the desired event interrupts to signal the desired conditions (e.g. "FIFO empty", "FIFO at the watermark level", or "ready for next command segment").
+By default, this register is all zeros, meaning all event interrupts are disabled, and thus all transactions must be managed by polling the status register.
+   - When using the FIFO watermarks to send interrupts, the watermark levels must be set via the [`CONTROL.RX_WATERMARK`](../data/spi_host.hjson#control) and [`CONTROL.TX_WATERMARK`](../data/spi_host.hjson#control) fields.
+
+- The event and error interrupts must finally be enabled using the [`INTR_ENABLE`](../data/spi_host.hjson#intr_enable) register.
+
+### Enabling the SPI_HOST
+
+The IP must be enabled before sending the first command by asserting the [`CONTROL.SPIEN`](../data/spi_host.hjson#control) bit.
+
+## Issuing Transactions
+
+As mentioned above, each command is typically specified in three phases: loading the TX data, specifying the command segments/format, and reading the RX data.
+In principle, the first two steps can be performed in either order.
+If the SPI_HOST does not find any data to transmit it will simply stall until data is inserted.
+Meanwhile, the RX data is only available after the command format has been specified and processed by the state machine.
+
+For longer transactions, with data larger than the capacity of the FIFOs, the command sequence may become more complex.
+For instance, to send 1024 bytes of data in a single transaction, the TX data may need to be loaded several times if using a 256-byte FIFO.
+In this instance, the programming sequence will consist of at least four iterations of entering TX data and waiting for the TX FIFO to drain.
+
+### Loading TX data
+
+SPI transactions expect each command to start with some command sequence from the host, and so usually data will be transmitted at least in the first command segment.
+The [`TXDATA`](../data/spi_host.hjson#txdata) window provides a simple interface to the TX FIFO.
+Data can be written to the window using 8-, 16- or 32-bit instructions.
+
+Some attention, however, should be paid to byte-ordering and segmenting conventions.
+
+#### Byte-ordering Conventions
+
+For SPI flash applications, it is generally assumed that most of the *payload* data will be directly copied from embedded SRAM to the flash device.
+
+If this data is to be copied to the [`TXDATA`](../data/spi_host.hjson#txdata) window using 32-bit instructions, the SPI_HOST should be parameterized such that the `ByteOrder` parameter matches the byte order of the embedded CPU (i.e., for Ibex, `ByteOrder` should be left set to `1` to indicate a Little-Endian CPU).
+This will ensure that data is transmitted to the flash (and thus also stored in flash) in address-ascending order.
+For example, consider the transfer of four bytes, `D[3:0][7:0]`, to SPI via the [`TXDATA`](../data/spi_host.hjson#txdata) window.
+- It is assumed for this example that all four bytes are contiguously stored in SRAM at a word-aligned address, with `D[0]` at the lowest byte-address.
+- When these bytes are loaded into the Ibex CPU they are arranged as the 32-bit word: `W[31:0] = {D[3][7:0], D[2][7:0], D[1][7:0], D[0][7:0]}`.
+- After this word is loaded into the [`TXDATA`](../data/spi_host.hjson#txdata) window, the LSB (i.e., `W[7:0] = D[0][7:0]`) is transmitted first, by virtue of the `ByteOrder == 1` configuration.
+
+In this way, configuring `ByteOrder` to match the CPU ensures that data is transmitted in memory-address order.
+
+The value of the `ByteOrder` parameter can be confirmed by firmware by reading the [`STATUS.BYTEORDER`](../data/spi_host.hjson#status) register field.
+
+Not all data to the SPI device will come from memory however.
+In many cases the transaction command codes or headers will be constructed or packed on the fly in CPU registers.
+The order in which these register bytes are transmitted on the bus will depend on the value of the `ByteOrder` parameter, as discussed in the Theory of Operation section, and for multi-bit values (such as flash addresses), some byte-swapping may be required to ensure that data is transmitted in the proper order expected by the target device.
+
+For example, SPI flash devices generally expect flash addresses (or any other multi-byte values) to be transmitted MSB-first.
+This is illustrated in the following figure, which depicts a Fast Quad Read I/O command.
+Assuming that `ByteOrder` is set to `1` for Little-Endian devices such as Ibex, byte-swapping will be required for these addresses, otherwise the device will receive the addresses LSB first.
+
+```wavejson
+{ signal: [
+  {name:"csb", wave:"10........................."},
+  {name:"sck", wave:"lnn........................"},
+  {name:"sd[0]", wave:"x1..0101.22222222z.22334455",
+   data:["a[23]", "a[19]", "a[15]", "a[11]", "a[7]", "a[3]", "1", "1"]},
+  {name:"sd[1]", wave:"xz.......22222222z.22334455",
+   data:["a[22]", "a[18]", "a[14]", "a[10]", "a[6]", "a[2]", "1", "1"]},
+  {name:"sd[2]", wave:"xz.......22222222zz22334455",
+   data:["a[21]", "a[17]", "a[13]", "a[9]", "a[5]", "a[1]", "1", "1"]},
+  {name:"sd[3]", wave:"xz.......22222222zz22334455",
+   data:["a[20]", "a[16]", "a[12]", "a[8]", "a[4]", "a[0]", "1", "1"]},
+  {node: ".A.......B.C.D.E.F.G.H.I.J.K"},
+  {node: ".........L.....M...N........O"}
+],
+  edge: ['A<->B Command 0xEB ("Fast Read Quad I/O")',  'B<->C MSB(addr)', 'D<->E LSB(addr)',
+         'G<->H addr[0]', 'H<->I addr[1]', 'I<->J addr[2]', 'J<->K addr[3]',
+         'L<->M Address', 'N<->O Data'],
+
+ foot: {text: "Addresses are transmitted MSB first, and data is returned in order of increasing peripheral byte address."}}
+```
+
+Byte ordering on the bus can also be managed by writing [`TXDATA`](../data/spi_host.hjson#txdata) as a sequence of discrete bytes using 8-bit transactions, since partially-filled data-words are always sent in the order they are received.
+
+A few examples related to using SPI flash devices on a Little-Endian platform:
+- A 4-byte address can be loaded into the TX FIFO as four individual bytes using 8-bit I/O instructions.
+- The above read command (with 4-byte address) can be loaded into the FIFO by first loading the command code into [`TXDATA`](../data/spi_host.hjson#txdata) as a single byte, and the address can be loaded into [`TXDATA`](../data/spi_host.hjson#txdata) using 32-bit instructions, provided the byte order is swapped before loading.
+- Flash transactions with 3-byte addressing require some care, as there are no 24-bit I/O instructions, though there are several options:
+    - After the 8-bit command code is sent, the address can either be sent in several I/O operations (e.g., the MSB is sent as an 8-bit command, and the remaining 16-bits can be sent after swapping)
+    - If bandwidth efficiency is a priority, the address, `A[23:0]`, and command code, `C[7:0]`, can all be packed together into a single 4-byte quantity `W[31:0] = {A[7:0], A[15:8], A[23:16], C[7:0]}`, which when loaded into [`TXDATA`](../data/spi_host.hjson#txdata) will ensure that the command code is sent first, followed by the address in MSB-first order.
+
+#### Segmenting Considerations
+
+Data words are *not* shared across segments.
+If at the end of each TX (or bidirectional) segment there is a partially transmitted data word then any unsent bytes will be discarded as the SPI_HOST IP closes the segment.
+For the next TX segment, the transmitted data will start with the following *word* from the TX FIFO.
+
+#### Refilling the TX FIFO
+
+For extremely long transactions, the TX FIFO may not have enough capacity to hold all the data being transmitted.
+In this case software can either poll the [`STATUS.TXQD`](../data/spi_host.hjson#status) register to determine the number of elements in the TX FIFO, or enable the SPI_HOST IP to send an interrupt when the FIFO drains to a certain level.
+If [`INTR_ENABLE.spi_event`](../data/spi_host.hjson#intr_enable) and [`EVENT_ENABLE.TXWM`](../data/spi_host.hjson#event_enable) are both asserted, the IP will send an interrupt whenever the number of elements in the TX FIFO falls below [`CONTROL.TX_WATERMARK`](../data/spi_host.hjson#control).
+
+### Specifying the Segments
+
+Each write to the [`COMMAND`](../data/spi_host.hjson#command) register corresponds to a single command segment.
+The length, CSAAT flag, direction and speed settings for that segment should all be packed into a single 32-bit register and written simultaneously to [`COMMAND`](../data/spi_host.hjson#command).
+
+The [`COMMAND`](../data/spi_host.hjson#command) should only be written when [`STATUS.READY`](../data/spi_host.hjson#status) is asserted.
+
+While each command segment is being processed, the SPI_HOST has room to queue up exactly one additional segment descriptor in the Command Clock Domain Crossing.
+Once a second command segment descriptor has been submitted, software must wait for the state machine to finish processing the current segment before submitting more.
+Software can poll the [`STATUS.READY`](../data/spi_host.hjson#status) field to determine when it is safe to insert another segment descriptor.
+Otherwise the [`EVENT_ENABLE.IDLE`](../data/spi_host.hjson#event_enable) bit can be enabled (along with [`INTR_ENABLE.spi_event`](../data/spi_host.hjson#intr_enable)) to trigger an event interrupt whenever [`STATUS.READY`](../data/spi_host.hjson#status) is asserted.
+
+### Reading Back the Device Response
+
+Once an RX segment descriptor has been submitted to the SPI_HOST, the received data will be available in the RX FIFO after the first word has been received.
+
+The number of words in the FIFO can be polled by reading the [`STATUS.RXQD`](../data/spi_host.hjson#status) field.
+The SPI_HOST IP can also be configured to generate watermark event interrupts whenever the number of words received reaches (or exceeds) [`CONTROL.RX_WATERMARK`](../data/spi_host.hjson#control).
+To enable interrupts whenever the RX FIFO reaches the watermark, assert [`EVENT_ENABLE.RXWM`](../data/spi_host.hjson#event_enable) along with [`INTR_ENABLE.spi_event`](../data/spi_host.hjson#intr_enable).
+
+## Exception Handling
+
+The SPI_HOST will assert one of the [`ERROR_STATUS`](../data/spi_host.hjson#error_status) bits in the event of a firmware programming error, and will become unresponsive until firmware acknowledges the error by clearing the corresponding error bit.
+
+The SPI_HOST interrupt handler should clear any bits set in [`ERROR_STATUS`](../data/spi_host.hjson#error_status) before clearing [`INTR_STATE.error`](../data/spi_host.hjson#intr_state).
+
+In addition to clearing the [`ERROR_STATUS`](../data/spi_host.hjson#error_status) register, firmware can also trigger a complete software reset via the [`CONTROL.SW_RST`](../data/spi_host.hjson#control) bit, as described in the next section.
+
+Other system-level errors may arise due to improper programming of the target device (e.g., due to violations in the device programming model, or improper configuration of the SPI_HOST timing registers).
+Given that the SPI protocol provides no mechanism for the target device to stall the bus, the SPI_HOST will continue to function even if the remote device becomes unresponsive.
+In case of an unresponsive device, the RX FIFO will still accumulate data from the bus during RX segments, though the data values will be undefined.
+
+## Software Reset Procedure
+
+In the event of an error the SPI_HOST IP can be reset under software control using the following procedure:
+
+1. Set [`CONTROL.SW_RST`](../data/spi_host.hjson#control).
+2. Poll IP status registers for confirmation of successful state machine reset:
+   - Wait for [`STATUS.ACTIVE`](../data/spi_host.hjson#status) to clear.
+   - Wait for both FIFOs to completely drain by polling [`STATUS.TXQD`](../data/spi_host.hjson#status) and [`STATUS.RXQD`](../data/spi_host.hjson#status) until they reach zero.
+3. Clear [`CONTROL.SW_RST`](../data/spi_host.hjson#control).
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_spi_host.h)
+
+## Register Table
+
+* [Register Table](../data/spi_host.hjson#registers)
+
+# Appendices
+
+## Analysis of Transient Datapath Stalls
+
+Even if the RX (or TX) FIFOs have free-space (or data) available, stall events can still happen due to momentary backlogs or bubbles in the data pipeline.
+For instance, the Byte Merge and Byte Select blocks occasionally need some extra cycles to clean out the internal `prim_packer_fifo`.
+These delays are likely to cause transient stalls, particularly when constructing transactions with many short (less than 4-byte) segments.
+Transient stalls could lead to false diagnostics when trying to optimize SPI_HOST throughput.
+Thus it is useful to analyze the shift register's tolerance to bubble events, particularly for the highest bandwidth Quad SPI mode.
+
+### Transient Stalls in TX direction.
+
+The transient stall analysis is simpler for the TX direction.
+There is no buffering on the shift register TX data inputs because it would complicate the `byte_flush` operation on the Byte Select block.
+
+In Quad mode, the shift register will demand one new byte as often as once every four clock cycles.
+(This rate is slowed down for a non-trivial clock-divide ratio.)
+Meanwhile, the Byte Select Block pauses once for every disabled byte, and once more at the end of each word.
+Thus if the Byte Select block is loaded with three consecutive byte-disables (either in the same word or across two separate words), this will create a pause of 4 clock cycles between two bytes, causing a transient stall event.
+
+Assuming that each TX Word has at least one byte enabled, the longest possible transient delay between two Byte Select outputs is 7 clock cycles (with three byte-disables in two adjacent words, respectively aligned for maximal delay and assuming no delays in the TX FIFOs themselves).
+Dual- and Standard-mode segments can tolerate inter-byte delays of 7 or 15 clocks respectively, and thus transient stalls should not be a problem after Dual- or Standard-mode segments.
+
+### Transient Stalls in the RX direction
+
+Similar to the Byte Select, the Byte Merge block must pause for at least one cycle between each word.
+Also when at the end of a segment the Byte Merge packs less than four bytes into the last word, there is also an additional cycle of delay for each unused byte.
+Thus if the last word in a given segment has only one valid byte, the total delay will be four clock cycles.
+
+Such stalls however are a much smaller concern in the RX direction due to the buffering of the Shift Register outputs.
+As shown in the following waveform, even in Quad-mode, this buffer means the shift register can tolerate as many as six clock cycles of temporary back-pressure before creating a stall.
+
+```wavejson
+{signal: [
+  [ "Shift Register Ports",
+  {name: "clk_core_i",                  wave: "p..........................."},
+  {name: "wr_en_i",                     wave: "010..10..10..10..10..10..1.0"},
+  {name: "shift_en_i",                  wave: "0..10..10..10..10..10..10..."},
+  {name: "rd_en_i",                     wave: "0....10..10..10..10..10..1.0"},
+  {name: "rx_valid_o (to Byte Merge)",   wave: "0.....10..1....0..10..1....."},
+  {name: "rx_ready_i (from Byte Merge)", wave: "1......0.....1.....0......1.",
+                                        node: ".......A.....B.....C......D"},
+  {name: "rd_ready_o (to FSM)",         wave: "1.........0..1........0...1."}],
+  ["FSM Port", {name: "rx_stall_o",     wave: "0........................10."}],
+  {name: ""}
+],
+  edge: ["A<->B 6 clocks: No Stall", "C<->D 7 clocks will stall FSM"],
+  head: {text: "SPI_HOST Shift Register: Tolerance to Gaps in rx_ready_i", tick:1}
+}
+```
+
+Even though such long delays are tolerable, it takes some time for the shift register to catch up completely and clear the backlog.
+For example, if after a 6-clock delay the shift-register encounters another 4-clock backlog this can also introduce a stall condition, as shown in the waveform below.
+
+```wavejson
+{signal: [
+  ["Shift Register Ports",
+  {name: "clk_core_i", wave: "p........................"},
+  {name: "wr_en_i",    wave: "010..10..10..10..1.0..10."},
+  {name: "shift_en_i", wave: "0..10..10..10..10...10..1"},
+  {name: "rd_en_i",    wave: "0....10..10..10..1.0..10."},
+  {name: "rx_valid_o", wave: "0.....10..1...........010"},
+  {name: "rx_ready_i (from Byte Merge)", wave: "1......0.....10...10.1...",
+                      node: ".......A.....BC...D"},
+  {name: "rd_ready_o (to FSM)", wave: "1.........0..10...10.1..."}],
+  ["FSM Port", {name: "rx_stall_o", wave: "0................10......"}],
+  {name: "", wave: ""},
+],
+  edge: ["A<->B 1st Gap: 6 clocks", "C<->D 2nd Gap: 4 clocks"],
+  head: {text: "SPI_HOST Shift Register: Back-to-back gaps in rx_ready_i", tick:1}
+}
+```
+
+Delays of 3-clocks or less do not create any internal backlog in the system.
+However, the Byte Merge block can create a 4-clock delay each time it processes a single-byte segment.
+In practice, this is unlikely to cause a problem, as no Quad-SPI Flash transactions require even two back-to-back RX segments.
+However with enough (at least six) consecutive one-byte segments, the accumulated delay can eventually create a stall event on the RX path as well, as seen below.
+
+```wavejson
+{signal: [
+ [ "Shift Register Ports",
+  {name: "clk_core_i", wave: "p..........................."},
+  {name: "wr_en_i", wave: "010..10..10..10..10..10..1.0"},
+  {name: "shift_en_i", wave: "0..10..10..10..10..10..10..."},
+  {name: "rd_en_i", wave: "0....10..10..10..10..10..1.0"},
+  {name: "rx_valid_o", wave: "0.....10..1.0.1..01........."},
+  {name: "rx_ready_i (from Byte Merge)", wave: "1......0...10...10...10...10",
+                      node: ".......A...BC...D"},
+  {name: "rd_ready_o (to FSM)", wave: "1.........01..0.1.0..10...10"}],
+  [ "FSM Port",
+  {name: "rx_stall_o", wave: "0........................10."}],
+  {name: ""}
+],
+  edge: ["A<->B 4 clocks", "C<->D 4 clocks"],
+  head: {text: "SPI_HOST Shift Register: Hypothetical RX Congestion Scenario", tick:1},
+ foot: {text: "Six back-to-back quad reads 1-byte each, same CSID, CSAAT enabled"}
+}
+```
diff --git a/hw/ip/spi_host/doc/theory_of_operation.md b/hw/ip/spi_host/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..bc2e855b35796
--- /dev/null
+++ b/hw/ip/spi_host/doc/theory_of_operation.md
@@ -0,0 +1,1089 @@
+# Theory of Operation
+
+## SPI_HOST IP Command Interface
+
+A SPI command consists of at least one segment. Each segment has a different speed (number of active SD lines), direction and length.
+For example a Quad SPI read transaction consists of 4 segments:
+1. A single byte instruction transmitted at *standard* data rate
+2. A three or four byte address transmitted at *Quad* data rate
+3. A number of dummy cycles (no data transmitted or received)
+4. The desired data, received by SPI_HOST at *Quad* data rate
+
+During a transaction, software can issue multiple segment descriptions to the SPI_HOST IP to control for changes in speed or direction.
+
+Issuing a command then consists of the following steps:
+1. Configure the IP to be compatible with each attached peripheral.
+The [`CONFIGOPTS`](../data/spi_host.hjson#configopts) multi-register holds separate sets of configuration settings, one for each CSB line.
+In principle, the configuration of these device-specific options only needs to be performed once at initialization.
+
+2. Load the TX FIFO with the instructions and data to be transmitted to the remote device by writing to the [`TXDATA`](../data/spi_host.hjson#txdata) memory window.
+3. Specify which device should receive the next command using the [`CSID`](../data/spi_host.hjson#csid) register.
+4. Wait for [`STATUS.READY`](../data/spi_host.hjson#status) before continuing.
+5. Issue speed, direction, and length details for the next command segment using the [`COMMAND`](../data/spi_host.hjson#command) register.
+If a command consists of multiple segments, then set [`COMMAND.CSAAT`](../data/spi_host.hjson#command) (Chip-select active after transaction) to one for all segments except the last one.
+Setting [`COMMAND.CSAAT`](../data/spi_host.hjson#command) to zero indicates the end of a transaction, prompting the IP to raise CSB at the end of the segment.
+
+6. Repeat steps 4 and 5 until all segments have been described.
+7. Read any peripheral response data from the RX FIFO by reading from the [`RXDATA`](../data/spi_host.hjson#rxdata) memory window.
+
+### About Command Segments
+
+The structure of a SPI command depends on the device and the command itself.
+
+To support a variety of different I/O sequences the SPI_HOST FSM treats each command as a sequence of segments, each with a defined length, direction and speed.
+
+In case of a standard SPI device the commands are very consistent in structure: the host transmits data on SD[0], and always receives data on SD[1].
+For such devices, all commands can in principle be treated as bidirectional, as both the host and device are always transmitting on their respective lines.
+For bidirectional commands, the SPI_HOST IP will store one byte in the RX FIFO for each byte transmitted from the TX FIFO.
+
+However, even for these standard SPI commands, software may be uninterested in some or all of the device's response data.
+For example, for SPI flash devices, standard-mode write commands contain no useful data in the device response, even though the device may be actively asserting signals to SD[1] throughout the transaction.
+Therefore, for such commands software may choose to specify the entire command as "TX Only", in which case data placed in the TX FIFO will be transmitted throughout the write command, but signals received from the device will be ignored and will not fill the RX FIFO.
+
+Meanwhile for other flash commands, such as standard-mode read, the device only transmits useful information during some portions of the transaction.
+In the case of a basic read (with a 3-byte address), the instruction starts with a 1-byte instruction code (0x3) followed by the three address bytes, during which time the flash device outputs may be high impedance (depending on the device).
+The device then immediately responds with the requested data in the next SCK cycle, and continues to output data bytes until the CSB line is deasserted.
+Though such a command could also be treated as entirely bidirectional, the device response can be safely ignored during the instruction and address phase, especially if the SD[1] line is high impedance during this time.
+Likewise it is not necessary for software to specify any data to transmit while the device is responding.
+Therefore such a command can be thought of as consisting of two separate segments, the first segment being TX Only and the second segment being RX only, as shown in the following figure.
+Breaking the command up this way potentially simplifies the job of writing software for this type of command.
+
+```wavejson
+{signal: [
+  {name: "clk_i",          wave: "p....................|.............."},
+  {name: "SCK (CPOL=0)",   wave: "0.1010101010101010101|01010101010101"},
+  {name: "CSB",            wave: "10...................|.............."},
+  {name: "SD[0]",          wave: "00.0.0.0.0.0.1.1.2.2.|2.2.x.........", data: ["a[23]", "a[22]", "a[1]", "a[0]"]},
+  {name: "SD[1]",          wave: "z....................|....2.2.2.2.2.", data: ["d[7]", "d[6]", "d[5]", "d[4]", "..."]},
+  {name: "Segment number", wave: "x2...................|....2.........", data: ['1', '2', '3','4'] },
+  {name: "Segment speed",  wave: "x2...................|....2.........", data: ['Standard', 'Standard'] },
+  {name: "Segment direction", wave: "x2...................|....2.........", data: ['TX', 'RX', 'None', 'RX'] },
+  ],
+ foot: {text: "Standard SPI example: Flash Read command with 24-bit address, consisting of one TX and one RX segment"}
+}
+```
+
+In addition to the TX, RX or Bidirectional modes, many SPI commands require periods where neither the host nor the device is transmitting data.
+For instance, many flash devices define a Fast Read command in which the host must insert a number of "dummy clocks" between the last address byte and the first data byte from the device.
+These extra cycles are required for operation at higher clock frequencies, to give the address time to propagate through the flash core.
+A standard-mode Fast Read (with 3 byte addressing) command then requires *three* SPI_HOST command segments:
+- 4 bytes TX Only: one for the instruction code (i.e., 0xb for Fast Read), and three for the address.
+- 8 dummy clocks
+- N bytes RX Only for read data response
+
+```wavejson
+{signal: [
+  {name: "clk_i",          wave: "p....................|.............................."},
+  {name: "SCK (CPOL=0)",   wave: "0.1010101010101010101|010101010101010101010101010101"},
+  {name: "CSB",            wave: "10...................|.............................."},
+  {name: "SD[0]",          wave: "00.0.0.0.1.0.1.1.2.2.|2.2.x.........................", data: ["a[23]", "a[22]", "a[1]", "a[0]"]},
+  {name: "SD[1]",          wave: "z....................|....z.z.z.z.z.z.z.z.2.2.2.2.2.", data: ["d[7]", "d[6]", "d[5]", "d[4]", "..."]},
+  {name: "Segment number", wave: "x3...................|....4...............5.........", data: ['1', '2', '3'] },
+  {name: "Segment speed",  wave: "x3...................|....4...............5.........", data: ['Standard', 'X', 'Standard'] },
+  {name: "Segment direction", wave: "x3...................|....4...............5.........", data: ['TX', 'Dummy', 'RX'] },
+  ],
+ foot: {text: "Standard SPI example: Fast read command (instruction code 0xb) with 24-bit address, consisting of three segments, one TX, 8 dummy clocks and one RX segment"}
+}
+```
+
+For standard-mode commands, segments simplify the IO process by identifying which bus cycles have useful RX or TX data.
+In such cases it is not strictly necessary to manage the impedance of the SD[0] and SD[1] lines.
+For Dual- and Quad-mode commands, however, impedance control is necessary.
+The impedance of all data lines (SD[3:0]) must switch between TX and RX segments.
+
+Bidirectional data transfers are not applicable for Dual- or Quad-mode segments.
+
+In addition, the speed-mode changes how data is distributed across the four data lines, and many commands require that some segments are transmitted in standard mode (only on SD[0]), while the bulk of the data is transmitted in Dual- or Quad-mode.
+For this reason the speed-mode is also adjustable on a segment-by-segment basis.
+
+#### Specifying Command Segments
+
+The SPI host supports all four possible modes for command segments, and they are controlled by writing one of the following values to the 2-bit [`COMMAND.DIRECTION`](../data/spi_host.hjson#command) register:
+- 2'b00: Dummy cycles only (neither side transmits)
+- 2'b01: RX Only
+- 2'b10: TX Only
+- 2'b11: Bidirectional
+
+### CSID Register
+
+The [`CSID`](../data/spi_host.hjson#csid) register is used to identify the target device for the next command segment.
+Whenever a command segment descriptor is written to [`COMMAND`](../data/spi_host.hjson#command), [`CSID`](../data/spi_host.hjson#csid) is passed into the FSM along with the command segment descriptor and the corresponding configuration options (taken from the CSID'th element of the `CONFIGOPTS` multi-register).
+
+This register still exists when instantiated with only one CSB line (i.e. when NumCS=1).
+However in this case the [`CSID`](../data/spi_host.hjson#csid) value is ignored.
+
+Changes in [`CSID`](../data/spi_host.hjson#csid) also affect the CSB lines, because a change in CSID can also implicitly end a command, overriding [`COMMAND.CSAAT`](../data/spi_host.hjson#command).
+If a change is detected in [`CSID`](../data/spi_host.hjson#csid), but the previous segment was submitted with the `CSAAT` bit asserted, the FSM terminates the previous command before moving on to the next segment.
+The previous CSB line is held low for *at least* `CSNTRAIL` cycles (as defined by the previous value of [`CONFIGOPTS.CSNTRAIL`](../data/spi_host.hjson#configopts)) and then brought high.
+All CSB lines are held high for `CSNIDLE` cycles (using the new value of [`CONFIGOPTS.CSNIDLE`](../data/spi_host.hjson#configopts)).
+The new CSB line is asserted low, and SCK begins toggling after the usual `CSNLEAD` cycle delay.
+
+### Configuration Options
+
+The [`CONFIGOPTS`](../data/spi_host.hjson#configopts) multi-register has one entry per CSB line and holds clock configuration and timing settings which are specific to each peripheral.
+Once the [`CONFIGOPTS`](../data/spi_host.hjson#configopts) multi-register has been programmed for each SPI peripheral device, the values can be left unchanged.
+
+The following sections give details on how the SPI_HOST can be used to control a specific peripheral.
+For simplicity, this section describes how to interact with one device, attached to CSB[0], and as such references are made to the multi-registers [`CONFIGOPTS`](../data/spi_host.hjson#configopts) and [`COMMAND`](../data/spi_host.hjson#command).
+To configure timing and send commands to devices on other CSB lines, instead use the `CONFIGOPTS` multi-register corresponding to the desired CSB line.
+
+The most common differences between target devices are the requirements for a specific SPI clock phase or polarity, CPOL and CPHA, which were described in the previous section [SPI Protocol Basics](#spi-protocol-basics).
+These clock parameters can be set via the [`CONFIGOPTS.CPOL`](../data/spi_host.hjson#configopts) or [`CONFIGOPTS.CPHA`](../data/spi_host.hjson#configopts) register fields.
+Likewise, as also described in the previous section, if device setup times require a full clock cycle before sampling the output, Full-Cycle Mode can be enabled by asserting the [`CONFIGOPTS.FULLCYC`](../data/spi_host.hjson#configopts) bit.
+
+#### Clock rate selection
+
+The SPI clock rate for each peripheral is set by two factors:
+- The SPI_HOST input clock
+- A 16-bit clock divider
+
+The SPI protocol usually requires activity (either sampling or asserting data) on either edge of the SCK clock.
+For this reason the maximum SCK frequency is at most one half the SPI_HOST core frequency.
+
+Since some peripheral devices attached to the same SPI_HOST may require different clock frequencies, there is also the option to divide the core clock by an additional factor when dealing with slower peripherals.
+
+$$f_{\textrm{SCK},0}=\frac{1}{2}\frac{f_\textrm{clk}}{\textrm{CONFIGOPTS.CLKDIV}+1}$$
+
+#### Chip-select Timing Control
+
+Typically the CSB line is automatically deasserted after the last edge of SCK.
+However, by asserting [`COMMAND.CSAAT`](../data/spi_host.hjson#command) when issuing a particular command, one can instruct the core to hold CSB low indefinitely after the last clock edge.
+This is useful for merging two adjacent command segments together, to create more complex commands, such as flash Quad read commands which require a mix of segments with different speeds and directions.
+The CSB line can then be deasserted by either issuing another command without the [`COMMAND.CSAAT`](../data/spi_host.hjson#command) field, issuing a command to a different device (after changing the [`CSID`](../data/spi_host.hjson#csid) register), or simply resetting the core FSM via the [`CONTROL.RST`](../data/spi_host.hjson#control) register.
+
+To avoid spurious clock signals, changes to the [`CONFIGOPTS`](../data/spi_host.hjson#configopts) parameters take effect only at the end of a command segment and only when all `csb` lines are deasserted.
+There are two cases to consider:
+1. Configuration changes detected and CSAAT=0 for the previous segment:
+This is when configuration changes are typically expected, and in this case, the SPI_HOST waits for the previous segment to complete before changing the configuration.
+The SPI_HOST ensures that all `csb` lines are held idle long enough to satisfy the configuration requirements both *before* and *after* the change.
+2. CSAAT = 1 for the previous segment:
+Configuration changes are not typically expected after CSAAT segments, and require special treatment as the IP does not usually return the `csb` lines to the idle/inactive state at this time.
+In such cases, the SPI_HOST IP closes out the ongoing transaction, ignoring CSAAT, and the configuration is then applied once the SPI_HOST has returned to the idle state.
+The next segment can then proceed, even though the remote device will likely see the next segment as the start of a new transaction (as opposed to a continuation of the previous transaction), because of the brief intervening idle pulse.
+
+Most devices require at least one-half SCK clock-cycle between either edge of CSB and the nearest SCK edge.
+However, some devices may require more timing margin and so the SPI_HOST core offers some configuration registers for controlling the timing of the CSB edges when operating under automatic control.
+The relevant parameters are as follows:
+- T<sub>IDLE</sub>: The minimum time between each rising edge of CSB and the following falling edge.
+This time delay is a half SCK cycle by default but can be extended to as long as eight SCK cycles by setting the [`CONFIGOPTS.CSNIDLE`](../data/spi_host.hjson#configopts) register.
+- T<sub>LEAD</sub>: The minimum time between each falling edge of CSB and the first leading edge of SCK.
+This time delay is a half SCK cycle by default but can be extended to as long as eight SCK cycles by setting the [`CONFIGOPTS.CSNLEAD`](../data/spi_host.hjson#configopts) register.
+- T<sub>TRAIL</sub>: The minimum time between the last trailing edge of SCK and the following rising edge of CSB.
+This time delay is a half SCK cycle by default but can be extended to as long as eight SCK cycles by setting the [`CONFIGOPTS.CSNTRAIL`](../data/spi_host.hjson#configopts) register.
+
+```wavejson
+{signal: [
+  {name: "SCK",  wave: "l....1010|10........"},
+  {name: "CSB", wave: "10.......|.....1...0", node: ".A...B.....C...D...E"}
+],
+ edge: ["A<->B minimum (CSNLEAD+1)", "C<->D minimum (CSNTRAIL+1)", "D<->E minimum (CSNIDLE+1)"],
+  head: {
+    text: "Impact of CSNLEAD, CSNTRAIL and CSNIDLE CONFIGOPTS register settings",
+    tick: 1
+  },
+  foot: {
+    text: ["tspan", "All ticks are in units of &#xbd;T",
+           ["tspan", {'baseline-shift':'sub'}, "SCK"],
+          "=&#xbd;T",
+           ["tspan", {'baseline-shift':'sub'}, "clk"],
+          "&#xd7;(CLKDIV+1)"]
+  }
+}
+```
+
+These settings are all minimum bounds, and delays in the FSM implementation may create more margin in each of these timing constraints.
+
+### Idle Time Delays When Changing Configurations
+
+It is important that the configuration changes are applied while `csb` is high to avoid sending spurious `sck` events to any devices.
+For example, if two devices have different requirements for `CPOL`, the clock polarity should not toggle except when `csb` is high (inactive) for all devices.
+
+Furthermore, `csb` should remain high for the minimum idle time both before and after the configuration update.
+For example, consider a SPI_HOST attached to two devices each with different requirements for the clock divider, clock polarity, and idle time.
+Consider a configuration where total idle time (as determined by the [`CONFIGOPTS.CLKDIV`](../data/spi_host.hjson#configopts) and [`CONFIGOPTS.CSNIDLE`](../data/spi_host.hjson#configopts) multi-registers) works out to 9 idle clocks for the first device, and 4 clocks for the second device.
+In this scenario then, when swapping from the first device to the second, the SPI_HOST IP will only swap the clock polarity once the first `csb` line, `csb[0]`, has been high for at least 9 clocks, and will continue to hold the second `csb` line, `csb[1]`, high for 4 additional clocks before starting the next transaction.
+
+```wavejson
+{signal: [
+  {name: 'clk', wave: 'p..............'},
+  ["Requested Config",
+   {name: 'Configuration ID',  wave: '3.4............', data: ["CSID=0", "CSID=1"]},
+   {name: 'CPOL',              wave: '2.2............', data: ["0", "1"]},
+   {name: 'CLKDIV',            wave: '2.2............', data: ["2", "1"]},
+   {name: 'CSNIDLE',           wave: '2.2............', data: ["2", "1"]},
+   {name: 'Min. Idle cycles', wave: '2.2............', data: ["9", "4"]},
+  ],
+  ["Active Config",
+   {name: 'Configuration ID',  wave: '3.........4....', data: ["CSID=0", "CSID=1"]},
+   {name: 'CPOL',              wave: '2.........2....', data: ["0", "1"]},
+   {name: 'CLKDIV',            wave: '2.........2....', data: ["2", "1"]},
+   {name: 'CSNIDLE',           wave: '2.........2....', data: ["2", "1"]},
+   {name: 'Min. Idle cycles', wave: '2.........2....', data: ["9", "4"]},
+  ],
+   {name: 'csb[0]',                     wave: '01.............',
+                                        node: '.A........B....'},
+   {name: 'csb[1]',                     wave: '1.............0',
+                                        node: '..........C...D'},
+   {name: 'configuration update event', wave: '1.........H....'}
+],
+  edge: ["A<->B min. 9 cycles", "C<->D min. 4 cycles"],
+  head: {text: "Extended Idle Time During Configuration Changes", tock: 1}
+}
+```
+
+This additional idle time applies not only when switching between devices but when making any changes to the configuration for the most recently used device.
+For instance, even in a SPI_HOST configured for one device, changes to [`CONFIGOPTS`](../data/spi_host.hjson#configopts) will trigger this extended idle time behavior to ensure that the change in configuration only occurs in the middle of a long idle period.
+
+
+### Special Command Fields
+
+The [`COMMAND`](../data/spi_host.hjson#command) register must be written once for each command segment.
+Whenever a command segment is written to [`COMMAND`](../data/spi_host.hjson#command), the contents of the [`CONFIGOPTS`](../data/spi_host.hjson#configopts), [`CSID`](../data/spi_host.hjson#csid), and [`COMMAND`](../data/spi_host.hjson#command) registers are passed through the Config/Command FIFO to the SPI_HOST core FSM.
+Once the command is issued, the core will immediately deassert [`STATUS.READY`](../data/spi_host.hjson#status), and once the command has started [`STATUS.ACTIVE`](../data/spi_host.hjson#status) will go high.
+The command is complete when [`STATUS.ACTIVE`](../data/spi_host.hjson#status) goes low.
+A `spi_event` interrupt can also be triggered to go off on completion by setting [`EVENT_ENABLE.IDLE`](../data/spi_host.hjson#event_enable).
+
+### Chip Select Masks
+
+Each instance of the SPI_HOST IP supports a parametrizable number of chip select lines (CSB[NumCS-1:0]).
+Each CSB line can be routed either to a single peripheral or to a daisy-chain of peripherals.
+Whenever a segment description is written to the [`COMMAND`](../data/spi_host.hjson#command) register, the  [`CSID`](../data/spi_host.hjson#csid) is sent along with [`COMMAND`](../data/spi_host.hjson#command) and the `CONFIGOPTS` multi-register corresponding to [`CSID`](../data/spi_host.hjson#csid)  to indicate which device is meant to receive the command.
+The SPI_HOST core typically then manages the details of asserting and deasserting the proper CSB line, subject to the timing parameters expressed in [`CONFIGOPTS.CSNLEAD`](../data/spi_host.hjson#configopts), [`CONFIGOPTS.CSNTRAIL`](../data/spi_host.hjson#configopts), and [`CONFIGOPTS.CSNIDLE`](../data/spi_host.hjson#configopts).
+
+If [Pass-through mode](#pass-through-mode) is enabled then the CSB lines are controlled by *neither* the SPI_HOST hardware nor the firmware register.
+In Pass-through mode, control of the CSB lines passes directly to the inter-module port, `passthrough_i.csb`.
+
+### Back-to-back Segments
+
+The command interface allows for any number of segments in a given command.
+
+Since most SPI Flash transactions typically consist of 3 or 4 segments, there is a small command FIFO for submitting segments to the SPI_HOST IP, so that firmware can issue the entire transaction at one time.
+
+Writing a segment description to [`COMMAND`](../data/spi_host.hjson#command) when [`STATUS.READY`](../data/spi_host.hjson#status) is low will trigger an error condition, which must be acknowledged by software.
+When submitting multiple segments to the command queue, firmware can also check the [`STATUS.CMDQD`](../data/spi_host.hjson#status) register to determine how many unprocessed segments are in the FIFO.
+
+## Data Formatting
+
+### Input and Output Byte Ordering
+
+The SPI transactions must be issued with correct bit ordering to properly communicate with a remote device.
+Based on the requirements for our chosen flash devices, this IP follows these conventions:
+- The relative significance of lines on the SD bus: SD[0] is always the least significant, followed by SD[1] through SD[3] with increasing significance.
+- The relative significance of a sequence of bits on the same SD bus: more significant bits are always transmitted before (or at the same time as) less significant bits.
+    - For instance, when transferring a single byte in Quad mode, all four bits of the upper nibble (bits 7 through 4) are transferred in the first clock cycle and the entire lower nibble (bits 3 through 0) is transferred in the second cycle.
+
+The programming model for the IP should meanwhile make it easy to quickly program the peripheral device, with a minimum amount of byte shuffling.
+It should be intuitive to program the specific flash devices we are targeting, while following the conventions above:
+- When transferring data in from the [`RXDATA`](../data/spi_host.hjson#rxdata) memory window or out to the [`TXDATA`](../data/spi_host.hjson#txdata) window, the IP should fully utilize the TL-UL bus, using 32-bit I/O instructions.
+- The SPI_HOST should make it easy to arrange transaction data in processor memory, meaning that bytes should be sequentially transmitted in order of ascending memory address.
+  - When using 32-bit I/O instructions, this requires some knowledge of the processor byte-order.
+
+Based on these requirements, data read from [`RXDATA`](../data/spi_host.hjson#rxdata) or placed in [`TXDATA`](../data/spi_host.hjson#txdata) are handled as follows:
+- 32-bit words placed in [`TXDATA`](../data/spi_host.hjson#txdata) are transmitted in first-in-first-out order.
+Likewise, words received from the SPI data lines are made available for reading from [`RXDATA`](../data/spi_host.hjson#rxdata) in first-in-first-out order.
+- Within a 32-bit word, the `ByteOrder` parameter controls the order in which bytes are transmitted, and also the manner in which received bytes are eventually arranged in the 32-bit [`RXDATA`](../data/spi_host.hjson#rxdata) register.
+By default (`ByteOrder` = 1, for Little-Endian processors), the LSB of [`TXDATA`](../data/spi_host.hjson#txdata) (i.e., bits 7 through 0) is transmitted first, and the other bytes follow in order of increasing significance.
+Similarly, the first byte received is packed into the LSB of [`RXDATA`](../data/spi_host.hjson#rxdata), and the subsequent bytes of each [`RXDATA`](../data/spi_host.hjson#rxdata) word are packed in order of increasing significance.
+
+On the other hand, if `ByteOrder` is set to 0 (for Big-Endian processors), the MSB is transmitted first from [`TXDATA`](../data/spi_host.hjson#txdata), and received data is loaded first into the MSB of [`RXDATA`](../data/spi_host.hjson#rxdata).
+   - The default choice of Little-Endian reflects the native byte order of the Ibex processor.
+- Finally *within a given byte*, the most significant bits are transmitted and received first.
+For Dual and Quad transactions the least significant bit in any instantaneous pair or nibble is transmitted or received on SD[0], and the remaining SD bits (1 through 3) are populated in order of increasing significance.
+
+The following figure shows how data appears on the serial data bus when the hardware reads it from [`TXDATA`](../data/spi_host.hjson#txdata) or writes it to [`RXDATA`](../data/spi_host.hjson#rxdata).
+
+```wavejson
+ {signal: [
+  ["ByteOrder=0",
+  {name: "SD[0] (host output)", wave: "x22222222222|2222|222|22x", data: ["t[31]", "t[30]", "t[29]", "t[28]", "t[27]", "t[26]", "t[25]", "t[24]", "t[23]","t[22]",
+                                                                          "t[21]","t[17]","t[16]","t[15]","t[14]","t[8]", "t[7]", "t[6]", "t[1]", "t[0]"]},
+  {name: "SD[1] (host input)", wave: "x22222222222|2222|222|22x", data: ["r[31]", "r[30]", "r[29]", "r[28]", "r[27]", "r[26]", "r[25]", "r[24]", "r[23]","r[22]",
+                                                                         "r[21]","r[17]","r[16]","r[15]","r[14]","r[8]", "r[7]", "r[6]", "r[1]", "r[0]"]},
+  {name: "Which byte?", wave: "x4.......4..|..4.|.4.|..x", data: ["DATA MSB", "","", "          LSB"]}
+],
+  ["ByteOrder=1",
+  {name: "SD[0] (host output)", wave: "x22222222222|2222|222|22x", data: ["t[7]", "t[6]", "t[5]", "t[4]", "t[3]", "t[2]", "t[1]", "t[0]", "t[15]","t[14]",
+                                                                          "t[13]","t[9]","t[8]","t[23]","t[22]","t[16]", "t[31]", "t[30]", "t[25]", "t[24]"]},
+  {name: "SD[1] (host input)", wave: "x22222222222|2222|222|22x", data: ["r[7]", "r[6]", "r[5]", "r[4]", "r[3]", "r[2]", "r[1]", "r[0]", "r[15]","r[14]",
+                                                                         "r[13]","r[9]","r[8]","r[23]","r[22]","r[16]", "r[31]", "r[30]", "r[25]", "r[24]"]},
+  {name: "Which byte?", wave: "x5.......5..|..5.|.5.|..x", data: ["DATA LSB", "","", "          MSB"]}
+],
+  ],
+  head: {
+   text: "Serial bit ordering for 32-bit data words written to DATA (t[31:0]) or read from DATA (r[31:0]) as a Function of the Parameter 'ByteOrder'",
+  },
+  foot: {
+  text: "Standard SPI, bidirectional segment.  Bits are numbered as they appear in the DATA memory window"
+  }
+}
+```
+
+
+As shown in the following figure, a similar time-ordering scheme applies for Dual- and Quad-mode transfers.
+However, many bits of similar significance are packed into multiple parallel SD data lines, with the least significant going to SD[0].
+
+```wavejson
+{signal: [
+  ["ByteOrder=0",
+  {name: "SD[0]", wave: "x...22334455x...", data: ["d[28]", "d[24]", "d[20]", "d[16]", "d[12]", "d[8]", "d[4]", "d[0]"]},
+  {name: "SD[1]", wave: "x...22334455x...", data: ["d[29]", "d[25]", "d[21]", "d[17]", "d[13]", "d[9]", "d[5]", "d[1]"]},
+  {name: "SD[2]", wave: "x...22334455x...", data: ["d[30]", "d[26]", "d[22]", "d[18]", "d[14]", "d[10]", "d[6]", "d[2]"]},
+  {name: "SD[3]", wave: "x...22334455x...", data: ["d[31]", "d[27]", "d[23]", "d[19]", "d[15]", "d[11]", "d[7]", "d[3]"]},
+],
+   ["ByteOrder=1",
+  {name: "SD[0]", wave: "x...55443322x...", data: ["d[4]", "d[0]", "d[12]", "d[8]", "d[20]", "d[16]", "d[28]", "d[24]"]},
+  {name: "SD[1]", wave: "x...55443322x...", data: ["d[5]", "d[1]", "d[13]", "d[9]", "d[21]", "d[17]", "d[29]", "d[25]"]},
+  {name: "SD[2]", wave: "x...55443322x...", data: ["d[6]", "d[2]", "d[14]", "d[10]", "d[22]", "d[18]", "d[30]", "d[26]"]},
+  {name: "SD[3]", wave: "x...55443322x...", data: ["d[7]", "d[3]", "d[15]", "d[11]", "d[23]", "d[19]", "d[31]", "d[27]"]},
+  ],
+  ],
+  head: {
+   text: "Serial bit ordering for 32-bit data word (d[31:0]), Quad SPI as a Function of the Parameter 'ByteOrder'",
+  },
+  foot: {
+  text: "(Bits are numbered as they appear when loaded into DATA memory window)"
+  }
+}
+```
+
+### Command Length and Alignment in DATA
+
+Even though the [`TXDATA`](../data/spi_host.hjson#txdata) memory window typically accepts 32-bit words, command segments do not need to use all the bytes from every word.
+
+For TX (or Bidirectional) segments, unused bytes from the latest TX FIFO word are simply ignored at the end of a segment.
+For RX (or Bidirectional) segments, if the last few bytes received do not fill an entire DATA word, the partial word will be zero-padded and inserted into the RX FIFO once the segment is completed.
+If ByteOrder=1 (the default, Little-Endian case), this padding will fill the unused most-significant bytes of the final RX DATA word, otherwise the padding will fill the unused least-significant bytes.
+
+The following waveform illustrates an example SPI transaction, where neither the data transmitted nor the data received in each segment fit into an even number of 32-bit words.
+In this example, the values `I[31:0]`, `A[31:0]` and `B[31:0]`, have been previously written into [`TXDATA`](../data/spi_host.hjson#txdata) via firmware, and afterwards one word, `X[31:0]`, is available for reading from [`RXDATA`](../data/spi_host.hjson#rxdata).
+All data in the waveform is transferred using 32-bit instructions.
+
+```wavejson
+{signal: [
+  {name: "Segment number", wave: "x2.......2.........2.2.x", data: "1 2 3 4"},
+  {name: "Speed", wave: "x2.......2.........2.2.x", data: "Standard Quad X Quad"},
+  {name: "Direction", wave: "x2.......2.........2.2.x", data: "TX TX Dummy RX"},
+  {name: "Length", wave: "x2.......2.........2.2.x", data: "1 5 2 1"},
+  ["ByteOrder=0",
+  {name: "SD[0]", wave: "x222222222233445522z.22x", data: ["I[31]", "I[30]", "I[29]", "I[28]", "I[27]", "I[26]", "I[25]", "I[24]",
+                                                           "A[28]", "A[24]", "A[20]", "A[16]", "A[12]", "A[8]",  "A[4]", "A[0]", "B[28]", "B[24]", "X[28]", "X[24]"]},
+  {name: "SD[1]", wave: "xz.......2233445522z.22x", data: ["A[29]", "A[25]", "A[21]", "A[17]", "A[13]", "A[9]",  "A[5]", "A[1]", "B[29]", "B[25]", "X[29]", "X[25]"]},
+  {name: "SD[2]", wave: "xz.......2233445522z.22x", data: ["A[30]", "A[26]", "A[22]", "A[18]", "A[14]", "A[10]", "A[6]", "A[2]", "B[30]", "B[26]", "X[30]", "X[26]"]},
+  {name: "SD[3]", wave: "xz.......2233445522z.22x", data: ["A[31]", "A[27]", "A[23]", "A[19]", "A[15]", "A[11]", "A[7]", "A[3]", "B[31]", "B[27]", "X[31]", "X[27]"]},
+],
+   {name:""},
+   ["ByteOrder=1",
+  {name: "SD[0]", wave: "x555555555544332255z.55x", data: ["I[7]", "I[6]", "I[5]", "I[4]", "I[3]", "I[2]", "I[1]", "I[0]",
+                                                           "A[4]", "A[0]", "A[12]", "A[8]",  "A[20]", "A[16]", "A[28]", "A[24]", "B[4]", "B[0]", "X[4]", "X[0]"]},
+  {name: "SD[1]", wave: "xz.......5544332255z.55x", data: ["A[5]", "A[1]", "A[13]", "A[9]",  "A[21]", "A[17]", "A[29]", "A[25]", "B[5]", "B[1]", "X[5]", "X[1]"]},
+  {name: "SD[2]", wave: "xz.......5544332255z.55x", data: ["A[6]", "A[2]", "A[14]", "A[10]", "A[22]", "A[18]", "A[30]", "A[26]", "B[6]", "B[2]", "X[6]", "X[2]"]},
+  {name: "SD[3]", wave: "xz.......5544332255z.55x", data: ["A[7]", "A[3]", "A[15]", "A[11]", "A[23]", "A[19]", "A[31]", "A[27]", "B[7]", "B[3]", "X[7]", "X[3]"]},
+  ],
+  ],
+  head: {
+    text: "Serial bit ordering for 6 bytes transmitted from FIFO words 'I[31:0], A[31:0]' and 'B[31:0]', and 1 byte received into word 'X[31:0]'",
+  },
+  foot: {
+    text: "Command consists of 4 segments, all TX data is written to DATA using 32-bit memory instructions (all bytes enabled)"
+  }
+}
+```
+
+When packing data into the TX FIFO, there are also no restrictions on the alignment of the data written to the [`TXDATA`](../data/spi_host.hjson#txdata) memory window, as it supports byte-enable signals.
+This means that when copying bytes into [`TXDATA`](../data/spi_host.hjson#txdata) from unaligned firmware memory addresses, it is possible to use byte or half-word instructions.
+Full-word instructions should however be used whenever possible, because each write consumes a full word of data in the TX FIFO regardless of the instruction size.
+Smaller writes will thus make inefficient use of the TX FIFO.
+
+Filtering out disabled bytes consumes clock cycles in the data pipeline, and can create bubbles in the transmission of SPI_DATA.
+In the worst case, such bubbles can also be interpreted as transient underflow conditions in the TX FIFO, and could trigger spurious interrupts.
+The longest delays occur whenever a word is loaded into the TX FIFO with only one byte enabled.
+
+When writing to the [`TXDATA`](../data/spi_host.hjson#txdata) window, only three types of data are expected: individual bytes, half-words, and full-words.
+Other types of write transactions (i.e., non-contiguous, zero-byte and three-byte writes) are not supported by most processors.
+Therefore it is assumed that if such transactions do appear, it is likely a sign of a system integrity error, and so these other classes of writes are not supported.
+
+If such transactions ever occur, they trigger an "Invalid Access" error event, which suspends the processing of future commands until the error has been cleared by setting the [`ERROR_STATUS.ACCESSINVAL`](../data/spi_host.hjson#error_status) bit.
+
+The RX FIFO has no special provisions for packing received data in any unaligned fashion.
+Depending on the `ByteOrder` parameter, the first byte received is always packed into either the most- or least-significant byte read from the [`RXDATA`](../data/spi_host.hjson#rxdata) memory window.
+
+
+## Pass-through Mode
+
+The SPI_HOST also supports a special "Pass-through" mode, which allows for the direct control of the serial interface by another block (namely SPI_DEVICE).
+This feature is entirely controlled by intermodule signals `passthrough_i` and `passthrough_o`, which control a set of multiplexers.
+If `passthrough_i.passthrough_en` is asserted the SPI_HOST peripheral bus signals reflect the corresponding signals in the `passthrough_i` structure.
+Otherwise, the peripheral signals are controlled by the SPI_HOST FSM and the internal shift register.
+
+## Interrupt Aggregation
+
+In order to reduce the total number of interrupts in the system, the SPI_HOST has only two interrupt lines: `error` and `spi_event`.
+Within these two interrupt classes, there are a number of conditions which can trigger them.
+
+Each interrupt class has a secondary status and mask register, to control which sub-classes of SPI events will cause an interrupt.
+
+### SPI Events and Event Interrupts
+
+The SPI_HOST supports interrupts for the following SPI events:
+
+- `IDLE`: The SPI_HOST is idle.
+- `READY`: The SPI_HOST is ready to accept a new command.
+- `RXFULL`: The SPI_HOST has run out of room in the RXFIFO.
+- `RXWM`: The number of 32-bit words in the RXFIFO currently exceeds the value set in [`CONTROL.RX_WATERMARK`](../data/spi_host.hjson#control).
+- `TXEMPTY`: The SPI_HOST has transmitted all the data in the TX FIFO.
+- `TXWM`: The number of 32-bit words in the TX FIFO is currently less than the value set in [`CONTROL.TX_WATERMARK`](../data/spi_host.hjson#control).
+
+Most SPI events signal a particular condition that persists until it is fixed, and these conditions can be detected by polling the corresponding field in the [`STATUS`](../data/spi_host.hjson#status) register.
+
+In addition to these events, there are also two additional diagnostic fields in the [`STATUS`](../data/spi_host.hjson#status) register:
+- `RXSTALL`: The RX FIFO is full, and the SPI_HOST is stalled and waiting for firmware to remove some data.
+- `TXSTALL`: The TX FIFO is not only empty, but the SPI_HOST is stalled and waiting for firmware to add more data.
+
+These bits can provide diagnostic data for tuning the throughput of the device, but do not themselves generate event interrupts.
+
+By default none of these SPI events trigger an interrupt.
+They need to be enabled by writing to the corresponding field in [`EVENT_ENABLE`](../data/spi_host.hjson#event_enable).
+
+The SPI event interrupt is signaled only when the IP enters the corresponding state.
+For example if an interrupt is requested when the TX FIFO is empty, the IP will only generate one interrupt when the last data word is transmitted from the TX FIFO.
+In this case, no new interrupts will be created until more data has been added to the FIFO, and all of it has been transmitted.
+
+#### Stall Conditions
+
+The SPI_HOST IP will temporarily suspend operations if it detects a potential overflow of the RX FIFO or an attempted underflow of the TX FIFO.
+During a stall event, `csb` remains active, and there are no `sck` clock ticks until there is more data to transmit or there is some space to receive more data.
+The `RXSTALL` and `TXSTALL` status bits are meant to inform firmware of such halts.
+Due to implementation details the SPI_HOST IP will also pause, and signal a stall condition, if there are delays related to packing or unpacking the SPI_DATA into 32-bit words.
+The exact conditions for these *transient* stall conditions are implementation dependent, and described in detail in [the Design Details section](#bubbles-in-the-data-pipeline).
+
+### Error Interrupt Conditions
+
+There are six types of error events which each represent a violation of the SPI_HOST programming model:
+- If [`COMMAND`](../data/spi_host.hjson#command) is written when [`STATUS.READY`](../data/spi_host.hjson#status) is zero, the IP will assert [`ERROR_STATUS.CMDERR`](../data/spi_host.hjson#error_status).
+- The IP asserts [`ERROR_STATUS.OVERFLOW`](../data/spi_host.hjson#error_status) if it receives a write to [`TXDATA`](../data/spi_host.hjson#txdata) when the TX FIFO is full.
+- The IP asserts [`ERROR_STATUS.UNDERFLOW`](../data/spi_host.hjson#error_status) if software attempts to read [`RXDATA`](../data/spi_host.hjson#rxdata) when the RX FIFO is empty.
+- Specifying a command segment with an invalid width (speed), or making a request for a Bidirectional Dual- or Quad-width segment will trigger an [`ERROR_STATUS.CMDINVAL`](../data/spi_host.hjson#error_status) error event.
+- Submitting a command segment to an invalid CSID (one larger than or equal to `NumCS`) will trigger a [`ERROR_STATUS.CSIDINVAL`](../data/spi_host.hjson#error_status) event.
+- [`ERROR_STATUS.ACCESSINVAL`](../data/spi_host.hjson#error_status) is asserted if the IP receives a write event to the [`TXDATA`](../data/spi_host.hjson#txdata) window that does not correspond to any known processor data type (byte, half- or full-word).
+
+All of these programming violations will create an error event when they occur.
+They will also halt the IP until the corresponding bit is cleared in the [`ERROR_STATUS`](../data/spi_host.hjson#error_status) register.
+Whenever an error event occurs, the error must be acknowledged by clearing (write 1 to clear) the corresponding bit in [`ERROR_STATUS`](../data/spi_host.hjson#error_status).
+
+By default all error events will trigger an `error` interrupt.
+Clearing the corresponding bit in the [`ERROR_ENABLE`](../data/spi_host.hjson#error_enable) register suppresses interrupts for that class of error event and allows the IP to proceed even if one of these errors has occurred.
+The [`ERROR_STATUS`](../data/spi_host.hjson#error_status) register will continue to report all violations even if a particular class of error event has been disabled.
+
+Of the six error event classes, `ACCESSINVAL` error events are the only ones which cannot be disabled.
+This is because `ACCESSINVAL` events are caused by anomalous TLUL byte-enable masks that do not correspond to any known software instructions, and can only occur through a fault in the hardware integration.
+
+When handling SPI_HOST `error` interrupts, the [`ERROR_STATUS`](../data/spi_host.hjson#error_status) bit should be cleared *before* clearing the error interrupt in the [`INTR_STATE`](../data/spi_host.hjson#intr_state) register.
+Failure to do so may result in a repeated interrupt.
+
+## Status Indicators
+
+The [`STATUS`](../data/spi_host.hjson#status) register contains a number of fields that should be queried for successful operation or troubleshooting.
+
+The register [`STATUS.ACTIVE`](../data/spi_host.hjson#status) indicates whether a command segment is currently being processed by the FSM.
+Even if [`STATUS.ACTIVE`](../data/spi_host.hjson#status) is high it is often still possible to insert another command segment into the command FIFO.
+The register [`STATUS.READY`](../data/spi_host.hjson#status) indicates that there is room in the command FIFO.
+
+The [`STATUS.BYTEORDER`](../data/spi_host.hjson#status) field indicates the fixed value of the `ByteOrder` parameter, which is presented to software to confirm the byte ordering used in the [`RXDATA`](../data/spi_host.hjson#rxdata) and [`TXDATA`](../data/spi_host.hjson#txdata) windows.
+
+The 8-bit fields [`STATUS.RXQD`](../data/spi_host.hjson#status) and [`STATUS.TXQD`](../data/spi_host.hjson#status) respectively indicate the number of words currently stored in the RX and TX FIFOs.
+
+The remaining fields in the [`STATUS`](../data/spi_host.hjson#status) register are all flags related to the management of the TX and RX FIFOs, which are described in the [section on SPI Events](#spi-events-and-event-interrupts).
+
+## Other Registers
+
+### SPI_HOST Enable
+
+The SPI_HOST state machine is disabled on reset.
+Before any commands are processed, the block must be enabled by writing one to the [`CONTROL.SPIEN`](../data/spi_host.hjson#control) register.
+Writing a zero to this register temporarily suspends any previously submitted transactions.
+If the block is re-enabled by writing a one to [`CONTROL.SPIEN`](../data/spi_host.hjson#control), any previously executing commands will continue from wherever they left off.
+
+An unacknowledged error event suspends the core state machine.
+
+### SPI_HOST Output Enable
+
+In addition to enabling the SPI_HOST FSM, the SPI_HOST outputs must also be enabled for successful operation.
+This can be achieved by also setting the [`CONTROL.OUTPUT_EN`](../data/spi_host.hjson#control) field when enabling the SPI_HOST FSM.
+
+### Component reset
+
+In addition to the global hardware reset, there is a software reset option which completely resets the SPI host.
+To use this reset, assert [`CONTROL.SW_RST`](../data/spi_host.hjson#control), and then wait for the device to reset ([`STATUS.ACTIVE`](../data/spi_host.hjson#status), [`STATUS.TXQD`](../data/spi_host.hjson#status) and [`STATUS.RXQD`](../data/spi_host.hjson#status) to all go to zero), before releasing [`CONTROL.SW_RST`](../data/spi_host.hjson#control).
+
+## Block Diagram
+
+![](../doc/spi_host_block_diagram.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/spi_host.hjson#interfaces)
+
+# Design Details
+
+## Component Overview
+
+Transaction data words flow through the SPI_HOST IP in a path which starts with the TX FIFOs, shown in the block diagram above.
+At the output of the TX FIFOs each data word is separated into individual bytes by the Byte Select block, which is also responsible for parsing the byte-enable mask and discarding unwanted bytes.
+Selected bytes are then passed into the shift register, where they are played out at Standard, Dual, or Quad speed.
+For receive segments, outputs from the shift register are passed into the Byte Merge block to be packed into 32-bit words.
+Finally the repacked words are inserted into the RX FIFO to be read by firmware.
+
+All of the blocks in the data path use ready-valid handshakes for flow control.
+In addition, the Byte Select block expects a `flush` pulse from the shift register to signify when no further data is needed for the current segment, and so any remaining data in the current word can be discarded.
+Likewise, the Byte Merge block receives a `last` signal from the shift register to identify the end of a command segment so that any partial words can be passed into the RX FIFO (regardless of whether the last byte forms a complete 32-bit word).
+The shift register is then responsible for driving and receiving data on the `cio_sd` lines.
+It coordinates all of the data flow to and from the Byte Select and Byte Merge blocks.
+
+The SPI_HOST FSM parses the software command segments and orchestrates the proper transmission of data through its control of the shift register.
+The FSM directly drives the `cio_sck` and `cio_csb` signals at the commanded speed.
+It also controls the shift register: dictating the correct timing for sending out each beat of data, loading bytes from the Byte Select, and sending bytes on to the Byte Merge block.
+
+## RX and TX FIFOs
+
+The RX and TX FIFOs store the received and transmitted data, respectively, and are implemented as synchronous FIFOs.
+The RX FIFO is 32 bits wide, matching the width of the TLUL register bus.
+The TX FIFO on the other hand is 36 bits wide, with 32 bits of SPI data (again to match the TLUL bus width) plus 4 byte enable-bits, which are passed into the core to allow the processing of unaligned writes.
+
+The depth of these FIFOs is controlled by two independent parameters for the RX and TX queues.
+
+## Byte Select
+
+The Byte Select unit is responsible for loading words from the FIFO and feeding individual bytes into the shift register.
+This unit takes two data inputs: a data word, `word_i[31:0]`, and a byte enable signal, `word_be_i[3:0]`.
+There is a single output, `byte_o[7:0]`, which feeds the following shift register.
+There are ready/valid signals for managing flow control on all inputs and outputs.
+The shift register asserts ready to request new bytes, based on control inputs from the SPI_HOST FSM.
+
+When the SPI_HOST FSM indicates the final byte for a segment, the shift register asserts the `flush_i` signal with `byte_ready_i` as it requests the last byte from the Byte Select.
+This instructs the Byte Select block to send one more byte from the current word, and then discard any remaining unused bytes, before immediately loading the next available word from the TX FIFO.
+
+It is assumed that the input data-words and byte enables have already been byte-swapped at the IP top level, as needed.
+The bytes are transmitted to the shift register in decreasing significance, starting with `word_i[31:24]`, followed by `word_i[23:16]`, `word_i[15:8]`, and finally `word_i[7:0]`.
+
+Some bytes may be skipped however if the corresponding value of `word_be_i[3:0]` is zero.
+For example if `word_be_i[3:0]` equals `4'b0011`, then the first two input bytes will be skipped, and only `word_i[15:8]` and `word_i[7:0]` will be forwarded, in that order.
+
+The following waveform illustrates the operation of the Byte Select module, highlighting the effect of the `flush_i` signal (in the first input word), as well as the effect of the byte enable signal (shown in the second word).
+
+```wavejson
+{signal: [
+  {name: "clk_i", wave:           "p............."},
+  {name: "word_i[31:0]", wave:    "x2..x2...x....", data: ["32'hBEADCAFE", "32'hDAD5F00D"]},
+  {name: "word_be_i[3:0]", wave:  "x2..x2...x....", data: ["4'b1111", "4'b0011"]},
+  {name: "word_valid_i", wave:    "0..101...0...."},
+  {name: "word_ready_o",wave:     "1...0...10...."},
+  {name: "byte_o[7:0]", wave:     "x...2222.2222x", data: ["BE", "AD", "CA", "0", "DA", "D5", "F0", "0D"]},
+  {name: "byte_valid_o", wave:    "0...1..0...1.0"},
+  {name: "byte_ready_i", wave:    "1............."},
+  {name: "byte_flush_i", wave:    "0.....10......"},
+  ],
+  head: {
+  text: "Byte Select Operation"
+  }
+}
+```
+
+## Byte Merge
+
+The Byte Merge block is responsible for accumulating bytes from the shift register and packing them into words.
+Like the Byte Select block, it is based on the `prim_packer_fifo` primitive.
+
+The Byte Merge block has a data byte input, and a data word output, which are both controlled by their corresponding ready/valid signals.
+There are no byte-enable outputs for the byte merge, as it is assumed that software can infer the relevant bytes based on the length of the relevant read command segment.
+
+There is a `byte_last_i` signal, to indicate the final byte in a word.
+If `byte_last_i` is asserted whenever a byte is loaded, the new byte will be added to the output word, and any remaining bytes will be set to zero, before the word is loaded into the RX FIFO.
+
+Input bytes are packed into the output word in decreasing significance.
+The first byte in each segment is loaded into `word_o[31:24]`.
+The following bytes are packed into `word_o[23:16]`, `word_o[15:8]`, and then `word_o[7:0]`.
+For partially filled words, the zero padding goes into the least significant byte positions.
+
+Any ByteOrder swapping is performed at the other end of the RX FIFO.
+
+```wavejson
+{signal: [
+  {name: "clk_i",        wave: "p.............."},
+  {name: "byte_i[7:0]",  wave: "x22222.2....22x", data: ["01", "02", "03", "04", "05", "06", "07", "08"]},
+  {name: "byte_valid_i", wave: "01............."},
+  {name: "byte_last_i",  wave: "0....1.0......."},
+  {name: "byte_ready_o", wave: "1....010...1..."},
+  {name: "word_o[31:0]", wave: "2.2222222222222", data: ["0", "01","0102","010203", "01020304", "0", "05", "0500", "05000", "050000", "0", "06", "0607", "060708"]},
+  {name: "word_valid_o", wave: "0....10...10..."},
+  {name: "word_ready_i", wave: "1.............."}
+  ],
+ config: {hscale:2},
+  head: {
+  text: "Byte Merge Operation"
+  }
+}
+```
+
+## Shift Register
+
+The SPI_HOST shift register serially transmits and receives all bytes to the `sd_o[3:0]` and `sd_i[3:0]` signals, based on the following timing-control signals from the FSM:
+- `speed_i`: Controls the speed of the current data segment, ranging from `Standard` or `Dual` to `Quad`
+- `wr_en_i`: Writes a new byte from the Byte Select into the 8-bit shift register
+This is usually the first signal issued to the shift register in command segments with data to transmit (i.e., TX only, or bidirectional segments)
+   - There is also a `wr_ready_o` output to tell the FSM that there is no data currently available.
+     If `wr_ready_o` is deasserted when the FSM asserts `wr_en_i`, the FSM will stall.
+- `last_write_i`: When asserted at the same time as `wr_en_i`, this indicates that the current byte is the last of its command segment, and thus the `tx_flush_o` signal should be asserted when requesting this byte from the Byte Select block.
+- `shift_en_i`: Advances the shift register by 1, 2, or 4 bits, depending on the value of `speed_i`
+- `full_cyc_i`: Indicates full-cycle operation (i.e., input data are sampled from `sd_i` whenever new data is shifted out to `sd_o`)
+- `sample_en_i`: Samples `sd_i[3:0]` into a temporary register, `sd_i_q[3:0]` so it can be loaded into the shift register with the next assertion of `shift_en_i`
+Explicit sampling is particularly required for Standard SPI bidirectional segments, where new input data arrives before the first output shift operation.
+For consistency in timing, the `sd_i_q` buffer is used in all other modes as well, unless `full_cyc_i` is asserted.
+The `sample_en_i` signal is ignored during full-cycle operation, in which case data is copied directly into the shift register during shift operations.
+- `rd_en_i`: Indicates that the current byte from the shift register should be transferred on to the Byte Merge block
+   - The `rd_ready_o` output informs the FSM whenever all data storage (the RX FIFO plus any intervening buffers) is full and no further data can be acquired.
+- `last_read_i`: When asserted at the same time as `rd_en_i`, this indicates that the current byte is the last of its command segment, and thus the `rx_last_o` signal should be asserted when passing this byte to the Byte Merge block.
+
+```wavejson
+{signal: [
+  {name: "clk_i",                   wave: "p.........................."},
+ [ "External signals",
+  {name: "TX DATA[31:0] (TX FIFO)", wave: "2..........................", data:"0x123456XX"},
+  {name: "cio_sck_o (FSM)",         wave: "0...1010101010101010101010."},
+ ],
+  {name: "cio_csb_o[0] (FSM)",      wave: "1..0......................."},
+  {name: "tx_data_i[7:0]",          wave: "2..2...............2.......", data:["0x12", "0x34", "0x56"]},
+  {name: "tx_valid_i",              wave: "1.........................."},
+  {name: "tx_ready_o/wr_en_i",      wave: "0.10..............10......."},
+  {name: "sample_en_i",             wave: "0..101010101010101010101010"},
+  {name: "shift_en_i",              wave: "0...10101010101010..1010101"},
+  {name: "speed_i[1:0]",            wave: "2..........................", data: ["0 (Standard SPI)"]},
+  {name: "sd_i[1]",                 wave: "x..1.1.0.0.1.1.1.1.0.1.0.1."},
+  {name: "sd_i_q[1]",               wave: "x...1.1.0.0.1.1.1.1.0.1.0.1"},
+  {name: "sr_q[0]",                 wave: "x..0.1.1.0.0.1.1.1.0.1.0.1."},
+  {name: "sr_q[1]",                 wave: "x..1.0.1.1.0.0.1.1.0.0.1.0."},
+  {name: "sr_q[2]",                 wave: "x..0.1.0.1.1.0.0.1.1.0.0.1."},
+  {name: "sr_q[3]",                 wave: "x..0.0.1.0.1.1.0.0.0.1.0.0."},
+  {name: "sr_q[4]",                 wave: "x..1.0.0.1.0.1.1.0.1.0.1.0."},
+  {name: "sr_q[5]",                 wave: "x..0.1.0.0.1.0.1.1.1.1.0.1."},
+  {name: "sr_q[6]",                 wave: "x..0.0.1.0.0.1.0.1.0.1.1.0."},
+  {name: "sr_q[7]",                 wave: "x..0.0.0.1.0.0.1.0.0.0.1.1."},
+  {name: "sr_q[7:0] (hex)",         wave: "x..4.2.2.2.2.2.2.2.4.2.2.2.",
+   data: ["0x12", "0x25", "0x4B", "0x96", "0x2c", "0x59", "0xB3", "0x67", "0x34", "0x69", "0xD2", "0xA5"]},
+  {name: "Load Input Data Event",   wave: "1..H...............H......."},
+  {name: "rx_data_o[7:0]", wave: "x..................2.......", data: ["0xcf"]},
+  {name: "rx_valid_o[7:0]/rd_en_i", wave: "0.................10......."},
+  {name: "sd_o[0] (sr_q[7])", wave: "x..0.0.0.1.0.0.1.0.0.0.1.1."},
+],
+head: {
+  text: "Shift Register During Standard SPI Transaction: Simultaneous Receipt and Transmission of Data."
+},
+}
+```
+
+The connection from the shift register to the `sd` bus depends on the speed of the current segment.
+- In Standard-mode, only the most significant shift register bit, `sr_q[7]` is connected to the outputs using `sd_o[0]`.
+In this mode, each `shift_en_i` pulse induces a shift of only one bit.
+- In Dual-mode, the two most significant bits, `sr_q[7:6]`, are connected to `sd_o[1:0]` and the shift register shifts by two bits with every `shift_en_i` pulse.
+- In Quad-mode, the four most significant bits, `sr_q[7:4]`, are connected to `sd_o[3:0]` and the shift register shifts four bits with every pulse.
+
+The connections to the shift register inputs are similar.
+Depending on the speed, the `sd_i` inputs are routed to the 1, 2, or 4 least significant inputs of the shift register.
+In full-cycle mode, the shift register LSB's are updated directly from the `sd_i` inputs.
+Otherwise the data first passes through an input sampling register, `sd_i_q[3:0]`, which allows the input sampling events to be staggered from the output shift events.
+
+### Bubbles in the Data Pipeline
+
+Temporary delays in the transmission or receipt of data are a performance issue.
+Stall events, which temporarily halt operation of the SPI_HOST IP, often indicate that software is not keeping up with data in the TX and RX FIFOs.
+For this reason the SPI_HOST IP can create interrupts to help monitor the frequency of these stall events, in order to identify correctable performance delays.
+
+There is also the possibility of encountering bubble events, which cause transient stalls in the data pipeline.
+Transient stalls only occur for Quad-mode segments, and only when transmitting or receiving words with only one valid byte.
+
+When transmitting at full clock speed, Quad-mode segments need to process one byte every four clock cycles.
+If a particular Quad TX segment pulls only one byte from a particular data word (for reasons related either to the segment length or odd data alignment), the `prim_packer_fifo` used in the Byte Select block can generate delays of up to four clocks before releasing the next byte.
+This can cause temporary stall conditions either during the Quad segment, or--if there is another TX segment immediately following--just before the following segment.
+
+Similar delays exist when receiving Quad-mode data, which are similarly worst when packing words with just one byte (i.e., when receiving segments of length 4n+1).
+The RX pipeline is however much more robust to such delays, thanks to buffering in the shift register outputs.
+There is some sensitivity to *repeated* 4 clock delays, but it takes at least six of them to cause a temporary stall.
+So transient RX stalls only occur when receiving more than six consecutive one-byte segments.
+As this is an unlikely use case, transient stalls are considered an unlikely occurrence in the RX path.
+
+Dual- and Standard-mode segments can tolerate byte-to-byte delays of 7 or 15 clocks, so there is no known mechanism for transient stalls at these speeds.
+
+Please refer to [the Appendix](#analysis-of-transient-datapath-stalls) for a detailed analysis of transient stall events.
+
+## SPI_HOST Finite State Machine (FSM)
+
+The SPI_HOST FSM is responsible for parsing the input command segments and configuration settings, which it uses to control the timing of the `sck` and `csb` signals.
+It also controls the timing of shift register operations, coordinating I/O on the `sd` bus with the other SPI signals.
+
+This section describes the SPI_HOST FSM and its control of the `sck` and `csb` lines as well as its interactions with the Shift Register and the Command FIFO.
+
+### Clock Divider
+
+The SPI_HOST FSM is driven by the rising edge of the input clock, however the FSM state registers are not *enabled* during every cycle.
+There is an internal clock counter `clk_cntr_q` which repeatedly counts down from [`CONFIGOPTS.CLKDIV`](../data/spi_host.hjson#configopts) to 0, and the FSM is only enabled when `clk_cntr_q == 0`.
+
+The exception is when the FSM is in one of the two possible Idle states (`Idle` or `IdleCSBActive`), in which case `clk_cntr_q` is constantly held at zero, making it possible to immediately transition out of the idle state as soon as a new command appears.
+Once the FSM transitions out of the idle state, `clk_cntr_q` resets to [`CONFIGOPTS.CLKDIV`](../data/spi_host.hjson#configopts), and FSM transitions are only enabled at the divided clock rate.
+
+As shown in the waveform below, this has the effect of limiting the FSM transitions to only occur at discrete *timeslices* of duration:
+
+$$T_\textrm{timeslice} = (\texttt{clkdiv}+1) \cdot T_{\textrm{clk}}.$$
+
+```wavejson
+{signal: [
+  {name: 'clk',        wave: 'p......................'},
+  {name: 'clkdiv',     wave: '2......................', data: "3"},
+  {name: 'clk_cntr_q', wave: '222222222222......22222', data: "3 2 1 0 3 2 1 0 3 2 1 0 3 2 1 0 3"},
+  {name: 'FSM state',  wave: '2...2.......2.....2...2', data: "WaitTrail WaitIdle Idle WaitLead Hi"              },
+  {name: 'fsm_en',     wave: '0..10......1......0..10'              },
+  {name: 'Timeslice Boundary', wave: "1...H...H...H.....H...H"}
+],
+  edge: ["A<->B min. 9 cycles", "C<->D min. 4 cycles"],
+ head: {text: "Use of FSM Enable Pulses to Realize Multi-Clock Timeslices", tock: 1},
+ foot: { text: "The fsm_en signal is always high in idle states, to allow exit transitions at any time"}
+}
+```
+
+#### Other Internal Counters
+
+In addition to the FSM state register, the SPI_HOST FSM block also has a number of internal registers to track the progress of a given command segment.
+
+- `wait_cntr_q`: This counter is used to hold the FSM in a particular state for several timeslices, in order to implement the `CSNIDLE`, `CSNLEAD` or `CSNTRAIL` delays required for a particular device.
+
+- `byte_cntr_q`, `bit_cntr_q`: These counters respectively track the number of bytes left to transmit for the current segment and the number of bits left to transmit in the current byte.
+
+- Finally, there are registers corresponding to each configuration field (`csid_q`, `cpol_q`, `cpha_q`, `csnidle_q`, `csnlead_q`, `csntrail_q`, and `full_cyc_q`) and to each command segment field (`csaat`, `cmd_rd_en`, `cmd_wr_en`, and `cmd_speed`).
+These registers are sampled whenever a new command comes in, allowing the command inputs to change.
+
+### Basic Operation
+
+The state machine itself is easiest understood by first considering a simple case, with CSAAT set to zero.
+For this initial discussion it is assumed that every command consists of one single segment.
+Multi-segment commands are considered in the following sections.
+In this case the state machine can be simplified to the following figure.
+
+![](../doc/spi_host_fsm_simplified.svg)
+
+The operation of the state machine is the same regardless of the polarity (CPOL) or phase (CPHA) of the current command.
+Commands with `CPOL==0` or `CPOL==1` are processed nearly identically, since the only difference is in the polarity of the `sck` output.
+The state machine drives an internal `sck` clock signal, which is low except when the FSM is in the `InternalClkHigh` state.
+If `CPOL==0` this clock is registered as is to the external `sck` ports.
+If `CPOL==1` the internal clock is *inverted* before the final `sck` output register.
+
+In the following description of the individual states, it is assumed that `CPOL==0`.
+To understand the IP's behavior for transactions with `CPOL==1`, simply invert the value of `sck`.
+
+1. Idle state: In this initial reset state, the `sck` signal is low, and all `csb` lines are high (i.e., inactive).
+An input command is registered whenever `command_valid_i` and `command_ready_o` are both high (i.e., when the signal `new_command = command_valid_i & command_ready_o` is high), in which case the state machine transitions to the `WaitLead` state.
+
+2. WaitLead state: In this state, `sck` remains low, and the `csb` line corresponding to `csid` is asserted-low.
+The purpose of this state is to hold `sck` low for at least `csnlead` + 1 timeslices, before the first rising edge of `sck`.
+For this reason, the FSM uses the `wait_cntr` to track the number of timeslices spent in this state, and only exits when `wait_cntr` counts down to zero, at which point the FSM transitions to the `InternalClkHigh` state.
+
+3. InternalClkHigh state: Entering this state drives `sck` high.
+This state repeats many times per segment, and usually transitions to the `InternalClkLow` state.
+The FSM transitions to the `WaitTrail` state only when the entire segment has been transmitted/received (as indicated by the signals last_bit and last_byte).
+The state machine usually only stays in this state for one timeslice, except when the FSM is disabled or stalled.
+
+4. InternalClkLow state: This state serves to drive `sck` low between visits to the `InternalClkHigh` state.
+This state always returns back to the `InternalClkHigh` state in the next timeslice.
+
+5. WaitTrail state: Similar to the `WaitLead` state, this state serves to control the timing of the `csb` line.
+The FSM uses the `wait_cntr` register to ensure that it remains in this state for `csntrail+1` timeslices, during which time the active `csb` is still held low.
+The `wait_cntr` register resets to [`CONFIGOPTS.CSNTRAIL`](../data/spi_host.hjson#configopts) upon entering this state, and is decremented once per timeslice.
+This state transitions to `WaitIdle` when `wait_cntr` is zero.
+
+6. WaitIdle state: In this timing control state, the FSM uses the `wait_cntr` register to ensure that all `csb` lines are held high for at least `csnidle+1` timeslices.
+The `wait_cntr` register resets to [`CONFIGOPTS.CSNIDLE`](../data/spi_host.hjson#configopts) upon entering this state, and is decremented once per timeslice.
+This state transitions to `Idle` when `wait_cntr` reaches zero.
+
+```wavejson
+{signal: [
+  {name: 'clk', wave: 'p...............'},
+  {name: 'rst_n', wave: '01..............'},
+  {name: 'state', wave: 'x22.42424242.2.2', data: ['Idle', 'WaitLead', 'IntClkHigh', 'IntClkLow', 'IntClkHigh', 'IntClkLow', 'IntClkHigh', 'IntClkLow','IntClkHigh', 'WaitTrail', 'WaitIdle', 'Idle']},
+  {name: 'csb (active device)', wave: 'x10..........1..'},
+  {name: 'csb (all others)', wave: '1...............'},
+  {name: 'sck', wave: '0...10101010....'}
+],
+ config: {hscale: 2}
+}
+```
+
+### Milestone Signals, Serial Data Lines & Shift Register Control
+
+The FSM manages I/O on the `sd` bus by controlling the timing of the shift register control signals: `shift_en_o`, `sample_en_o`, `rd_en_o`, `last_read_o`, `wr_en_o`, and `last_write_o`.
+
+The shift register control signals are managed through the use of three intermediate signals:
+- `byte_starting`: This signal indicates the start of a new byte on the `sd` bus in the *following* clock cycle.
+For Bidirectional or TX segments this signal would indicate that it is time to load a new byte into the shift register.
+This signal corresponds to the FSM's `wr_en_o` port, though that output is suppressed during RX or dummy segments.
+- `byte_ending`: This signal indicates the end of the current `sd` byte in the *current* clock cycle (i.e., the next clock cycle either marks the beginning of a new byte or the end of the current segment).
+As illustrated in the following waveform, the `byte_starting` and `byte_ending` signals are often asserted at the same time, though there is an extra `byte_starting` pulse at the beginning of each command and an extra `byte_ending` pulse at the end.
+For RX and bidirectional command segments, a `byte_ending` pulse generates a `rd_en_o` pulse to the shift register, which transfers the 8-bit contents of the shift register into the RX FIFO via the Byte Merge block.
+- `bit_shifting`: This signal drives the `shift_en_o` control line to the shift register, ejecting the most-significant bits, and updating the `sd` outputs.
+
+These *milestone signals* mark the progress of each command segment.
+
+The coordination of the milestone signals and the shift register controls is shown in the following waveform.
+Since the milestone signal pulses coincide with *entering* particular FSM states, they are derived from the state register *inputs* (i.e., `state_d`), as opposed to the state register outputs (`state_q`).
+
+```wavejson
+{signal: [
+  {name: 'clk', wave: 'p........................'},
+  {name: 'rst_n', wave: '01.......................'},
+  {name: 'state_q',
+   wave: 'x2.2.42424242424242424242', data: "Idle WL Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo",
+   node: '...W..V.............U'},
+  {name: 'csb', wave: 'x1.0.....................'},
+  {name: 'sck', wave: '0....10101010101010101010'},
+  {name: 'state_d',
+   wave: 'x22.42424242424242424242', data: "Idle WL Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo",
+   node: '..Z..Y.............X'},
+  {name: 'byte_starting / wr_en_o',
+   wave: 'x010...............10....',
+   node: '..A................E'},
+  {name: 'byte_ending / rd_en_o',
+   wave: 'x0.................10....',
+   node: '...................F'},
+  {name: 'bit_shifting / shift_en_o',
+   wave: 'x0...10101010101010..1010',
+   node: '.....C'},
+  {name: 'sample_en_o',
+   wave: 'x0.10.1010101010101010101',
+   node: '...B..D'},
+  {name: 'sample_event',
+   wave: '1...H..H.H.H.H.H.H.H.H.H.'},
+  {name:'sd_o',
+   wave:'x..2..2.2.2.2.2.2.2.2.2.2',
+   node:'',
+   data: "A[7] A[6] A[5] A[4] A[3] A[2] A[1] A[0] B[7] B[6]"},
+  {name: 'bit_cntr_q', wave: 'x2.2..2.2.2.2.2.2.2.2.2.2', data: "0 7 6 5 4 3 2 1 0 7 6 5"},
+  {name: 'byte_cntr_q', wave: 'x2.2................2....', data: "0 N N-1"},
+
+],
+edge: ['A-~>B', 'C-~>D', 'Z-~>A', 'Y-~>C', 'X-~>E', 'X-~>F', 'Z-~>W', 'Y-~>V', 'X-~>U'],
+config: {hscale: 1},
+head: {text: "Timing Relationship between FSM states, Milestone Signals, and Shift Register controls (with CPHA=0)"},
+foot: {text: "Key: WL=\"WaitLead\", Hi=\"InternalClkHigh\", Lo=\"InternalClkLow\" "}
+}
+```
+
+When working from a CPHA=0 configuration, the milestone signals are directly controlled by transitions in the FSM state register, as described in the following table.
+
+<table>
+<thead><tr>
+<th>Milestone Signal</th><th>FSM Triggers</th>
+</tr></thead>
+<tbody>
+<tr><td rowspan=2><tt>byte_starting</tt></td><td>Entering <tt>WaitLead</tt></td></tr>
+<tr><td>Entering <tt>InternalClkLow</tt> and <tt>bit_cntr == 0 </tt> </td></tr>
+<tr><td><tt>bit_shifting</tt></td><td>Entering <tt>InternalClkLow</tt> and <tt>bit_cntr != 0</tt></td></tr>
+<tr><td><tt>byte_ending</tt></td><td>Exiting <tt>InternalClkHigh</tt> and <tt>bit_cntr == 0</tt></td></tr>
+</tbody>
+</table>
+
+When working from a CPHA=1 configuration, the milestone signals exploit the fact that there is usually a unique correspondence between `csb`/`sck` events and FSM transitions.
+There are some exceptions to this pattern since, as discussed below, CSAAT- and multi-csb-support requires the creation of multiple flavors of idle states.
+However, there are no milestone signal pulses in any of the transitions between these various idle states.
+Thus in CPHA=1 mode, the milestone signals are delayed by one state transition.
+For example, in a CPHA=0 configuration the first data burst should be transmitted as the `csb` line is asserted low, that is, when the FSM enters the WaitLead state.
+Thus a `byte_starting` pulse is generated at this transition.
+On the other hand, in CPHA=1 configuration the first data burst should be transmitted after the first edge of `sck`, which happens on the next state transition as illustrated in the following waveform.
+
+That said, there are two copies of each milestone signal:
+- the original FSM-driven copy, for use when operating with CPHA=0, and
+- a delayed copy, for use in CPHA=1 operation.
+
+```wavejson
+{signal: [
+  {name: 'clk', wave: 'p......................'},
+  {name: 'rst_n', wave: '01.....................'},
+  {name: 'state_q',
+   wave: 'x2.2.4242424242424242.2', data: "Idle WL Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi WT WI",
+   node: '...W..V.....U..........'},
+  {name: 'state_d',
+   wave: 'x22.4242424242424242.2', data: "Idle WL Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi Lo Hi WT WI",
+   node: '..Z..Y.....X..........'},
+  {name: 'byte_starting_cpha0',
+   wave: 'x010.......10..........',
+   node: '..A........C...........'},
+  {name: 'byte_starting_cpha1',
+   wave: 'x0..10......10.........',
+   node: '....B.......D..........'},
+  {name: 'byte_ending_cpha0',
+   wave: 'x0.........10......10..',
+   node: '...........E...........'},
+  {name: 'byte_ending_cpha1',
+   wave: 'x0..........10......10.',
+   node: '............F..........'},
+  {name: 'bit_shifting_cpha0',
+   wave: 'x0...101010..101010....',
+   node: '.....G...I...K.........'},
+  {name: 'bit_shifting_cpha1',
+   wave: 'x0....101010..101010...',
+   node: '......H...J...L'},
+  {name: 'csb', wave: 'x1.0..................1'},
+  {name: 'sck', wave: '0....1010101010101010..'},
+  ["CPHA=0",
+   {name: 'byte_starting',
+    wave: 'x010.......10..........'},
+   {name: 'bit_shifting',
+    wave: 'x0...101010..101010....'},
+  {name: 'bit_cntr_q', wave: 'x2.2..2.2.2.2.2.2.2....',
+   data: "0 6 4 2 0 6 4 2 0"},
+  {name: 'byte_cntr_q', wave: 'x2.2........2..........',
+   data: "0 1 0"},
+  {name:'sd_o',
+   wave:'x0.2..2.2.2.2.2.2.2...0',
+   node:'',
+   data: "A[7:6] A[5:4] A[3:2] A[1:0] B[7:6] B[5:4] B[3:2] B[1:0]"}
+  ],
+  ["CPHA=1",
+   {name: 'byte_starting',
+    wave: 'x0..10......10.........'},
+   {name: 'bit_shifting',
+    wave: 'x0....101010..101010...'},
+   {name: 'byte_ending',
+    wave: 'x0..........10......10.'},
+  {name: 'bit_cntr_q', wave: 'x2...2.2.2.2.2.2.2.2...',
+   data: "0 6 4 2 0 6 4 2 0"},
+  {name: 'byte_cntr_q',
+   wave: 'x2.2.........2.........',
+   data: "0 1 0"},
+  {name:'sd_o',
+   wave:'x0...2.2.2.2.2.2.2.2..0',
+   node:'',
+   data: "A[7:6] A[5:4] A[3:2] A[1:0] B[7:6] B[5:4] B[3:2] B[1:0]"}
+  ],
+],
+edge: ['Z-~>A','Y-~>G', 'X-~>C', 'X-~>E','A->B', 'C->D', 'E->F', 'G->H', 'I->J', 'K->L', 'Z->W', 'Y->V', 'X->U'],
+config: {hscale: 1},
+head: {text: "Comparison of Milestone Signals in CPHA=0 vs. CPHA=1 configuration (for a dual speed segment)"},
+foot: {text: "Key: WL=\"WaitLead\", Hi=\"InternalClkHigh\", Lo=\"InternalClkLow\", WT=\"WaitTrail\""}
+}
+```
+
+### Milestone Signals and Control of the Bit and Byte Counters
+
+The previous waveform also highlights the relationship between the milestone signals and the bit and byte counters.
+At the beginning of each byte `bit_cntr_q` is reset to a speed-specific value, to trigger the correct number of shift operations required for each byte:
+- 7 for Standard-mode
+- 6 for Dual-mode
+- 4 for Quad-mode
+
+The reset of the `bit_cntr_q` counter is triggered by the `byte_starting` register.
+Meanwhile the `bit_shifting` signal triggers a decrement of the bit-shifting register.
+The size of the decrement also depends on the speed of the current segment:
+- 1 for Standard-mode
+- 2 for Dual-mode
+- 4 for Quad-mode
+
+The `byte_cntr_q` register is updated from the [`COMMAND.LEN`](../data/spi_host.hjson#command) register value, at the beginning of each segment, and decremented after each `byte_ending` pulse until the counter reaches zero.
+
+This relationship between the milestone signals and the bit and byte counters is also illustrated in the previous waveform.
+
+### Implementation of Configuration Change Delays
+
+As described in the [Theory of Operation](#idle-time-delays-when-changing-configurations), changes in configuration only occur when the SPI_HOST is idle.
+The configuration change must be preceded by enough idle time to satisfy the previous configuration, and followed by enough idle time to satisfy the new configuration.
+
+In order to support these idle time requirements, the SPI_HOST FSM has two idle waiting states.
+- The `WaitIdle` state manages the idle time requirements of the *preceding* command segment, and usually transitions to the `Idle` state afterwards.
+- From the `Idle` state the FSM monitors for changes in configuration, and transitions to the `ConfigSwitch` state if any changes are detected in the next incoming command segment.
+This state introduces delays long enough to satisfy the idle time requirements of the *following* command segment.
+From the `ConfigSwitch` state, the state machine directly enters the `WaitLead` state to start the next command segment.
+
+A complete state diagram, including the `ConfigSwitch` state, is shown in the following section.
+
+The following waveform illustrates how a change in a single [`CONFIGOPTS`](../data/spi_host.hjson#configopts), here [`CONFIGOPTS.CPOL`](../data/spi_host.hjson#configopts), triggers an entry into the `ConfigSwitch` Idle state, and how the new configuration is applied at the transition from `WaitIdle` to `ConfigSwitch` thereby ensuring ample idle time both before and after the configuration update.
+
+```wavejson
+{signal: [
+  {name: 'clk',                       wave: 'p.................'},
+  {name: 'command_i.csid',            wave: '2.................', data: ["0"]},
+  {name: 'command_i.configopts.cpol', wave: '1........x........'},
+  {name: 'cpol_q',                    wave: '0........1........'},
+  {name: 'switch_required',           wave: '1........x........'},
+  {name: 'command_valid_i',           wave: '1........0........'},
+  {name: 'command_ready_o',           wave: '0.......10........'},
+  {name: 'FSM state',                 wave: '2222..2..2..2..222', data: ["Hi", "Lo", "Hi", "WaitTrail", "WaitIdle", "ConfigSwitch", "WaitLead", "Hi", "Lo", "Hi"]},
+  {name: 'csb[0]',                    wave: '0.....1.....0.....'},
+  {name: 'sck',                       wave: '1010.....1.....010'},
+  {name: 'configuration update event', wave: '1........H........'}
+],
+  edge: ["A<->B min. 9 cycles", "C<->D min. 4 cycles"],
+  head: {text: "Extension of CSB Idle Pulse Due to CPOL Configuration Switch", tock: 1},
+  foot: { text: "(Note: Due to the presence of a valid command, the FSM transitions directly from WaitIdle to ConfigSwitch)"}
+}
+```
+
+### CSAAT Support
+
+In addition to omitting the `ConfigSwitch` state, the simplified state machine illustrated above does not take into account commands with multiple segments, where the CSAAT bit is enabled for all but the last segment.
+
+When the CSAAT bit is enabled there is no idle period between the current segment and the next, nor are the two adjoining segments separated by a Trail or Lead period.
+Usually the end of each segment is detected in the `InternalClkHigh` state, at which point, if CSAAT is disabled, the FSM transitions to the `WaitTrail` state to close out the transaction.
+However, if CSAAT is enabled the `WaitTrail` state is skipped, and the next state depends on whether there is another command segment available for processing (i.e., `command_ready_o` and `command_valid_i` are both asserted).
+
+In order to support seamless, back-to-back segments the `ConfigSwitch` state can be skipped if a new segment is already available when the previous ends, in which case the FSM transitions directly to the `InternalClkLow` state at the end of the previous segment.
+
+If there is no segment available yet, the FSM must pause and idly wait for the next command in the special `IdleCSBActive` state.
+This state serves a similar purpose to the `Idle` state since in this state the IP is doing nothing but waiting for new commands.
+It is different from the `Idle` state though in that during this state the active `csb` is held low.
+When a command segment is received in the `IdleCSBActive` state, it transitions immediately to the `InternalClkLow` state to generate the next `sck` pulse and process the next segment.
+
+```wavejson
+{signal: [
+  {name: 'clk', wave: 'p...........'},
+  {name: 'command_ready_o', wave: '0.1....0....'},
+  {name: 'command_valid_i', wave: '0.....10....'},
+  {name: 'new_command',     wave: '0.....10....'},
+  {name: 'state',           wave: '2222...22222', data: ["Hi", "Lo", "Hi", "IdleCSBActive", "Lo", "Hi", "Lo", "Hi", "Lo"]},
+  {name: 'sck (CPOL=0)',    wave: '1010....1010'},
+  {name: 'sd (CPHA=0)',     wave: '35.....3.4.5'}
+ ],
+  edge: ["A<->B min. 9 cycles", "C<->D min. 4 cycles"],
+  head: {text: "Idling While CS Active", tock: 1}
+}
+```
+
+The following figure shows the complete state transition diagram for the SPI_HOST FSM.
+
+![](../doc/spi_host_fsm_complete.svg)
+
+### Skipped idle states
+
+The `Idle` and `IdleCSBActive` states are unique from the others in that:
+1. In order to respond to an incoming command the FSM can exit these idle states at any time, regardless of the current timeslice definition.
+(In fact, since different commands may use different values for the `CLKDIV` configuration parameter, the concept of a timeslice is poorly defined when idle).
+2. These idle states may be *bypassed* in order to support more efficient transitions from one command segment to the next.
+If an incoming command is detected as the FSM is about to enter an idle state, that idle state is skipped, and the FSM immediately transitions to the next logical state, based on the contents of the new incoming command.
+
+These bypassable states, which are highlighted in the previous diagram, represent a number of possible transitions from one *pre-idle* state to a following *post-idle* state.
+For clarity such transitions are left implicit in the diagram above.
+However they could also be explicitly added to the state diagram.
+For example, the implicit transitions around the `Idle` state are shown in the following figure.
+
+![](../doc/spi_host_bypassable_state.svg)
+
+### Stall
+
+Whenever the shift register needs to transfer data in (or out) of the RX (TX) FIFOs, but they are full (or empty), the FSM immediately stalls to wait for new data.
+
+During this stall period none of the FSM internal registers are updated.
+Normal operation proceeds only when the stall condition has been resolved or the SPI_HOST has been reset.
+
+In the SPI_HOST FSM this is realized by disabling all flop updates whenever a stall is detected.
+
+Furthermore, all control signals out of the FSM are suppressed during a stall condition.
+
+From an implementation standpoint, the presence of a stall condition has two effects on the SPI_HOST FSM:
+1. No flops or registers may be updated during a stall condition.
+Thus the FSM may not progress while stalled.
+
+2. All handshaking or control signals to other blocks must be suppressed during a stall condition, placing backpressure on the rest of the blocks within the IP to also stop operations until the stall is resolved.
diff --git a/hw/ip/sram_ctrl/README.md b/hw/ip/sram_ctrl/README.md
index 087917bf998c2..a63789a86137d 100644
--- a/hw/ip/sram_ctrl/README.md
+++ b/hw/ip/sram_ctrl/README.md
@@ -18,218 +18,3 @@ The SRAM controller contains the SRAM data and address scrambling device and pro
 - LFSR-based memory initialization feature.
 - Access controls to allow / disallow code execution from SRAM.
 - Security hardening when integrity error has been detected.
-
-# Theory of Operations
-
-## Block Diagram
-
-![SRAM Controller Block Diagram](./doc/sram_ctrl_blockdiag.svg)
-
-As shown in the block diagram above, the SRAM controller contains a TL-UL adapter, an initialization LFSR, the CSR node, key request logic and an instance of `prim_ram_1p_scr` that implements the actual scrambling mechanism.
-
-The SRAM controller supports the system-wide end-to-end bus integrity scheme and thus stores the data integrity bits alongside each data word in the memory.
-I.e., this means that both the 32 data bits and 7 integrity bits are passed through the scrambling device.
-
-Sub-word write operations therefore perform a read-modify-write operation in order to ensure consistency of the integrity bits.
-Hence, the throughput of sub-word write operations is three times lower than for full-word write operations.
-Note however that the throughput of read operations is the same for full- and sub-word read operations.
-
-The scrambling mechanism is always enabled and the `sram_ctrl` provides the scrambling device with a predefined scrambling key and nonce when it comes out of reset.
-It is the task of SW to request an updated scrambling key and nonce via the CSRs as described in the [Programmer's Guide](#programmers-guide) below.
-
-For SW convenience, the SRAM controller also provides an LFSR-based memory initialization feature that can overwrite the entire memory with pseudorandom data.
-Similarly to the scrambling key, it is the task of of SW to request memory initialization via the CSRs as described in the [Programmer's Guide](#programmers-guide) below.
-
-Note that TL-UL accesses to the memory that occur while a key request or hardware initialization is pending will be blocked until the request has completed.
-
-The individual mechanisms are explained in more detail in the subsections below.
-
-## Hardware Interfaces
-
-### Parameters
-
-The following table lists the instantiation parameters of the SRAM controller.
-
-Parameter                   | Default               | Top Earlgrey      | Description
-----------------------------|-----------------------|-------------------|---------------
-`AlertAsyncOn`              | 1'b1                  | 1'b1              |
-`InstrExec`                 | 1                     | 1                 | Enables the execute from SRAM feature.
-`MemSizeRam`                | 4096                  | (multiple values) | Number of 32bit words in the SRAM (can be overridden by `topgen`).
-`RndCnstSramKey`            | (see RTL)             | (see RTL)         | Compile-time random default constant for scrambling key.
-`RndCnstSramNonce`          | (see RTL)             | (see RTL)         | Compile-time random default constant for scrambling nonce.
-`RndCnstLfsrSeed`           | (see RTL)             | (see RTL)         | Compile-time random default constant for LFSR seed.
-`RndCnstLfsrPerm`           | (see RTL)             | (see RTL)         | Compile-time random default constant for LFSR permutation.
-
-### Signals
-
-* [Interface Tables](data/sram_ctrl.hjson#interfaces)
-
-The table below lists other SRAM controller signals.
-
-Signal                     | Direction        | Type                               | Description
----------------------------|------------------|------------------------------------|---------------
-`lc_hw_debug_en_i`         | `input`          | `lc_ctrl_pkg::lc_tx_t`             | Multibit life cycle hardware debug enable signal coming from life cycle controller, asserted when the hardware debug mechanisms are enabled in the system.
-`lc_escalate_en_i`         | `input`          | `lc_ctrl_pkg::lc_tx_t`             | Multibit life cycle escalation enable signal coming from life cycle controller, asserted if an escalation has occurred.
-`sram_otp_key_o`           | `output`         | `otp_ctrl_pkg::sram_otp_key_req_t` | Key derivation request going to the key derivation interface of the OTP controller.
-`sram_otp_key_i`           | `input`          | `otp_ctrl_pkg::sram_otp_key_rsp_t` | Ephemeral scrambling key coming back from the key derivation interface of the OTP controller.
-`otp_en_sram_ifetch_i`     | `input`          | `otp_ctrl_pkg::mubi8_t`            | Multibit value coming from the OTP HW_CFG partition ([EN_SRAM_IFETCH](../otp_ctrl/README.md#direct-access-memory-map)), set to kMuBi8True in order to enable the [`EXEC`](data/sram_ctrl.hjson#exec) CSR.
-`cfg_i`                    | `input`          | `logic [CfgWidth-1:0]`             | Attributes for physical memory macro.
-
-#### Interfaces to OTP and the SRAM Scrambling Primitive
-
-The interface to the key derivation interface inside the OTP controller follows a simple req / ack protocol, where the SRAM controller first requests an updated ephemeral key by asserting the `sram_otp_key_i.req`.
-The OTP controller then fetches entropy from CSRNG and derives an ephemeral key using the SRAM_DATA_KEY_SEED and the PRESENT scrambling data path as described in the [OTP controller spec](../otp_ctrl/README.md#scrambling-datapath).
-Finally, the OTP controller returns a fresh ephemeral key via the response channels (`sram_otp_key_o[*]`, `otbn_otp_key_o`), which complete the req / ack handshake.
-The key and nonce are made available to the scrambling primitive in the subsequent cycle.
-The wave diagram below illustrates this process.
-
-```wavejson
-{signal: [
-  {name: 'clk_otp_i',                 wave: 'p...........'},
-  {name: 'sram_otp_key_o.req',        wave: '0.|1.|..0|..'},
-  {name: 'sram_otp_key_i.ack',        wave: '0.|..|.10|..'},
-  {name: 'sram_otp_key_i.nonce',      wave: '0.|..|.30|..'},
-  {name: 'sram_otp_key_i.key',        wave: '0.|..|.30|..'},
-  {name: 'sram_otp_key_i.seed_valid', wave: '0.|..|.10|..'},
-  {},
-  {name: 'clk_i',                     wave: 'p...........'},
-  {name: 'key_valid_q',               wave: '10|..|...|1.'},
-  {name: 'key_q',                     wave: '4.|..|...|3.'},
-  {name: 'nonce_q',                   wave: '4.|..|...|3.'},
-  {name: 'key_seed_valid_q',          wave: '4.|..|...|3.'},
-]}
-```
-
-If the key seeds have not yet been provisioned in OTP, the keys are derived from all-zero constants, and the `*.seed_valid` signal will be set to 0 in the response.
-It should be noted that this mechanism requires the CSRNG and entropy distribution network to be operational, and a key derivation request will block if they are not.
-
-Note that the req/ack protocol runs on `clk_otp_i`.
-The SRAM controller synchronizes the data over via a req/ack handshake primitive `prim_sync_reqack.sv` primitive as shown below.
-
-![OTP Key Req Ack](../otp_ctrl/doc/otp_ctrl_key_req_ack.svg)
-
-Note that the key and nonce output signals on the OTP controller side are guaranteed to remain stable for at least 62 OTP clock cycles after the `ack` signal is pulsed high, because the derivation of a 64bit half-key takes at least two passes through the 31-cycle PRESENT primitive.
-Hence, if the SRAM controller clock `clk_i` is faster or in the same order of magnitude as `clk_otp_i`, the data can be directly sampled upon assertion of `src_ack_o`.
-If the SRAM controller runs on a significantly slower clock than OTP, an additional register (as indicated with dashed grey lines in the figure) has to be added.
-
-#### Global and Local Escalation
-
-If `lc_escalate_en_i` is set to any different value than `lc_ctrl_pkg::Off`, the current scrambling keys are discarded and reset to `RndCnstSramKey` and `RndCnstSramNonce` in the subsequent cycle.
-Any subsequent memory request to `prim_ram_1p_scr` will then be blocked as well.
-This mechanism is part of the [life cycle](../lc_ctrl/README.md) state scrapping and secret wiping countermeasure triggered by the alert handler (global escalation).
-
-Note that if any local bus integrity or counter errors are detected, the SRAM controller will locally escalate without assertion of `lc_escalate_en_i`.
-The behavior of local escalation is identical to global escalation via `lc_escalate_en_i`.
-
-## Scrambling Primitive
-
-As explained in [`prim_ram_1p_scr`](../prim/doc/prim_ram_1p_scr.md) the scrambling mechanism employs a reduced-round PRINCE block cipher in CTR mode to scramble the data.
-Since plain CTR mode does not diffuse the data bits due to the bitwise XOR, the scheme is augmented by passing each word through a shallow substitution-permutation (S&P) network implemented with the `prim_subst_perm` primitive.
-The S&P network employed is similar to the one employed in PRESENT and is explained in more detail [here](../prim/doc/prim_ram_1p_scr.md#custom-substitution-permutation-network).
-
-Another CTR mode augmentation that is aimed at breaking the linear address space is SRAM address scrambling.
-The same S&P network construction that is used for intra-word diffusion is leveraged to non-linearly remap the SRAM address as shown in the block diagram above.
-
-### Integrity Error Handling
-
-When an integrity error is encountered, the `sram_ctrl` will latch the integrity error send out a `fatal_bus_integ_error` until the next reset (the generation of the integrity error is determined by system integration).
-In addition, the latched error condition is fed into the `prim_ram_1p_scr` primitive via a dedicated input, causing the scrambling primitive to do the following:
-*  Reverse the nonce used during the address and CTR scrambling.
-*  Disallow any transaction (read or write) on the actual memory macro.
-
-This behavior, combined with other top level defenses, form a multi-layered defense when integrity errors are seen in the system.
-
-### LFSR Initialization Feature
-
-Since the scrambling device uses a block cipher in CTR mode, it is undesirable to initialize the memory with all-zeros from a security perspective, as that would reveal the XOR keystream.
-To this end, the `sram_ctrl` contains an LFSR-based initialization mechanism that overwrites the the entire memory with pseudorandom data.
-
-Initialization can be triggered via the [`CTRL.INIT`](data/sram_ctrl.hjson#ctrl) CSR, and once triggered, the LFSR is first re-seeded with the nonce that has been fetched together with the scrambling key.
-Then, the memory is initialized with pseudorandom data pulled from the LFSR.
-For each pseudorandom 32bit word, the initialization mechanism computes the corresponding integrity bits and writes both the data and integrity bits (39bit total) through the scrambling device using the most recently obtained scrambling key.
-
-If SW triggers the scrambling key update and LFSR initialization at the same time (i.e., with the same CSR write operation), the LFSR initialization will be stalled until an updated scrambling key has been obtained.
-
-There is no limit on how often the initialization feature can be called, and hence it can also be used as a cheap SRAM wiping mechanism at runtime.
-Note however that the PRNG sequence does not have strong security guarantees, since it is produced using an LFSR.
-
-### Code Execution from SRAM
-
-The SRAM controller contains an access control mechanism for filtering instruction fetches from the processor.
-As illustrated below, an OTP switch EN_SRAM_IFETCH (see [OTP memory map](../otp_ctrl/README.md#direct-access-memory-map)) allows to either tie code execution from SRAM to the life cycle state via the HW_DEBUG_EN function (see [life cycle docs](../lc_ctrl/README.md#hw_debug_en)), or it can be enabled / disabled via the [`EXEC`](data/sram_ctrl.hjson#exec) CSR.
-
-![SRAM Code Execution](./doc/sram_ctrl_sram_execution.svg)
-
-The different configuration options are listed in the table below:
-
-
- EN_SRAM_IFETCH (OTP)   | HW_DEBUG_EN (Life Cycle) | EXEC CSR               | Execution Enabled
-------------------------|--------------------------|------------------------|--------------------
- == kMultiBitBool8True  | -                        | == kMultiBitBool4True  | Yes
- == kMultiBitBool8True  | -                        | != kMultiBitBool4True  | No
- != kMultiBitBool8True  | ON                       | -                      | Yes
- != kMultiBitBool8True  | OFF                      | -                      | No
-
-Note that the execute from SRAM feature may only be enabled on certain SRAM controller instances in the top-level design.
-If the feature is turned off via the `InstrExec` parameter, the execute from SRAM feature is permanently disabled, and the status of the OTP switch, the life cycle state and the value of the EXEC register are irrelevant.
-
-As an example, the `top_earlgrey` design only enables this feature on the main SRAM, and permanently disables it on the retention SRAM.
-
-### Read and Write Sequencing
-
-For timing reasons, the scrambling primitive instantiates a register halfway in the PRINCE block cipher.
-This means that the keystream block becomes available in the second request cycle, which naturally aligns with read operations since the SRAM memory latency is 1 clock cycle.
-
-However, write operations have to be deferred by 1 cycle in order to be able to reuse the same PRINCE primitive.
-This can lead to read/write conflicts when a write operation is immediately followed by a read operation, and we solve that issue by introducing two write data holding registers (highlighted with green and orange in the block diagram above).
-The register highlighted with green is the unscrambled data holding register, which is used for forwarding unwritten write data in case the conflicting read operation goes to the same address as the pending write operation.
-The register highlighted with orange is the scrambled data holding register, which holds the scrambled data until the conflicting read operation(s) have completed.
-
-Note that this arrangement still allows full read/write throughput as illustrated in the alternating R/W sequence below.
-
-![SRAM Controller Sequencing](./doc/sram_ctrl_sequencing.svg)
-
-However, due to the end-to-end bus integrity scheme, sub-word write accesses currently require a read-modify-write operation in order to recompute the integrity bits for the entire word, as illustrated in the diagram below.
-
-![SRAM Controller Sub-word Write](./doc/sram_ctrl_sub_word_write.svg)
-
-Sub-word write accesses are therefore 3x slower than full-word write accesses.
-Read accesses however always take 1 cycle, no matter whether the access is a full-word or sub-word read operation.
-
-Note that this has been implemented in this way to not overly complicate the design, and since it is assumed that sub-word write operations happen relatively infrequently.
-For full write throughput, a more elaborate write buffering scheme would be required.
-
-# Programmer's Guide
-
-## Initialization
-
-The memory inside the SRAM controller can be used right away after a system reset.
-However, since the scrambling key defaults to a predefined value, it is recommended that SW performs the following initialization steps as early in the boot process as possible.
-
-1. Request an updated ephemeral scrambling key from OTP by writing 0x1 to [`CTRL.RENEW_SCR_KEY`](data/sram_ctrl.hjson#ctrl).
-   SW should spin on [`STATUS.SCR_KEY_VALID`](data/sram_ctrl.hjson#status) to wait until the new key has been obtained.
-   While this is not strictly needed since memory accesses to the SRAM will be stalled until the updated key has been obtained, the PC value upon a watchdog crash will be more informative when using a spin wait.
-
-2. (optional) Initialize the memory with pseudo random data by writing 0x1 to [`CTRL.INIT`](data/sram_ctrl.hjson#ctrl)
-   SW should spin on [`STATUS.INIT_DONE`](data/sram_ctrl.hjson#status) to wait until the memory has been initialized.
-   While this is not strictly needed since memory accesses to the SRAM will be stalled until the initialization is done, the PC value upon a watchdog crash will be more informative when using a spin wait.
-
-3. (optional) Check the [`STATUS.SCR_KEY_SEED_VALID`](data/sram_ctrl.hjson#status) bit:
-    - In case the scrambling key seeds have been fully provisioned to OTP, this bit should be set to 0x1. A value of 0x0 indicates that the OTP could be malfunctioning or has been tampered with.
-    - If the scrambling seeds have not yet been provisioned to OTP, this bit is set to 0x0. The scrambling key will in that case still be ephemeral, but the key seed mixed in as part of the key derivation process will be set to a predefined netlist constant.
-
-4. (optional) Lock down write access to [`CTRL`](data/sram_ctrl.hjson#ctrl) by writing to [`CTRL_REGWEN`](data/sram_ctrl.hjson#ctrl_regwen) if future key renewals and initializations should be disallowed until the next system reset.
-
-Note that before (re-)requesting an updated SRAM key it is imperative to make sure that:
-- The memory contents are not needed anymore. Requesting a key implicitly wipes all data in the SRAM.
-- The CSRNG and the entropy distribution network have been initialized. The key derivation mechanism in OTP needs to request a chunk of fresh entropy, and that request will block until the entropy distribution network responds.
-
-It should also be noted that data and address scrambling is never entirely disabled - even when the default scrambling key is used.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_sram_ctrl.h)
-
-## Register Table
-
-* [Register Table](data/sram_ctrl.hjson#registers)
diff --git a/hw/ip/sram_ctrl/doc/programmers_guide.md b/hw/ip/sram_ctrl/doc/programmers_guide.md
new file mode 100644
index 0000000000000..ddf20b45feff6
--- /dev/null
+++ b/hw/ip/sram_ctrl/doc/programmers_guide.md
@@ -0,0 +1,34 @@
+# Programmer's Guide
+
+## Initialization
+
+The memory inside the SRAM controller can be used right away after a system reset.
+However, since the scrambling key defaults to a predefined value, it is recommended that SW performs the following initialization steps as early in the boot process as possible.
+
+1. Request an updated ephemeral scrambling key from OTP by writing 0x1 to [`CTRL.RENEW_SCR_KEY`](../data/sram_ctrl.hjson#ctrl).
+   SW should spin on [`STATUS.SCR_KEY_VALID`](../data/sram_ctrl.hjson#status) to wait until the new key has been obtained.
+   While this is not strictly needed since memory accesses to the SRAM will be stalled until the updated key has been obtained, the PC value upon a watchdog crash will be more informative when using a spin wait.
+
+2. (optional) Initialize the memory with pseudo random data by writing 0x1 to [`CTRL.INIT`](../data/sram_ctrl.hjson#ctrl).
+   SW should spin on [`STATUS.INIT_DONE`](../data/sram_ctrl.hjson#status) to wait until the memory has been initialized.
+   While this is not strictly needed since memory accesses to the SRAM will be stalled until the initialization is done, the PC value upon a watchdog crash will be more informative when using a spin wait.
+
+3. (optional) Check the [`STATUS.SCR_KEY_SEED_VALID`](../data/sram_ctrl.hjson#status) bit:
+    - In case the scrambling key seeds have been fully provisioned to OTP, this bit should be set to 0x1. A value of 0x0 indicates that the OTP could be malfunctioning or has been tampered with.
+    - If the scrambling seeds have not yet been provisioned to OTP, this bit is set to 0x0. The scrambling key will in that case still be ephemeral, but the key seed mixed in as part of the key derivation process will be set to a predefined netlist constant.
+
+4. (optional) Lock down write access to [`CTRL`](../data/sram_ctrl.hjson#ctrl) by writing to [`CTRL_REGWEN`](../data/sram_ctrl.hjson#ctrl_regwen) if future key renewals and initializations should be disallowed until the next system reset.
+
+Note that before (re-)requesting an updated SRAM key it is imperative to make sure that:
+- The memory contents are not needed anymore. Requesting a key implicitly wipes all data in the SRAM.
+- The CSRNG and the entropy distribution network have been initialized. The key derivation mechanism in OTP needs to request a chunk of fresh entropy, and that request will block until the entropy distribution network responds.
+
+It should also be noted that data and address scrambling is never entirely disabled - even when the default scrambling key is used.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_sram_ctrl.h)
+
+## Register Table
+
+* [Register Table](../data/sram_ctrl.hjson#registers)
diff --git a/hw/ip/sram_ctrl/doc/theory_of_operation.md b/hw/ip/sram_ctrl/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..86bc4b4f8f4e5
--- /dev/null
+++ b/hw/ip/sram_ctrl/doc/theory_of_operation.md
@@ -0,0 +1,179 @@
+# Theory of Operation
+
+## Block Diagram
+
+![SRAM Controller Block Diagram](../doc/sram_ctrl_blockdiag.svg)
+
+As shown in the block diagram above, the SRAM controller contains a TL-UL adapter, an initialization LFSR, the CSR node, key request logic and an instance of `prim_ram_1p_scr` that implements the actual scrambling mechanism.
+
+The SRAM controller supports the system-wide end-to-end bus integrity scheme and thus stores the data integrity bits alongside each data word in the memory.
+This means that both the 32 data bits and 7 integrity bits are passed through the scrambling device.
+
+Sub-word write operations therefore perform a read-modify-write operation in order to ensure consistency of the integrity bits.
+Hence, the throughput of sub-word write operations is three times lower than for full-word write operations.
+Note however that the throughput of read operations is the same for full- and sub-word read operations.
+
+The scrambling mechanism is always enabled and the `sram_ctrl` provides the scrambling device with a predefined scrambling key and nonce when it comes out of reset.
+It is the task of SW to request an updated scrambling key and nonce via the CSRs as described in the [Programmer's Guide](programmers_guide.md).
+
+For SW convenience, the SRAM controller also provides an LFSR-based memory initialization feature that can overwrite the entire memory with pseudorandom data.
+Similarly to the scrambling key, it is the task of SW to request memory initialization via the CSRs as described in the [Programmer's Guide](programmers_guide.md).
+
+Note that TL-UL accesses to the memory that occur while a key request or hardware initialization is pending will be blocked until the request has completed.
+
+The individual mechanisms are explained in more detail in the subsections below.
+
+## Hardware Interfaces
+
+### Parameters
+
+The following table lists the instantiation parameters of the SRAM controller.
+
+Parameter                   | Default               | Top Earlgrey      | Description
+----------------------------|-----------------------|-------------------|---------------
+`AlertAsyncOn`              | 1'b1                  | 1'b1              |
+`InstrExec`                 | 1                     | 1                 | Enables the execute from SRAM feature.
+`MemSizeRam`                | 4096                  | (multiple values) | Number of 32bit words in the SRAM (can be overridden by `topgen`).
+`RndCnstSramKey`            | (see RTL)             | (see RTL)         | Compile-time random default constant for scrambling key.
+`RndCnstSramNonce`          | (see RTL)             | (see RTL)         | Compile-time random default constant for scrambling nonce.
+`RndCnstLfsrSeed`           | (see RTL)             | (see RTL)         | Compile-time random default constant for LFSR seed.
+`RndCnstLfsrPerm`           | (see RTL)             | (see RTL)         | Compile-time random default constant for LFSR permutation.
+
+### Signals
+
+* [Interface Tables](../data/sram_ctrl.hjson#interfaces)
+
+The table below lists other SRAM controller signals.
+
+Signal                     | Direction        | Type                               | Description
+---------------------------|------------------|------------------------------------|---------------
+`lc_hw_debug_en_i`         | `input`          | `lc_ctrl_pkg::lc_tx_t`             | Multibit life cycle hardware debug enable signal coming from life cycle controller, asserted when the hardware debug mechanisms are enabled in the system.
+`lc_escalate_en_i`         | `input`          | `lc_ctrl_pkg::lc_tx_t`             | Multibit life cycle escalation enable signal coming from life cycle controller, asserted if an escalation has occurred.
+`sram_otp_key_o`           | `output`         | `otp_ctrl_pkg::sram_otp_key_req_t` | Key derivation request going to the key derivation interface of the OTP controller.
+`sram_otp_key_i`           | `input`          | `otp_ctrl_pkg::sram_otp_key_rsp_t` | Ephemeral scrambling key coming back from the key derivation interface of the OTP controller.
+`otp_en_sram_ifetch_i`     | `input`          | `otp_ctrl_pkg::mubi8_t`            | Multibit value coming from the OTP HW_CFG partition ([EN_SRAM_IFETCH](../../otp_ctrl/README.md#direct-access-memory-map)), set to kMuBi8True in order to enable the [`EXEC`](../data/sram_ctrl.hjson#exec) CSR.
+`cfg_i`                    | `input`          | `logic [CfgWidth-1:0]`             | Attributes for physical memory macro.
+
+#### Interfaces to OTP and the SRAM Scrambling Primitive
+
+The interface to the key derivation interface inside the OTP controller follows a simple req / ack protocol, where the SRAM controller first requests an updated ephemeral key by asserting `sram_otp_key_o.req`.
+The OTP controller then fetches entropy from CSRNG and derives an ephemeral key using the SRAM_DATA_KEY_SEED and the PRESENT scrambling data path as described in the [OTP controller spec](../../otp_ctrl/README.md#scrambling-datapath).
+Finally, the OTP controller returns a fresh ephemeral key via the response channel (`sram_otp_key_i`), which completes the req / ack handshake.
+The key and nonce are made available to the scrambling primitive in the subsequent cycle.
+The wave diagram below illustrates this process.
+
+```wavejson
+{signal: [
+  {name: 'clk_otp_i',                 wave: 'p...........'},
+  {name: 'sram_otp_key_o.req',        wave: '0.|1.|..0|..'},
+  {name: 'sram_otp_key_i.ack',        wave: '0.|..|.10|..'},
+  {name: 'sram_otp_key_i.nonce',      wave: '0.|..|.30|..'},
+  {name: 'sram_otp_key_i.key',        wave: '0.|..|.30|..'},
+  {name: 'sram_otp_key_i.seed_valid', wave: '0.|..|.10|..'},
+  {},
+  {name: 'clk_i',                     wave: 'p...........'},
+  {name: 'key_valid_q',               wave: '10|..|...|1.'},
+  {name: 'key_q',                     wave: '4.|..|...|3.'},
+  {name: 'nonce_q',                   wave: '4.|..|...|3.'},
+  {name: 'key_seed_valid_q',          wave: '4.|..|...|3.'},
+]}
+```
+
+If the key seeds have not yet been provisioned in OTP, the keys are derived from all-zero constants, and the `*.seed_valid` signal will be set to 0 in the response.
+It should be noted that this mechanism requires the CSRNG and entropy distribution network to be operational, and a key derivation request will block if they are not.
+
+Note that the req/ack protocol runs on `clk_otp_i`.
+The SRAM controller synchronizes the data over via the req/ack handshake primitive `prim_sync_reqack.sv` as shown below.
+
+![OTP Key Req Ack](../../otp_ctrl/doc/otp_ctrl_key_req_ack.svg)
+
+Note that the key and nonce output signals on the OTP controller side are guaranteed to remain stable for at least 62 OTP clock cycles after the `ack` signal is pulsed high, because the derivation of a 64bit half-key takes at least two passes through the 31-cycle PRESENT primitive.
+Hence, if the SRAM controller clock `clk_i` is faster than or in the same order of magnitude as `clk_otp_i`, the data can be directly sampled upon assertion of `src_ack_o`.
+If the SRAM controller runs on a significantly slower clock than OTP, an additional register (as indicated with dashed grey lines in the figure) has to be added.
+
+#### Global and Local Escalation
+
+If `lc_escalate_en_i` is set to any different value than `lc_ctrl_pkg::Off`, the current scrambling keys are discarded and reset to `RndCnstSramKey` and `RndCnstSramNonce` in the subsequent cycle.
+Any subsequent memory request to `prim_ram_1p_scr` will then be blocked as well.
+This mechanism is part of the [life cycle](../../lc_ctrl/README.md) state scrapping and secret wiping countermeasure triggered by the alert handler (global escalation).
+
+Note that if any local bus integrity or counter errors are detected, the SRAM controller will locally escalate without assertion of `lc_escalate_en_i`.
+The behavior of local escalation is identical to global escalation via `lc_escalate_en_i`.
+
+## Scrambling Primitive
+
+As explained in [`prim_ram_1p_scr`](../../prim/doc/prim_ram_1p_scr.md) the scrambling mechanism employs a reduced-round PRINCE block cipher in CTR mode to scramble the data.
+Since plain CTR mode does not diffuse the data bits due to the bitwise XOR, the scheme is augmented by passing each word through a shallow substitution-permutation (S&P) network implemented with the `prim_subst_perm` primitive.
+The S&P network employed is similar to the one employed in PRESENT and is explained in more detail [here](../../prim/doc/prim_ram_1p_scr.md#custom-substitution-permutation-network).
+
+Another CTR mode augmentation that is aimed at breaking the linear address space is SRAM address scrambling.
+The same S&P network construction that is used for intra-word diffusion is leveraged to non-linearly remap the SRAM address as shown in the block diagram above.
+
+### Integrity Error Handling
+
+When an integrity error is encountered, the `sram_ctrl` will latch the integrity error and send out a `fatal_bus_integ_error` until the next reset (the generation of the integrity error is determined by system integration).
+In addition, the latched error condition is fed into the `prim_ram_1p_scr` primitive via a dedicated input, causing the scrambling primitive to do the following:
+*  Reverse the nonce used during the address and CTR scrambling.
+*  Disallow any transaction (read or write) on the actual memory macro.
+
+This behavior, combined with other top level defenses, forms a multi-layered defense when integrity errors are seen in the system.
+
+### LFSR Initialization Feature
+
+Since the scrambling device uses a block cipher in CTR mode, it is undesirable to initialize the memory with all-zeros from a security perspective, as that would reveal the XOR keystream.
+To this end, the `sram_ctrl` contains an LFSR-based initialization mechanism that overwrites the entire memory with pseudorandom data.
+
+Initialization can be triggered via the [`CTRL.INIT`](../data/sram_ctrl.hjson#ctrl) CSR, and once triggered, the LFSR is first re-seeded with the nonce that has been fetched together with the scrambling key.
+Then, the memory is initialized with pseudorandom data pulled from the LFSR.
+For each pseudorandom 32bit word, the initialization mechanism computes the corresponding integrity bits and writes both the data and integrity bits (39bit total) through the scrambling device using the most recently obtained scrambling key.
+
+If SW triggers the scrambling key update and LFSR initialization at the same time (i.e., with the same CSR write operation), the LFSR initialization will be stalled until an updated scrambling key has been obtained.
+
+There is no limit on how often the initialization feature can be called, and hence it can also be used as a cheap SRAM wiping mechanism at runtime.
+Note however that the PRNG sequence does not have strong security guarantees, since it is produced using an LFSR.
+
+### Code Execution from SRAM
+
+The SRAM controller contains an access control mechanism for filtering instruction fetches from the processor.
+As illustrated below, an OTP switch EN_SRAM_IFETCH (see [OTP memory map](../../otp_ctrl/README.md#direct-access-memory-map)) selects whether code execution from SRAM is tied to the life cycle state via the HW_DEBUG_EN function (see [life cycle docs](../../lc_ctrl/README.md#hw_debug_en)), or whether it can be enabled / disabled via the [`EXEC`](../data/sram_ctrl.hjson#exec) CSR.
+
+![SRAM Code Execution](../doc/sram_ctrl_sram_execution.svg)
+
+The different configuration options are listed in the table below:
+
+
+ EN_SRAM_IFETCH (OTP)   | HW_DEBUG_EN (Life Cycle) | EXEC CSR               | Execution Enabled
+------------------------|--------------------------|------------------------|--------------------
+ == kMultiBitBool8True  | -                        | == kMultiBitBool4True  | Yes
+ == kMultiBitBool8True  | -                        | != kMultiBitBool4True  | No
+ != kMultiBitBool8True  | ON                       | -                      | Yes
+ != kMultiBitBool8True  | OFF                      | -                      | No
+
+Note that the execute from SRAM feature may only be enabled on certain SRAM controller instances in the top-level design.
+If the feature is turned off via the `InstrExec` parameter, the execute from SRAM feature is permanently disabled, and the status of the OTP switch, the life cycle state and the value of the EXEC register are irrelevant.
+
+As an example, the `top_earlgrey` design only enables this feature on the main SRAM, and permanently disables it on the retention SRAM.
+
+### Read and Write Sequencing
+
+For timing reasons, the scrambling primitive instantiates a register halfway in the PRINCE block cipher.
+This means that the keystream block becomes available in the second request cycle, which naturally aligns with read operations since the SRAM memory latency is 1 clock cycle.
+
+However, write operations have to be deferred by 1 cycle in order to be able to reuse the same PRINCE primitive.
+This can lead to read/write conflicts when a write operation is immediately followed by a read operation, and we solve that issue by introducing two write data holding registers (highlighted with green and orange in the block diagram above).
+The register highlighted with green is the unscrambled data holding register, which is used for forwarding unwritten write data in case the conflicting read operation goes to the same address as the pending write operation.
+The register highlighted with orange is the scrambled data holding register, which holds the scrambled data until the conflicting read operation(s) have completed.
+
+Note that this arrangement still allows full read/write throughput as illustrated in the alternating R/W sequence below.
+
+![SRAM Controller Sequencing](../doc/sram_ctrl_sequencing.svg)
+
+However, due to the end-to-end bus integrity scheme, sub-word write accesses currently require a read-modify-write operation in order to recompute the integrity bits for the entire word, as illustrated in the diagram below.
+
+![SRAM Controller Sub-word Write](../doc/sram_ctrl_sub_word_write.svg)
+
+Sub-word write accesses are therefore 3x slower than full-word write accesses.
+Read accesses however always take 1 cycle, no matter whether the access is a full-word or sub-word read operation.
+
+Note that this has been implemented in this way to not overly complicate the design, and since it is assumed that sub-word write operations happen relatively infrequently.
+For full write throughput, a more elaborate write buffering scheme would be required.
diff --git a/hw/ip/uart/README.md b/hw/ip/uart/README.md
index 288bc11fde346..960fe363bfa62 100644
--- a/hw/ip/uart/README.md
+++ b/hw/ip/uart/README.md
@@ -35,454 +35,3 @@ to 1Mbps.
 
 The OpenTitan UART is feature compatible to a specific implementation in [Chromium EC](https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/master/chip/g/uart.c).
 Additional features such as parity have been added.
-
-# Theory of Operations
-
-## Block Diagram
-
-![UART Block Diagram](./doc/block_diagram.svg)
-
-## Hardware Interfaces
-
-* [Interface Tables](data/uart.hjson#interfaces)
-
-## Design Details
-
-### Serial interface (both directions)
-
-The TX/RX serial lines are high when idle. Data starts with a START bit (high
-idle state deasserts, **1**-->**0**) followed by 8 data bits. The least
-significant bit is sent first. If the parity feature is turned on then an odd or
-even parity bit follows after the data bits. Finally a STOP (**1**) bit
-completes one byte of data transfer.
-
-```wavejson
-{
-  signal: [
-    { name: 'Baud Clock',     wave: 'p............'                                                        },
-    { name: 'tx',             wave: '10333333331..', data: [ "lsb", "", "", "", "", "", "", "msb" ]        },
-    { name: 'Baud Clock',     wave: 'p............'                                                        },
-    { name: 'tx (w/ parity)', wave: '103333333341.', data: [ "lsb", "", "", "", "", "", "", "msb", "par" ] },
-  ],
-  head: {
-    text: 'Serial Transmission Frame',
-  },
-  foot: {
-    text: 'start bit ("0") at cycle -1, stop bit ("1") at cycle 8, or after parity bit',
-    tock: -2
-  },
-  foot: {
-    text: [
-      'tspan',
-        ['tspan', 'start bit '],
-        ['tspan', {class:'info h4'}, '0'],
-        ['tspan', ' at cycle -1, stop bit '],
-        ['tspan', {class:'info h4'}, '1'],
-        ['tspan', ' at cycle 8, or at cycle 9 after parity bit'],
-      ],
-    tock: -2,
-  }
-}
-```
-
-### Transmission
-
-A write to [`WDATA`](data/uart.hjson#wdata) enqueues a data byte into the 32 byte deep write FIFO, which
-triggers the transmit module to start UART TX serial data transfer. The TX
-module dequeues the byte from the FIFO and shifts it bit by bit out to the UART
-TX pin on positive edges of the baud clock.
-
-If TX is not enabled, written DATA into FIFO will be stacked up and sent out
-when TX is enabled.
-
-When the FIFO becomes empty as part of transmission, a TX FIFO empty interrupt will be raised.
-This is separate from the TX FIFO water mark interrupt.
-
-
-### Reception
-
-The RX module oversamples the RX input pin at 16x the requested
-baud clock. When the input is detected low the receiver will check
-half a bit-time later (i.e. 8 cycles of the oversample clock) that the
-line is still low before detecting the START bit. If the line has
-returned high the glitch is ignored. After it detects the START bit,
-the RX module samples at the center of each bit-time and gathers
-incoming serial bits into a character buffer. If the STOP bit is
-detected as high and the optional parity bit is correct the data byte
-is pushed into a 32 byte deep RX FIFO. The data can be read out by
-reading [`RDATA`](data/uart.hjson#rdata) register.
-
-This behaviour of the receiver can be used to compute the approximate
-baud clock frequency error that can be tolerated between the
-transmitter at the other end of the cable and the receiver. The
-initial sample point is aligned with the center of the START bit. The
-receiver will then sample every 16 cycles of the 16 x baud clock, the
-diagram below shows the number of ticks after the centering that each
-bit is captured. Because of the frequency difference between the
-transmitter and receiver the actual sample point will drift compared to
-the ideal center of the bit. In order to correctly receive the STOP
-bit it must be sampled between the "early" and "late" points shown
-on the diagram, which are half a bit-time or 8 ticks of the 16x baud
-clock before or after the center. If the transmitter is considered
-"ideal" then the local clock must thus differ by no more than plus or
-minus 8 ticks in 144 or approximately +/- 5.5%. If parity is enabled
-the stop bit will be a bit time later, so this becomes 8/160 or about
-+/- 5%.
-
-```wavejson
-{
-  signal: [
-    { name: 'Sample', wave: '', node: '..P............', period: "2" },
-    {},
-    { name: 'rx',
-      wave: '1.0.3.3.3.3.3.3.3.3.1.0.3',
-      node: '...A................C.D..',
-      cdata: [ "idle", "start", "+16", "+32", "+48", "+64", "+80",
-                "+96", "+112", "+128", "+144", "next start" ] },
-  ],
-    "edge"   : ["P-|>A center", "P-|>C early", "P-|>D late"],
-  head: {
-    text: 'Receiver sampling window',
-  },
-}
-```
-
-In practice, the transmitter and receiver will both differ from the
-ideal baud rate. Since the worst case difference for reception is 5%,
-the uart can be expected to work if both sides are within +/- 2.5% of
-the ideal baud rate.
-
-### Setting the baud rate
-
-The baud rate is set by writing to the [`CTRL.NCO`](data/uart.hjson#ctrl) register field. This should be
-set using the equation below, where `f_pclk` is the system clock frequency
-provided to the UART, and `f_baud` is the desired baud rate (in bits per second).
-
-$$ NCO = 16 \times {{2^{$bits(NCO)} \times f\_{baud}} \over {f\_{pclk}}} $$
-
-The formula above depends on the NCO CSR width.
-The logic creates a x16 tick when the NCO counter overflows.
-So, the computed baud rate from NCO value is below.
-
-$$ f\_{baud} = {{1 \over 16} \times {NCO \over {2^{$bits(NCO)}}} \times {f\_{pclk}}} $$
-
-Note that the NCO result from the above formula can be a fraction but
-the NCO register only accepts an integer value. This will create an
-error if the baud rate is not divisible by the fixed clock frequency. As
-discussed in the previous section the error rate between the receiver
-and remote transmitter should be lower than `8 / 144` to latch a
-correct character value when parity is not used and lower than `8 /
-160` when parity is used. In the expectation that the device the other
-side of the line behaves similarly, this requires each side have a
-baud rate that is matched to within +/- 2.5% of the ideal baud
-rate. The contribution to this error if NCO is rounded down to an
-integer (which will make the actual baud rate always lower or equal to
-the requested rate) can be computed from:
-
-$$ Error = {{(NCO - INT(NCO))} \over {NCO}} percent $$
-
-In this case if the resulting value of NCO is greater than $$ {1 \over
-0.025} = 40 $$ then this will always be less than the 2.5% error
-target.
-
-For NCO less than 40 the error in baud rate may or may not be
-acceptable and should be carefully checked and rounding to the nearest
-integer may achieve better results. If the computed value is close to
-an integer so that the error in the target range then the baud rate
-can be supported, however if it is too far off an integer then the
-baud rate cannot be supported. This check is needed when
-
-$$ {{baud} < {{40 * f\_{pclk}} \over {2^{$bits(NCO)+4}}}} \qquad OR \qquad
-{{f\_{pclk}} > {{{2^{$bits(NCO)+4}} * {baud}} \over {40}}} $$
-
-Using rounded frequencies and common baud rates, this implies that
-care is needed for 9600 baud and below if the system clock is under
-250MHz, with 4800 baud and below if the system clock is under 125MHz,
-2400 baud and below if the system clock us under 63MHz, and 1200 baud
-and below if the system clock is under 32MHz.
-
-
-### Interrupts
-
-UART module has a few interrupts including general data flow interrupts
-and unexpected event interrupts.
-
-#### tx_watermark / rx_watermark
-If the TX FIFO level becomes smaller than the TX water mark level (configurable via [`FIFO_CTRL.RXILVL`](data/uart.hjson#fifo_ctrl) and [`FIFO_CTRL.TXILVL`](data/uart.hjson#fifo_ctrl)), the `tx_watermark` interrupt is raised to inform SW.
-If the RX FIFO level becomes greater than or equal to RX water mark level (configurable via [`FIFO_CTRL.RXILVL`](data/uart.hjson#fifo_ctrl) and [`FIFO_CTRL.TXILVL`](data/uart.hjson#fifo_ctrl)), the `rx_watermark` interrupt is raised to inform SW.
-
-Note that the watermark interrupts are edge triggered events.
-This means the interrupt only triggers when the condition transitions from untrue->true.
-This is especially important in the tx_watermark case.
-When the TX FIFO is empty, it by default satisfies all the watermark conditions.
-In order for the interrupt to trigger then, it is required that software initiates a write burst that is greater than the programmed watermark value.
-
-For example, assume TX watermark is programmed to be less than 4 bytes, and software programs one byte at a time, waits for it to finish transmitting, before supplying the next byte.
-Under these conditions, the TX watermark interrupt will never trigger because the size of the FIFO never exceeds the watermark level.
-
-
-#### tx_empty
-If TX FIFO becomes empty as part of transmit, the interrupt `tx_empty` is asserted.
-The transmitted contents may be garbage at this point as old FIFO contents will likely be transmitted.
-
-#### rx_overflow
-If RX FIFO receives an additional write request when its FIFO is full,
-the interrupt `rx_overflow` is asserted and the character is dropped.
-
-#### rx_break_err
-The `rx_break_err` interrupt is triggered if a break condition has
-been detected. A break condition is defined as the RX pin being
-continuously low for more than a programmable number of
-character-times (via [`CTRL.RXBLVL`](data/uart.hjson#ctrl), either 2, 4, 8, or 16). A
-character time is 10 bit-times if parity is disabled (START + 8 data +
-STOP) or 11 bit-times if parity is enabled (START + 8 data + parity +
-STOP). If the UART is connected to an external connector this would
-typically indicate the cable has been disconnected (or there is a
-break in the wire). If the UART is connected to another part on the
-same board it would typically indicate the other part has reset or
-rebooted. (If the open connector or resetting peer part causes the RX
-input to not be actively driven, then a pulldown resistor is needed to
-ensure a break and a pullup resistor will ensure the line looks idle
-and no break is generated.)  Note that only one interrupt is generated
-per break -- the line must return high for at least half a bit-time
-before an additional break interrupt is generated. The current break
-status can be read from the [`STATUS.BREAK`](data/uart.hjson#status) bit. If STATUS.BREAK is set
-but [`INTR_STATE.BREAK`](data/uart.hjson#intr_state) is clear then the line break has already caused
-an interrupt that has been cleared but the line break is still going
-on. If [`STATUS.BREAK`](data/uart.hjson#status) is clear but [`INTR_STATE.BREAK`](data/uart.hjson#intr_state) is set then
-there has been a line break for which software has not cleared the
-interrupt but the line is now back to normal.
-
-#### rx_frame_err
-The `rx_frame_err` interrupt is triggered if the RX module receives the `START`
-bit (**0**) and a series of data bits but did not detect the `STOP` bit
-(**1**). This can happen because of noise affecting the line or if the
-transmitter clock is fast or slow compared to the receiver. In a real frame
-error the stop bit will be present just at an incorrect time so the line will
-continue to signal both high and low. The start of a line break (described
-above) matches a frame error with all data bits zero and one frame error
-interrupt will be raised. If the line stays zero until the break error occurs,
-the frame error will be set at every char-time. Frame errors will continue to
-be reported after a break error.
-
-```wavejson
-{
-  signal: [
-    { name: 'Baud Clock',        wave: 'p............'                                                 },
-    { name: 'rx',                wave: '10333333330..', data: [ "lsb", "", "", "", "", "", "", "msb" ] },
-    {},
-    { name: 'intr_rx_frame_err', wave: '0..........1.'},
-  ],
-  head: {
-    text: 'Serial Receive with Framing Error',
-  },
-  foot: {
-    text: [
-      'tspan',
-        ['tspan', 'start bit '],
-        ['tspan', {class:'info h4'}, '0'],
-        ['tspan', ' at cycle -1, stop bit '],
-        ['tspan', {class:'error h4'}, '1'],
-        ['tspan', ' missing at cycle 8'],
-      ],
-    tock: -2,
-  }
-}
-```
-
-The effects of the line being low for certain periods are summarized
-in the table:
-
-|Line low (bit-times) | Frame Err? | Break? | Comment |
-|---------------------|------------|--------|---------|
-|<10                  | If STOP=0  | No     | Normal operation |
-|10 (with parity)     | No         | No     | Normal zero data with STOP=1 |
-|10 (no parity)       | Yes        | No     | Frame error since STOP=0 |
-|11 - RXBLVL*char     | Yes        | No     | Break less than detect level |
-|\>RXBLVL*char        | Yes        | Once   | Frame error signalled at every char-time, break at RXBLVL char-times|
-
-#### rx_timeout
-The `rx_timeout` interrupt is triggered when the RX FIFO has data sitting in it
-without software reading it for a programmable number of bit times (using the
-baud rate clock as reference, programmable via [`TIMEOUT_CTRL`](data/uart.hjson#timeout_ctrl)). This is used to
-alert software that it has data still waiting in the FIFO that has not been
-handled yet. The timeout counter is reset whenever the FIFO depth is changed or
-an `rx_timeout` event occurs. If the RX FIFO is full and new character is
-received, it won't reset the timeout value. The software is responsible for
-keeping the RX FIFO in the level below the watermark. The actual timeout time
-can vary based on the reset of the timeout timer and the start of the
-transaction. For instance, if the software resets the timeout timer by reading a
-character from the RX FIFO and right after it there is a baud clock tick and the
-start of a new RX transaction from the host, the timeout time is reduced by 1
-and half baud clock periods.
-
-#### rx_parity_err
-The `rx_parity_err` interrupt is triggered if parity is enabled and
-the RX parity bit does not match the expected polarity as programmed
-in [`CTRL.PARITY_ODD`](data/uart.hjson#ctrl).
-
-# Programmers Guide
-
-## Initialization
-
-The following code snippet demonstrates initializing the UART to a programmable
-baud rate, clearing the RX and TX FIFO, setting up the FIFOs for interrupt
-levels, and enabling some interrupts. The NCO register controls the baud rate,
-and should be set using the equation below, where `f_pclk` is the fixed clock
-frequency and `f_baud` is the baud rate in bits per second. The UART uses the
-primary clock `clk_i` as a clock source.
-
-$$ NCO = {{2^{20} * f\_{baud}} \over {f\_{pclk}}} $$
-
-Note that the NCO result from the above formula can be a fraction but
-the NCO register only accepts an integer value. See the the
-[Reception](#reception) and [Setting the baud
-rate](#setting-the-baud-rate) sections for more discussion of the
-baud rate error target and when care is needed.
-
-Also note that because the baud rate is multiplied by 2^20 care is
-needed not to overflow 32-bit registers. Baud rates can easily be more
-than 12 bits. The code below is careful to force 64-bit
-arithmetic. (Even if the compiler is pre-computing constants there can
-be unexpected overflow).
-
-```cpp
-#define CLK_FIXED_FREQ_HZ (50ULL * 1000 * 1000)
-
-void uart_init(unsigned int baud) {
-  // nco = 2^20 * baud / fclk. Assume NCO width is 16bit.
-  uint64_t uart_ctrl_nco = ((uint64_t)baud << 20) / CLK_FIXED_FREQ_HZ;
-  REG32(UART_CTRL(0)) =
-      ((uart_ctrl_nco & UART_CTRL_NCO_MASK) << UART_CTRL_NCO_OFFSET) |
-      (1 << UART_CTRL_TX) |
-      (1 << UART_CTRL_RX);
-
-  // clear FIFOs and set up to interrupt on any RX, half-full TX
-  *UART_FIFO_CTRL_REG =
-      UART_FIFO_CTRL_RXRST                 | // clear both FIFOs
-      UART_FIFO_CTRL_TXRST                 |
-      (UART_FIFO_CTRL_RXILVL_RXFULL_1 <<3) | // intr on RX 1 character
-      (UART_FIFO_CTRL_TXILVL_TXFULL_16<<5) ; // intr on TX 16 character
-
-  // enable only RX, overflow, and error interrupts
-  *UART_INTR_ENABLE_REG =
-      UART_INTR_ENABLE_RX_WATERMARK_MASK  |
-      UART_INTR_ENABLE_TX_OVERFLOW_MASK   |
-      UART_INTR_ENABLE_RX_OVERFLOW_MASK   |
-      UART_INTR_ENABLE_RX_FRAME_ERR_MASK  |
-      UART_INTR_ENABLE_RX_PARITY_ERR_MASK;
-
-  // at the processor level, the UART interrupts should also be enabled
-}
-```
-
-## Common Examples
-
-The following code shows the steps to transmit a string of characters.
-
-```cpp
-int uart_tx_rdy() {
-  return ((*UART_FIFO_STATUS_REG & UART_FIFO_STATUS_TXLVL_MASK) == 32) ? 0 : 1;
-}
-
-void uart_send_char(char val) {
-  while(!uart_tx_rdy()) {}
-  *UART_WDATA_REG = val;
-}
-
-void uart_send_str(char *str) {
-  while(*str != '\0') {
-    uart_send_char(*str++);
-}
-```
-
-Do the following to receive a character, with -1 returned if RX is empty.
-
-```cpp
-int uart_rx_empty() {
-  return ((*UART_FIFO_STATUS_REG & UART_FIFO_STATUS_RXLVL_MASK) ==
-          (0 << UART_FIFO_STATUS_RXLVL_LSB)) ? 1 : 0;
-}
-
-int uart_rcv_char() {
-  if(uart_rx_empty())
-    return -1;
-  return *UART_RDATA_REG;
-}
-```
-
-## Interrupt Handling
-
-The code below shows one example of how to handle all UART interrupts
-in one service routine.
-
-```cpp
-void uart_interrupt_routine() {
-  volatile uint32 intr_state = *UART_INTR_STATE_REG;
-  uint32 intr_state_mask = 0;
-  char uart_ch;
-  uint32 intr_enable_reg;
-
-  // Turn off Interrupt Enable
-  intr_enable_reg = *UART_INTR_ENABLE_REG;
-  *UART_INTR_ENABLE_REG = intr_enable_reg & 0xFFFFFF00; // Clr bits 7:0
-
-  if (intr_state & UART_INTR_STATE_RX_PARITY_ERR_MASK) {
-    // Do something ...
-
-    // Store Int mask
-    intr_state_mask |= UART_INTR_STATE_RX_PARITY_ERR_MASK;
-  }
-
-  if (intr_state & UART_INTR_STATE_RX_BREAK_ERR_MASK) {
-    // Do something ...
-
-    // Store Int mask
-    intr_state_mask |= UART_INTR_STATE_RX_BREAK_ERR_MASK;
-  }
-
-  // .. Frame Error
-
-  // TX/RX Overflow Error
-
-  // RX Int
-  if (intr_state & UART_INTR_STATE_RX_WATERMARK_MASK) {
-    while(1) {
-      uart_ch = uart_rcv_char();
-      if (uart_ch == 0xff) break;
-      uart_buf.append(uart_ch);
-    }
-    // Store Int mask
-    intr_state_mask |= UART_INTR_STATE_RX_WATERMARK_MASK;
-  }
-
-  // Clear Interrupt State
-  *UART_INTR_STATE_REG = intr_state_mask;
-
-  // Restore Interrupt Enable
-  *UART_INTR_ENABLE_REG = intr_enable_reg;
-}
-```
-
-One use of the `rx_timeout` interrupt is when the [`FIFO_CTRL.RXILVL`](data/uart.hjson#fifo_ctrl)
-is set greater than one, so an interrupt is only fired when the fifo
-is full to a certain level. If the remote device sends fewer than the
-watermark number of characters before stopping sending (for example it
-is waiting an acknowledgement) then the usual `rx_watermark` interrupt
-would not be raised. In this case an `rx_timeout` would generate an
-interrupt that allows the host to read these additional characters. The
-`rx_timeout` can be selected based on the worst latency experienced by a
-character. The worst case latency experienced by a character will happen
-if characters happen to arrive just slower than the timeout: the second
-character arrives just before the timeout for the first (resetting the
-timer), the third just before the timeout from the second etc. In this
-case the host will eventually get a watermark interrupt, this will happen
-`((RXILVL - 1)*timeout)` after the first character was received.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_uart.h)
-
-## Register Table
-
-* [Register Table](data/uart.hjson#registers)
diff --git a/hw/ip/uart/doc/programmers_guide.md b/hw/ip/uart/doc/programmers_guide.md
new file mode 100644
index 0000000000000..fe3596c2a919d
--- /dev/null
+++ b/hw/ip/uart/doc/programmers_guide.md
@@ -0,0 +1,165 @@
+# Programmer's Guide
+
+## Initialization
+
+The following code snippet demonstrates initializing the UART to a programmable
+baud rate, clearing the RX and TX FIFO, setting up the FIFOs for interrupt
+levels, and enabling some interrupts. The NCO register controls the baud rate,
+and should be set using the equation below, where `f_pclk` is the fixed clock
+frequency and `f_baud` is the baud rate in bits per second. The UART uses the
+primary clock `clk_i` as a clock source.
+
+$$ NCO = {{2^{20} * f\_{baud}} \over {f\_{pclk}}} $$
+
+Note that the NCO result from the above formula can be a fraction but
+the NCO register only accepts an integer value. See the
+[Reception](theory_of_operation.md#reception) and [Setting the baud
+rate](theory_of_operation.md#setting-the-baud-rate) sections for more discussion of the
+baud rate error target and when care is needed.
+
+Also note that because the baud rate is multiplied by 2^20 care is
+needed not to overflow 32-bit registers. Baud rates can easily be more
+than 12 bits. The code below is careful to force 64-bit
+arithmetic. (Even if the compiler is pre-computing constants there can
+be unexpected overflow).
+
+```cpp
+#define CLK_FIXED_FREQ_HZ (50ULL * 1000 * 1000)
+
+void uart_init(unsigned int baud) {
+  // nco = 2^20 * baud / fclk. Assume NCO width is 16bit.
+  uint64_t uart_ctrl_nco = ((uint64_t)baud << 20) / CLK_FIXED_FREQ_HZ;
+  REG32(UART_CTRL(0)) =
+      ((uart_ctrl_nco & UART_CTRL_NCO_MASK) << UART_CTRL_NCO_OFFSET) |
+      (1 << UART_CTRL_TX) |
+      (1 << UART_CTRL_RX);
+
+  // clear FIFOs and set up to interrupt on any RX, half-full TX
+  *UART_FIFO_CTRL_REG =
+      UART_FIFO_CTRL_RXRST                 | // clear both FIFOs
+      UART_FIFO_CTRL_TXRST                 |
+      (UART_FIFO_CTRL_RXILVL_RXFULL_1 <<3) | // intr on RX 1 character
+      (UART_FIFO_CTRL_TXILVL_TXFULL_16<<5) ; // intr on TX 16 character
+
+  // enable only RX, overflow, and error interrupts
+  *UART_INTR_ENABLE_REG =
+      UART_INTR_ENABLE_RX_WATERMARK_MASK  |
+      UART_INTR_ENABLE_TX_OVERFLOW_MASK   |
+      UART_INTR_ENABLE_RX_OVERFLOW_MASK   |
+      UART_INTR_ENABLE_RX_FRAME_ERR_MASK  |
+      UART_INTR_ENABLE_RX_PARITY_ERR_MASK;
+
+  // at the processor level, the UART interrupts should also be enabled
+}
+```
+
+## Common Examples
+
+The following code shows the steps to transmit a string of characters.
+
+```cpp
+int uart_tx_rdy() {
+  return ((*UART_FIFO_STATUS_REG & UART_FIFO_STATUS_TXLVL_MASK) == 32) ? 0 : 1;
+}
+
+void uart_send_char(char val) {
+  while(!uart_tx_rdy()) {}
+  *UART_WDATA_REG = val;
+}
+
+void uart_send_str(char *str) {
+  while(*str != '\0') {
+    uart_send_char(*str++);
+}
+```
+
+Do the following to receive a character, with -1 returned if RX is empty.
+
+```cpp
+int uart_rx_empty() {
+  return ((*UART_FIFO_STATUS_REG & UART_FIFO_STATUS_RXLVL_MASK) ==
+          (0 << UART_FIFO_STATUS_RXLVL_LSB)) ? 1 : 0;
+}
+
+int uart_rcv_char() {
+  if(uart_rx_empty())
+    return -1;
+  return *UART_RDATA_REG;
+}
+```
+
+## Interrupt Handling
+
+The code below shows one example of how to handle all UART interrupts
+in one service routine.
+
+```cpp
+void uart_interrupt_routine() {
+  volatile uint32 intr_state = *UART_INTR_STATE_REG;
+  uint32 intr_state_mask = 0;
+  char uart_ch;
+  uint32 intr_enable_reg;
+
+  // Turn off Interrupt Enable
+  intr_enable_reg = *UART_INTR_ENABLE_REG;
+  *UART_INTR_ENABLE_REG = intr_enable_reg & 0xFFFFFF00; // Clr bits 7:0
+
+  if (intr_state & UART_INTR_STATE_RX_PARITY_ERR_MASK) {
+    // Do something ...
+
+    // Store Int mask
+    intr_state_mask |= UART_INTR_STATE_RX_PARITY_ERR_MASK;
+  }
+
+  if (intr_state & UART_INTR_STATE_RX_BREAK_ERR_MASK) {
+    // Do something ...
+
+    // Store Int mask
+    intr_state_mask |= UART_INTR_STATE_RX_BREAK_ERR_MASK;
+  }
+
+  // .. Frame Error
+
+  // TX/RX Overflow Error
+
+  // RX Int
+  if (intr_state & UART_INTR_STATE_RX_WATERMARK_MASK) {
+    while(1) {
+      uart_ch = uart_rcv_char();
+      if (uart_ch == 0xff) break;
+      uart_buf.append(uart_ch);
+    }
+    // Store Int mask
+    intr_state_mask |= UART_INTR_STATE_RX_WATERMARK_MASK;
+  }
+
+  // Clear Interrupt State
+  *UART_INTR_STATE_REG = intr_state_mask;
+
+  // Restore Interrupt Enable
+  *UART_INTR_ENABLE_REG = intr_enable_reg;
+}
+```
+
+One use of the `rx_timeout` interrupt is when the [`FIFO_CTRL.RXILVL`](../data/uart.hjson#fifo_ctrl)
+is set greater than one, so an interrupt is only fired when the FIFO
+is full to a certain level. If the remote device sends fewer than the
+watermark number of characters before stopping sending (for example it
+is waiting for an acknowledgement) then the usual `rx_watermark` interrupt
+would not be raised. In this case an `rx_timeout` would generate an
+interrupt that allows the host to read these additional characters. The
+`rx_timeout` can be selected based on the worst latency experienced by a
+character. The worst case latency experienced by a character will happen
+if characters happen to arrive just slower than the timeout: the second
+character arrives just before the timeout for the first (resetting the
+timer), the third just before the timeout from the second etc. In this
+case the host will eventually get a watermark interrupt; this will happen
+`((RXILVL - 1)*timeout)` after the first character was received.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_uart.h)
+
+## Register Table
+
+* [Register Table](../data/uart.hjson#registers)
diff --git a/hw/ip/uart/doc/theory_of_operation.md b/hw/ip/uart/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..3bcc85147cb2e
--- /dev/null
+++ b/hw/ip/uart/doc/theory_of_operation.md
@@ -0,0 +1,284 @@
+# Theory of Operation
+
+## Block Diagram
+
+![UART Block Diagram](../doc/block_diagram.svg)
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/uart.hjson#interfaces)
+
+## Design Details
+
+### Serial interface (both directions)
+
+The TX/RX serial lines are high when idle. Data starts with a START bit (high
+idle state deasserts, **1**-->**0**) followed by 8 data bits. The least
+significant bit is sent first. If the parity feature is turned on then an odd or
+even parity bit follows after the data bits. Finally a STOP (**1**) bit
+completes one byte of data transfer.
+
+```wavejson
+{
+  signal: [
+    { name: 'Baud Clock',     wave: 'p............'                                                        },
+    { name: 'tx',             wave: '10333333331..', data: [ "lsb", "", "", "", "", "", "", "msb" ]        },
+    { name: 'Baud Clock',     wave: 'p............'                                                        },
+    { name: 'tx (w/ parity)', wave: '103333333341.', data: [ "lsb", "", "", "", "", "", "", "msb", "par" ] },
+  ],
+  head: {
+    text: 'Serial Transmission Frame',
+  },
+  foot: {
+    text: 'start bit ("0") at cycle -1, stop bit ("1") at cycle 8, or after parity bit',
+    tock: -2
+  },
+  foot: {
+    text: [
+      'tspan',
+        ['tspan', 'start bit '],
+        ['tspan', {class:'info h4'}, '0'],
+        ['tspan', ' at cycle -1, stop bit '],
+        ['tspan', {class:'info h4'}, '1'],
+        ['tspan', ' at cycle 8, or at cycle 9 after parity bit'],
+      ],
+    tock: -2,
+  }
+}
+```
+
+### Transmission
+
+A write to [`WDATA`](../data/uart.hjson#wdata) enqueues a data byte into the 32 byte deep write FIFO, which
+triggers the transmit module to start UART TX serial data transfer. The TX
+module dequeues the byte from the FIFO and shifts it bit by bit out to the UART
+TX pin on positive edges of the baud clock.
+
+If TX is not enabled, data written into the FIFO will be queued and sent out
+when TX is enabled.
+
+When the FIFO becomes empty as part of transmission, a TX FIFO empty interrupt will be raised.
+This is separate from the TX FIFO water mark interrupt.
+
+
+### Reception
+
+The RX module oversamples the RX input pin at 16x the requested
+baud clock. When the input is detected low the receiver will check
+half a bit-time later (i.e. 8 cycles of the oversample clock) that the
+line is still low before detecting the START bit. If the line has
+returned high the glitch is ignored. After it detects the START bit,
+the RX module samples at the center of each bit-time and gathers
+incoming serial bits into a character buffer. If the STOP bit is
+detected as high and the optional parity bit is correct the data byte
+is pushed into a 32 byte deep RX FIFO. The data can be read out by
+reading the [`RDATA`](../data/uart.hjson#rdata) register.
+
+This behaviour of the receiver can be used to compute the approximate
+baud clock frequency error that can be tolerated between the
+transmitter at the other end of the cable and the receiver. The
+initial sample point is aligned with the center of the START bit. The
+receiver will then sample every 16 cycles of the 16 x baud clock, the
+diagram below shows the number of ticks after the centering that each
+bit is captured. Because of the frequency difference between the
+transmitter and receiver the actual sample point will drift compared to
+the ideal center of the bit. In order to correctly receive the STOP
+bit it must be sampled between the "early" and "late" points shown
+on the diagram, which are half a bit-time or 8 ticks of the 16x baud
+clock before or after the center. If the transmitter is considered
+"ideal" then the local clock must thus differ by no more than plus or
+minus 8 ticks in 144 or approximately +/- 5.5%. If parity is enabled
+the stop bit will be a bit time later, so this becomes 8/160 or about
++/- 5%.
+
+```wavejson
+{
+  signal: [
+    { name: 'Sample', wave: '', node: '..P............', period: "2" },
+    {},
+    { name: 'rx',
+      wave: '1.0.3.3.3.3.3.3.3.3.1.0.3',
+      node: '...A................C.D..',
+      cdata: [ "idle", "start", "+16", "+32", "+48", "+64", "+80",
+                "+96", "+112", "+128", "+144", "next start" ] },
+  ],
+    "edge"   : ["P-|>A center", "P-|>C early", "P-|>D late"],
+  head: {
+    text: 'Receiver sampling window',
+  },
+}
+```
+
+In practice, the transmitter and receiver will both differ from the
+ideal baud rate. Since the worst case difference for reception is 5%,
+the uart can be expected to work if both sides are within +/- 2.5% of
+the ideal baud rate.
+
+### Setting the baud rate
+
+The baud rate is set by writing to the [`CTRL.NCO`](../data/uart.hjson#ctrl) register field. This should be
+set using the equation below, where `f_pclk` is the system clock frequency
+provided to the UART, and `f_baud` is the desired baud rate (in bits per second).
+
+$$ NCO = 16 \times {{2^{$bits(NCO)} \times f\_{baud}} \over {f\_{pclk}}} $$
+
+The formula above depends on the NCO CSR width.
+The logic creates a x16 tick when the NCO counter overflows.
+So, the computed baud rate from NCO value is below.
+
+$$ f\_{baud} = {{1 \over 16} \times {NCO \over {2^{$bits(NCO)}}} \times {f\_{pclk}}} $$
+
+Note that the NCO result from the above formula can be a fraction but
+the NCO register only accepts an integer value. This will create an
+error if the baud rate is not divisible by the fixed clock frequency. As
+discussed in the previous section the error rate between the receiver
+and remote transmitter should be lower than `8 / 144` to latch a
+correct character value when parity is not used and lower than `8 /
+160` when parity is used. In the expectation that the device the other
+side of the line behaves similarly, this requires each side have a
+baud rate that is matched to within +/- 2.5% of the ideal baud
+rate. The contribution to this error if NCO is rounded down to an
+integer (which will make the actual baud rate always lower or equal to
+the requested rate) can be computed from:
+
+$$ Error = {{(NCO - INT(NCO))} \over {NCO}} percent $$
+
+In this case if the resulting value of NCO is greater than $$ {1 \over
+0.025} = 40 $$ then this will always be less than the 2.5% error
+target.
+
+For NCO less than 40 the error in baud rate may or may not be
+acceptable and should be carefully checked and rounding to the nearest
+integer may achieve better results. If the computed value is close to
+an integer so that the error is in the target range then the baud rate
+can be supported, however if it is too far off an integer then the
+baud rate cannot be supported. This check is needed when
+
+$$ {{baud} < {{40 * f\_{pclk}} \over {2^{$bits(NCO)+4}}}} \qquad OR \qquad
+{{f\_{pclk}} > {{{2^{$bits(NCO)+4}} * {baud}} \over {40}}} $$
+
+Using rounded frequencies and common baud rates, this implies that
+care is needed for 9600 baud and below if the system clock is under
+250MHz, with 4800 baud and below if the system clock is under 125MHz,
+2400 baud and below if the system clock is under 63MHz, and 1200 baud
+and below if the system clock is under 32MHz.
+
+
+### Interrupts
+
+The UART module has a few interrupts including general data flow interrupts
+and unexpected event interrupts.
+
+#### tx_watermark / rx_watermark
+If the TX FIFO level becomes smaller than the TX water mark level (configurable via [`FIFO_CTRL.TXILVL`](../data/uart.hjson#fifo_ctrl)), the `tx_watermark` interrupt is raised to inform SW.
+If the RX FIFO level becomes greater than or equal to the RX water mark level (configurable via [`FIFO_CTRL.RXILVL`](../data/uart.hjson#fifo_ctrl)), the `rx_watermark` interrupt is raised to inform SW.
+
+Note that the watermark interrupts are edge triggered events.
+This means the interrupt only triggers when the condition transitions from untrue->true.
+This is especially important in the tx_watermark case.
+When the TX FIFO is empty, it by default satisfies all the watermark conditions.
+In order for the interrupt to trigger then, it is required that software initiates a write burst that is greater than the programmed watermark value.
+
+For example, assume TX watermark is programmed to be less than 4 bytes, and software programs one byte at a time, waits for it to finish transmitting, before supplying the next byte.
+Under these conditions, the TX watermark interrupt will never trigger because the size of the FIFO never exceeds the watermark level.
+
+
+#### tx_empty
+If TX FIFO becomes empty as part of transmit, the interrupt `tx_empty` is asserted.
+The transmitted contents may be garbage at this point as old FIFO contents will likely be transmitted.
+
+#### rx_overflow
+If RX FIFO receives an additional write request when its FIFO is full,
+the interrupt `rx_overflow` is asserted and the character is dropped.
+
+#### rx_break_err
+The `rx_break_err` interrupt is triggered if a break condition has
+been detected. A break condition is defined as the RX pin being
+continuously low for more than a programmable number of
+character-times (via [`CTRL.RXBLVL`](../data/uart.hjson#ctrl), either 2, 4, 8, or 16). A
+character time is 10 bit-times if parity is disabled (START + 8 data +
+STOP) or 11 bit-times if parity is enabled (START + 8 data + parity +
+STOP). If the UART is connected to an external connector this would
+typically indicate the cable has been disconnected (or there is a
+break in the wire). If the UART is connected to another part on the
+same board it would typically indicate the other part has reset or
+rebooted. (If the open connector or resetting peer part causes the RX
+input to not be actively driven, then a pulldown resistor is needed to
+ensure a break and a pullup resistor will ensure the line looks idle
+and no break is generated.)  Note that only one interrupt is generated
+per break -- the line must return high for at least half a bit-time
+before an additional break interrupt is generated. The current break
+status can be read from the [`STATUS.BREAK`](../data/uart.hjson#status) bit. If STATUS.BREAK is set
+but [`INTR_STATE.BREAK`](../data/uart.hjson#intr_state) is clear then the line break has already caused
+an interrupt that has been cleared but the line break is still going
+on. If [`STATUS.BREAK`](../data/uart.hjson#status) is clear but [`INTR_STATE.BREAK`](../data/uart.hjson#intr_state) is set then
+there has been a line break for which software has not cleared the
+interrupt but the line is now back to normal.
+
+#### rx_frame_err
+The `rx_frame_err` interrupt is triggered if the RX module receives the `START`
+bit (**0**) and a series of data bits but did not detect the `STOP` bit
+(**1**). This can happen because of noise affecting the line or if the
+transmitter clock is fast or slow compared to the receiver. In a real frame
+error the stop bit will be present just at an incorrect time so the line will
+continue to signal both high and low. The start of a line break (described
+above) matches a frame error with all data bits zero and one frame error
+interrupt will be raised. If the line stays zero until the break error occurs,
+the frame error will be set at every char-time. Frame errors will continue to
+be reported after a break error.
+
+```wavejson
+{
+  signal: [
+    { name: 'Baud Clock',        wave: 'p............'                                                 },
+    { name: 'rx',                wave: '10333333330..', data: [ "lsb", "", "", "", "", "", "", "msb" ] },
+    {},
+    { name: 'intr_rx_frame_err', wave: '0..........1.'},
+  ],
+  head: {
+    text: 'Serial Receive with Framing Error',
+  },
+  foot: {
+    text: [
+      'tspan',
+        ['tspan', 'start bit '],
+        ['tspan', {class:'info h4'}, '0'],
+        ['tspan', ' at cycle -1, stop bit '],
+        ['tspan', {class:'error h4'}, '1'],
+        ['tspan', ' missing at cycle 8'],
+      ],
+    tock: -2,
+  }
+}
+```
+
+The effects of the line being low for certain periods are summarized
+in the table:
+
+|Line low (bit-times) | Frame Err? | Break? | Comment |
+|---------------------|------------|--------|---------|
+|<10                  | If STOP=0  | No     | Normal operation |
+|10 (with parity)     | No         | No     | Normal zero data with STOP=1 |
+|10 (no parity)       | Yes        | No     | Frame error since STOP=0 |
+|11 - RXBLVL*char     | Yes        | No     | Break less than detect level |
+|\>RXBLVL*char        | Yes        | Once   | Frame error signalled at every char-time, break at RXBLVL char-times|
+
+#### rx_timeout
+The `rx_timeout` interrupt is triggered when the RX FIFO has data sitting in it
+without software reading it for a programmable number of bit times (using the
+baud rate clock as reference, programmable via [`TIMEOUT_CTRL`](../data/uart.hjson#timeout_ctrl)). This is used to
+alert software that it has data still waiting in the FIFO that has not been
+handled yet. The timeout counter is reset whenever the FIFO depth is changed or
+an `rx_timeout` event occurs. If the RX FIFO is full and a new character is
+received, it won't reset the timeout value. The software is responsible for
+keeping the RX FIFO in the level below the watermark. The actual timeout time
+can vary based on the reset of the timeout timer and the start of the
+transaction. For instance, if the software resets the timeout timer by reading a
+character from the RX FIFO and right after it there is a baud clock tick and the
+start of a new RX transaction from the host, the timeout time is reduced by one
+and a half baud clock periods.
+
+#### rx_parity_err
+The `rx_parity_err` interrupt is triggered if parity is enabled and
+the RX parity bit does not match the expected polarity as programmed
+in [`CTRL.PARITY_ODD`](../data/uart.hjson#ctrl).
diff --git a/hw/ip/usbdev/README.md b/hw/ip/usbdev/README.md
index ec0da2c336a79..66a7fd4d41f0b 100644
--- a/hw/ip/usbdev/README.md
+++ b/hw/ip/usbdev/README.md
@@ -37,361 +37,3 @@ The physical layer interface features multiple transmit and receive paths to all
 ## Compatibility
 
 The USB device programming interface is not based on any existing interface.
-
-
-# Theory of Operations
-
-A useful quick reference for USB Full-Speed is [USB Made Simple, Part 3 - Data Flow.](http://www.usbmadesimple.co.uk/ums_3.htm)
-
-The block diagram shows a high level view of the USB device including the main register access paths.
-
-![Block Diagram](doc/usbdev_block.svg)
-
-
-## Clocking
-
-The USB Full-Speed interface runs at a data rate of 12 MHz.
-The interface runs at four times this frequency and must be clocked from an accurate 48 MHz clock source.
-The USB specification for a Full-Speed device requires the average bit rate is 12 Mbps +/- 0.25%, so the clock needs to support maximum error of 2,500 ppm.
-The maximum allowable integrated jitter is +/- 1 ns over 1 to 7 bit periods.
-
-This module features the following output signals to provide a reference for synchronizing the 48 MHz clock source:
-- `usb_ref_pulse_o` indicates the reception of a start of frame (SOF) packet.
-  The host is required to send a SOF packet every 1 ms.
-- `usb_ref_val_o` serves as a valid signal for `usb_ref_pulse_o`.
-  It is set to one after the first SOF packet is received and remains high as long as `usb_ref_pulse_o` continues to behave as expected.
-  As soon as it is detected that SOF will not be received as expected (usually because the link is no longer active), `usb_ref_val_o` deasserts to zero until after the next `usb_ref_pulse_o`.
-
-Both these signals are synchronous to the 48 MHz clock.
-They can be forced to zero by setting [`phy_config.usb_ref_disable`](data/usbdev.hjson#phy_config) to `1`.
-
-To successfully receive SOF packets without errors and thereby enabling clock synchronization, the initial accuracy of the 48 MHz clock source should be within 3.2% or 32,000 ppm.
-This requirement comes from the fact that the SOF packet has a length of 24 bits (plus 8-bit sync field).
-The first 8 bits are used to transfer the SOF packet ID (8'b01011010).
-Internally, the USB device dynamically adjusts the sampling point based on observed line transitions.
-Assuming the last bit of the SOF packet ID is sampled in the middle of the eye, the drift over the remaining 16 bits of the packet must be lower than half a bit (10^6 * (0.5/16) = 32,000 ppm).
-
-To externally monitor the 48 MHz clock, the USB device supports an oscillator test mode which can be enabled by setting [`phy_config.tx_osc_test_mode`](data/usbdev.hjson#phy_config) to `1`.
-In this mode, the device constantly transmits a J/K pattern but no longer receives SOF packets.
-Consequently, it does not generate reference pulses for clock synchronization.
-The clock might drift off.
-
-Control transfers pass through synchronous FIFOs or have a ready bit synchronized across the clock domain boundary.
-A dual-port synchronous buffer SRAM is used for data transfers, and the bus clock and USB clock come from the same 48 MHz input.
-The wake detection module is clocked by a separate clock, and a couple registers are used to interface with it.
-Any bus-related clock domain crossings must happen outside the core, except for the transition between the 48 MHz clock and the wake detection module's clock.
-The 48 MHz clock must be enabled to reach the registers in `usbdev`.
-
-
-## USB Interface Pins
-
-Full-Speed USB uses a bidirectional serial interface as shown in Figure 7-24 of the [USB 2.0 Full-Speed specification](https://www.usb.org/document-library/usb-20-specification).
-For reasons of flexibility, this IP block features multiple transmit and receive paths for interfacing with various transceivers.
-
-The following sections describe how the various input/output signals relate to the USB interface pins for the different receive and transmit configurations.
-
-
-### Data Transmit
-
-The IP block supports two different encodings, driving out on separate TX interfaces.
-The default encoding looks like the USB bus, with D+ and D- values driven on usb_dp_o and usb_dn_o pins.
-The alternate encoding uses usb_se0_o to indicate a single-ended zero (SE0), and usb_d_o encodes K/J (when usb_se0_o is low).
-The TX mode can be selected by setting the `use_tx_d_se0` bit in [`phy_config`](data/usbdev.hjson#phy_config) to either 1 (alternate, using d/se0) or 0 (default, using dp/dn).
-
-The following table summarizes how the different output signals relate to the USB interface pins.
-
-|  External Pins | Internal Signals | Notes |
-|----------------|------------------|-------|
-| D+, D-         | dp_o, dn_o       | Data output with an encoding like the USB bus, intended to go directly to pads for supported targets. On an FPGA, the components should be used with a USB transceiver, as the regular bidirectional I/O cells will likely not be USB compliant. |
-| [Alt TX Data]  | se0_o            | Signal Single-Ended Zero (SE0) link state to a USB transceiver. |
-| [Alt TX Data]  | d_o              | Data output used for encoding K and J, for interfacing with a USB transceiver. |
-|   [TX Mode]    | tx_use_d_se0_o   | Indicates the selected TX interface: use dp_o and dn_o (0) or use d_o and se0_o (1). |
-
-Note that according to the [Comportable guideline for peripheral functionality](../../../doc/contributing/hw/comportability/README.md), every output signal `name_o` has a dedicated output enable `name_en_o`.
-For TX data, these separate signals `dp_en_o` and `dn_en_o` all correspond to the same TX or output enable signal (`OE` in the USB spec).
-The other signals listed are of the "intersignal" variety, and they do not go directly to pads or have dedicated output enable signals.
-
-
-### Data Receive
-
-The IP block supports recovery of the differential K and J symbols from the output of an external differential receiver or directly from the D+/D- pair.
-The RX mode can be selected to use a differential receiver's output by setting the `use_diff_rcvr` bit in [`phy_config`](data/usbdev.hjson#phy_config).
-The D+/D- pair is always used to detect the single-ended zero (SE0) state.
-
-The following table summarizes how the different input signals relate to the USB interface pins.
-
-|  External Pins | Internal Signals | Notes |
-|----------------|------------------|-------|
-| D+, D-         | dp_i, dn_i       | D+ and D- signals passing into the IP single-ended, intended to go directly to pads for supported targets. These signals are used to detect the SE0 link state, and if a differential receiver is not present, they are also used for K and J symbols. On an FPGA, the components should be used with a USB transceiver, as the bidirectional regular IO cells will likely not be USB compliant. |
-| [Diff Rcvr Out]| d_i              | Data input for interfacing with a differential receiver, which is required for this input. |
-
-
-### Non-Data Pins
-
-The USB device features the following non-data pins.
-
-|  External Pins | Internal Signals         | Notes |
-|----------------|--------------------------|-------|
-| sense (VBUS)   | sense_i                  | The sense pin indicates the presence of VBUS from the USB host. |
-| [pullup]       | dp_pullup_o, dn_pullup_o | When dp_pullup_o or dn_pullup_o asserts a 1.5k pullup resistor should be connected to D+ or D-, respectively. This can be done inside the chip or with an external pin. A permanently connected resistor could be used if the pin flip feature is not needed, but this is not recommended because there is then no way to force the device to appear to unplug. Only one of the pullup signals can be asserted at any time. The selection is based on the `pinflip` bit in [`phy_config`](data/usbdev.hjson#phy_config). Because this is a Full-Speed device the resistor must be on the D+ pin, so when `pinflip` is zero, dp_pullup_o is used. |
-| [suspend]      | suspend_o                | The suspend pin indicates to the USB transceiver that a constant idle has been detected on the link and the device is in the Suspend state (see Section 7.1.7.6 of the [USB 2.0 specification](https://www.usb.org/document-library/usb-20-specification)). |
-| [rx_enable]    | rx_enable_o              | The rx_enable pin turns on/off a differential receiver. It is enabled via a CSR and automatically disabled when the device suspends. |
-
-The USB host will identify itself to the device by enabling the 5V VBUS power.
-It may do a hard reset of a port by removing and reasserting VBUS (the Linux driver will do this when it finds a port in an inconsistent state or a port that generates errors during enumeration).
-The IP block detects VBUS through the sense pin.
-This pin is always an input and should be externally connected to detect the state of the VBUS.
-Note that this may require a resistor divider or (for USB-C where VBUS can be up to 20V) active level translation to an acceptable voltage for the input pin.
-
-A Full-Speed device identifies itself by providing a 1.5k pullup resistor (to 3.3V) on the D+ line.
-The IP block produces a signal `dp_pullup_o` that is asserted when this resistor should be presented.
-This signal will be asserted whenever the interface is enabled and VBUS is present.
-In an FPGA implementation, this signal can drive a 3.3V output pin that is driven high when the signal is asserted and set high impedance when the signal is deasserted, and the output pin used to drive a 1.5k resistor connected on the board to the D+ line.
-Alternatively, it can be used to enable an internal 1.5k pullup on the D+ pin.
-
-This USB device supports the flipping of D+/D-.
-If the `pinflip` bit in [`phy_config`](data/usbdev.hjson#phy_config) is set, the data pins are flipped internally, meaning the 1.5k pullup resistor needs to be on the external D- line.
-To control the pullup on the D- line, this USB device features `dn_pullup_o` signal.
-Of the two pullup signals `dp_pullup_o` and `dn_pullup_o`, only one can be enabled at any time.
-As this is a Full-Speed device, `dp_pullup_o`, i.e., the pullup on D+ is used by default (`pinflip` equals zero).
-
-## Hardware Interfaces
-
-* [Interface Tables](data/usbdev.hjson#interfaces)
-
-
-## USB Link State
-
-The USB link has a number of states.
-These are detected and reported in [`usbstat.link_state`](data/usbdev.hjson#usbstat) and state changes are reported using interrupts.
-The FSM implements a subset of the USB device state diagram shown in Figure 9-1 of the [USB 2.0 specification.](https://www.usb.org/document-library/usb-20-specification)
-
-|State| Description |
-|-----|-------------|
-|Disconnected | The link is disconnected. This is signaled when the VBUS is not driven by the host, which results in the sense input pin being low, or when the user has not connected the pull-up by enabling the interface. An interrupt is raised on entering this state.|
-|Powered| The device has been powered as VBUS is being driven by the host and the user has connected the pull-up, but the device has not been reset yet. The link is reset whenever the D+ and D- are both low (an SE0 condition) for an extended period. The host will assert reset for a minimum of 10 ms, but the USB specification allows the device to detect and respond to a reset after 2.5 us. The implementation here will report the reset state and raise an interrupt when the link is in SE0 for 3 us.|
-|Powered Suspended| The link is suspended when at idle (a J condition) for more than 3 ms. An interrupt is generated when the suspend is detected and a resume interrupt is generated when the link exits the suspend state. This state is entered, if the device has not been reset yet.|
-|Active No SOF| The link has been reset and can begin receiving packets, but no Start-of-Frame packets have yet been seen.|
-|Active| The link is active when it is running normally. |
-|Suspended| Similar to 'Powered Suspended', but the device was in the active state before being suspended.|
-|Resuming| The link is awaiting the end of resume signaling before transitioning to the Active No SOF state.|
-
-|Link Events| Description |
-|-----------|-------------|
-|Disconnect| VBUS has been lost. |
-|Link Reset| The link has been in the SE0 state for 3 us.|
-|Link Suspend| The link has been in the J state for more than 3 ms, upon which we have to enter the Suspend state.|
-|Link Resume| The link has been driven to a non-J state after being in Suspend. For the case of resuming to active link states, the end of resume signaling has occurred.|
-|Host Lost| Signaled using an interrupt if the link is active but a start of frame (SOF) packet has not been received from the host in 4 frames. The host is required to send a SOF packet every 1 ms. This is not an expected condition.|
-
-
-## USB Protocol Engine
-
-The USB 2.0 Full-Speed Protocol Engine is provided by the common USB interface code and is, strictly speaking, not part of this USB device module.
-
-At the lowest level of the USB stack the transmit bitstream is serialized, converted to non-return-to-zero inverted (NRZI) encoding with bit-stuffing and sent to the transmitter.
-The received bitstream is recovered, clock aligned and decoded and has bit-stuffing removed.
-The recovered clock alignment is used for transmission.
-
-The higher level protocol engine forms the bitstream into packets, performs CRC checking and recognizes IN, OUT and SETUP transactions.
-These are presented to this module without buffering.
-This means the USB device module must accept or provide data when requested.
-The protocol engine may cancel a transaction because of a bad cyclic redundancy check (CRC) or request a retry if an acknowledgment (ACK) was not received.
-
-
-## Buffer Interface
-
-A 2 kB SRAM is used as a packet buffer to hold data between the system and the USB interface.
-This is divided up into 32 buffers each containing 64 bytes.
-This is an asynchronous dual-port SRAM with software accessing from the bus clock domain and the USB interface accessing from the USB 48 MHz clock domain.
-
-
-### Reception
-
-Software provides buffers for packet reception through a 4-entry Available Buffer FIFO.
-(More study needed but four seems about right: one just returned to software, one being filled, one ready to be filled, and one for luck.)
-The [`rxenable_out`](data/usbdev.hjson#rxenable_out) and [`rxenable_setup`](data/usbdev.hjson#rxenable_setup) registers is used to indicate which endpoints will accept data from the host using OUT or SETUP transactions, respectively.
-When a packet is transferred from the host to the device (using an OUT or SETUP transaction) and reception of that type of transaction is enabled for the requested endpoint, the next buffer ID is pulled from the Available Buffer FIFO.
-The packet data is written to the corresponding buffer in the packet buffer (the 2 kB SRAM).
-If the packet is correctly received, an ACK is returned to the host.
-In addition, the buffer ID, the packet size, an out/setup flag and the endpoint ID are passed back to software using the Received Buffer FIFO and a pkt_received interrupt is raised.
-
-Software should immediately provide a free buffer for future reception by writing the corresponding buffer ID to the Available Buffer FIFO.
-It can then process the packet and eventually return the received buffer to the free pool.
-This allows streaming on a single endpoint or across a number of endpoints.
-If the packets cannot be consumed at the rate they are received, software can implement selective flow control by clearing [`rxenable_out`](data/usbdev.hjson#rxenable_out) for a particular endpoint, which will result in a request to that endpoint being NAKed (negative acknowledgment).
-In the unfortunate event that the Available Buffer FIFO is empty or the Received Buffer FIFO is full, all OUT transactions are NAKed and SETUP transactions are ignored.
-In that event, the host will retry the transaction (up to some maximum attempts or time).
-
-There are two options for a given OUT endpoint's flow control, controlled by the [`set_nak_out`](data/usbdev.hjson#set_nak_out) register.
-If `set_nak_out` is 0 for the endpoint, it will accept packets as long as there are buffers available in the Available Buffer FIFO and space available in the Received Buffer FIFO.
-For timing, this option implies that software may not be able to affect the response to a given transaction, and buffer availability is the only needed factor.
-If `set_nak_out` is 1 for the endpoint, it will clear its corresponding bit in the [`rxenable_out`](data/usbdev.hjson#rxenable_out) register, forcing NAK responses to OUT transactions to that endpoint until software can intervene.
-That option uses NAK to defer the host, and this enables software to implement features that require protocol-level control at transaction boundaries, such as when implementing the functional stall.
-
-
-### Transmission
-
-To send data to the host in response to an IN transaction, software first writes the data into a free buffer.
-Then, it writes the buffer ID, data length and rdy flag to the [`configin`](data/usbdev.hjson#configin) register of the corresponding endpoint.
-When the host next does an IN transaction to that endpoint, the data will be sent from the buffer.
-On receipt of the ACK from the host, the rdy bit in the [`configin`](data/usbdev.hjson#configin) register will be cleared, and the bit corresponding to the endpoint ID will be set in the [`in_sent`](data/usbdev.hjson#in_sent) register causing a pkt_sent interrupt to be raised.
-Software can return the buffer to the free pool and write a 1 to clear the endpoint bit in the [`in_sent`](data/usbdev.hjson#in_sent) register.
-Note that streaming can be achieved if the next buffer has been prepared and is written to the [`configin`](data/usbdev.hjson#configin) register when the interrupt is received.
-
-A Control transfer requires one or more IN transactions, either during the data stage or the status stage.
-Therefore, when a SETUP transaction is received for an endpoint, any buffers that are waiting to be sent out to the host from that endpoint are canceled by clearing the rdy bit in the corresponding [`configin`](data/usbdev.hjson#configin) register.
-To keep track of such canceled buffers, the pend bit in the same register is set.
-The transfer must be queued again after the Control transfer is completed.
-
-Similarly, a Link Reset cancels any waiting IN transactions by clearing the rdy bit in the [`configin`](data/usbdev.hjson#configin) register of all endpoints.
-The pend bit in the [`configin`](data/usbdev.hjson#configin) register is set for all endpoints with a pending IN transaction.
-
-
-### Buffer Count and Size
-
-Under high load, the 32 buffers of the packet buffer (2 kB SRAM) are allocated as follows:
-- 1 is being processed following reception,
-- 4 are in the Available Buffer FIFO, and
-- 12 (worst case) waiting transmissions in the [`configin`](data/usbdev.hjson#configin) registers.
-This leaves 15 buffers for preparation of future transmissions (which would need 12 in the worst case of one per endpoint) and the free pool.
-
-The size of 64 bytes per buffer satisfies the maximum USB packet size for a Full-Speed interface for Control transfers (max may be 8, 16, 32 or 64 bytes), Bulk Transfers (max is 64 bytes) and Interrupt transfers (max is 64 bytes).
-It is small for Isochronous transfers (which have a max size of 1023 bytes).
-The interface will need extending for high rate isochronous use (a possible option would be to allow up to 8 or 16 64-byte buffers to be aggregated as the isochronous buffer).
-
-
-# Design Details
-
-
-# Programmers Guide
-
-
-## Initialization
-
-The basic hardware initialization is to (in any order) configure the physical interface for the implementation via the [`phy_config`](data/usbdev.hjson#phy_config) register, fill the Available Buffer FIFO, enable IN and OUT endpoints with ID 0 (this is the control endpoint that the host will use to configure the interface), enable reception of SETUP and OUT packets on OUT Endpoint 0, and enable any required interrupts.
-Finally, the interface is enabled by setting the enable bit in the [`usbctrl`](data/usbdev.hjson#usbctrl) register.
-Setting this bit causes the USB device to assert the pullup on the D+ line, which is used by the host to detect the device.
-There is no need to configure the device ID in ([`usbctrl.device_address`](data/usbdev.hjson#usbctrl)) at this point -- the line remains in reset and the hardware forces the device ID to zero.
-
-The second stage of initialization is done under control of the host, which will use control transfers (always beginning with SETUP transactions) to Endpoint 0.
-Initially these will be sent to device ID 0.
-When a Set Address request is received, the device ID received must be stored in the [`usbctrl.device_address`](data/usbdev.hjson#usbctrl) register.
-Note that device 0 is used for the entire control transaction setting the new device ID, so writing the new ID to the register should not be done until the ACK for the Status stage has been received (see [USB 2.0 specification](https://www.usb.org/document-library/usb-20-specification)).
-
-The host will then issue additional control transfers to Endpoint 0 to configure the device, now to the device's configured address.
-In response to the Set Configuration request, software should set up the rest of the endpoints for that configuration, including configuring the flow control behavior for OUT endpoints via the [`set_nak_out`](data/usbdev.hjson#set_nak_out) register, configuring the endpoint type via the [`rxenable_setup`](data/usbdev.hjson#rxenable_setup) register (for a control endpoint) and the [`out_iso`](data/usbdev.hjson#out_iso) and [`in_iso`](data/usbdev.hjson#in_iso) registers (for isochronous OUT and IN endpoints, respectively).
-Finally, software should enable the configured endpoints via the [`ep_out_enable`](data/usbdev.hjson#ep_out_enable) and [`ep_in_enable`](data/usbdev.hjson#ep_in_enable) registers.
-The status stage of the Set Configuration request should not be allowed to complete until all endpoints are set up.
-
-
-## Buffers
-
-Software needs to manage the buffers in the packet buffer (2 kB SRAM).
-Each buffer can hold the maximum length packet for a Full-Speed interface (64 bytes).
-Other than for data movement, the management is most likely done based on their buffer ID which is a small integer between zero and (SRAM size in bytes)/(max packet size in bytes).
-
-In order to avoid unintentionally deferring transactions, there must be buffers available when the host sends data to the device (an OUT or SETUP transaction).
-Software needs to ensure (1) there are always buffer IDs in the Available Buffer FIFO, and (2) the Received Buffer FIFO is not full.
-For OUT transactions, if the Available Buffer FIFO is empty or the Received Buffer FIFO is full when data is received, a NAK will be returned to the host, requesting the packet be retried later.
-For SETUP transactions under the same conditions, the request will be dropped and a handshake will not be sent, indicating an error to the host and provoking a retry.
-These conditions cause the bus to be busy and perform no work, lowering performance for this device and potentially others on the same bus.
-Timely management of buffers may have a significant impact on throughput.
-
-Keeping the Available Buffer FIFO full can be done with a simple loop, adding buffer IDs from the software-managed free pool until the FIFO is full.
-A simpler policy of just adding a buffer ID to the Available Buffer FIFO whenever a buffer ID is removed from the Received Buffer FIFO should work on average, but performance will be slightly worse when bursts of packets are received.
-
-Flow control (using NAKs) may be done on a per-endpoint basis using the [`rxenable_out`](data/usbdev.hjson#rxenable_out) register.
-If this does not indicate OUT packet reception is enabled, then any OUT packet will receive a NAK to request a retry later.
-This should only be done for short durations or the host may timeout the transaction.
-
-
-## Reception
-
-The host will send OUT or SETUP transactions when it wants to transfer data to the device.
-The data packets are directed to a particular endpoint, and the maximum packet size is set per-endpoint in its Endpoint Descriptor (this must be the same or smaller than the maximum packet size supported by the device).
-A pkt_received interrupt is raised whenever there are one or more packets in the Received Buffer FIFO.
-Software should pop the information from the Received Buffer FIFO by reading the [`rxfifo`](data/usbdev.hjson#rxfifo) register, which gives (1) the buffer ID that the data was received in, (2) the data length received in bytes, (3) the endpoint to which the packet was sent, and (4) an indication if the packet was sent with an OUT or SETUP transaction.
-Note that the data length could be between zero and the maximum packet size -- in some situations a zero length packet is used as an acknowledgment or end of transfer.
-
-The data length does not include the packet CRC.
-(The CRC bytes are written to the buffer if they fit within the maximum buffer size.)
-Packets with a bad CRC will **not** be transferred to the Received Buffer FIFO; the hardware will drop the transaction without a handshake, indicating an error to the host.
-For non-isochronous endpoints, this typically results in the host retrying the transaction.
-
-
-## Transmission
-
-Data is transferred to the host based on the host requesting a transfer with an IN transaction.
-The host will only generate IN requests if the endpoint is declared as an IN endpoint in its Endpoint Descriptor (note that two descriptors are needed if the same endpoint is used for both IN and OUT transfers).
-The Endpoint Descriptor also includes a description of the frequency the endpoint should be polled (for isochronous and interrupt endpoints).
-
-Data is queued for transmission by writing the corresponding [`configin`](data/usbdev.hjson#configin) register with the buffer ID containing the data, the length in bytes of data (0 to maximum packet length) and setting the rdy bit.
-This data (with the packet CRC) will be sent as a response to the next IN transaction on the corresponding endpoint.
-When the host ACKs the data, the rdy bit is cleared, the corresponding endpoint bit is set in the [`in_sent`](data/usbdev.hjson#in_sent) register, and a pkt_sent interrupt is raised. If the host does not ACK the data, the packet will be retried.
-When the packet transmission has been noted by software, the corresponding endpoint bit should be cleared in the [`in_sent`](data/usbdev.hjson#in_sent) register (by writing a 1 to this very bit).
-
-Note that the [`configin`](data/usbdev.hjson#configin) for an endpoint is a single register, so no new data packet should be queued until the previous packet has been ACKed.
-If a SETUP transaction is received on a control endpoint that has a transmission pending, the hardware will **clear the rdy bit** and **set the pend bit** in the [`configin`](data/usbdev.hjson#configin) register of that endpoint.
-Software must remember the pending transmission and, after the Control transaction is complete, write it back to the [`configin`](data/usbdev.hjson#configin) register with the rdy bit set.
-
-
-## Stalling
-
-The [`out_stall`](data/usbdev.hjson#out_stall) and [`in_stall`](data/usbdev.hjson#in_stall) registers are used for endpoint stalling.
-There is one dedicated register per endpoint.
-Stalling is used to signal that the host should not retry a particular transmission or to signal certain error conditions (functional stall).
-Control endpoints also use a STALL to indicate unsupported requests (protocol stall).
-Unused endpoints can have their [`in_stall`](data/usbdev.hjson#in_stall) or [`out_stall`](data/usbdev.hjson#out_stall) register left clear, so in many cases there is no need to use the register.
-If the stall register is set for an enabled endpoint then the STALL response will be provided to all IN or OUT requests on that endpoint.
-
-In the case of a protocol stall, the device must send a STALL for all IN/OUT requests until the next SETUP token is received.
-To support this, software sets the [`in_stall`](data/usbdev.hjson#in_stall) and [`out_stall`](data/usbdev.hjson#out_stall) register for an endpoint when the host requests an unsupported transfer.
-The hardware will then send a STALL response to all IN/OUT transactions until the next SETUP is received for this endpoint.
-Receiving the **SETUP token clears the [`in_stall`](data/usbdev.hjson#in_stall) and [`out_stall`](data/usbdev.hjson#out_stall) registers** for that endpoint.
-If either a control endpoint's [`set_nak_out`](data/usbdev.hjson#set_nak_out) bit is set or software has cleared the [`rxenable_out`](data/usbdev.hjson#rxenable_out) bit before this transfer began, the hardware will send NAKs to any IN/OUT requests until the software has decided what action to take for the new SETUP request.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_usbdev.h)
-
-## Register Table
-
-* [Register Table](data/usbdev.hjson#registers)
-
-## Application to FPGAs
-
-### Differential Receivers
-
-For better receive sensitivity, lower transmit jitter and to be standard compliant, a dedicated, differential USB transceiver such as the [USB1T11A](https://www.mouser.com/datasheet/2/149/fairchild%20semiconductor_usb1t11a-320893.pdf) or the [USB1T20](https://www.onsemi.com/pub/Collateral/USB1T20-D.pdf) must be used (see Section 7.1.4.1 of the [USB 2.0 specification](https://www.usb.org/document-library/usb-20-specification)).
-Depending on the selected USB transceiver, either the dp/dn or d/se0 transmit paths or can be used to interface the IP block with the transceiver.
-If the selected USB transceiver contains a differential receiver, its output may also be enabled and passed to the D input of the IP block.
-
-When prototyping on FPGAs the interface can be implemented with pseudo-differential 3.3V GPIO pins for D+ and D-. The receiver will oversample to recover the bitstream and clock alignment even if there is considerable timing skew between the signal paths.
-The full speed transmit always uses LVCMOS output drivers (see USB 2.0 spec Figure 7-1 and Figure 7-3) but there are two possible encodings: Either the D+ and D- values are directly driven from tx_dp and tx_dn, or there is a data value from tx_d and an indicator to force SE0 from tx_se0.
-External to the IP, these should be combined to drive the actual pins when transmit is enabled and receive otherwise.
-Using standard 3.3V IO pads allows use on most FPGAs although the drive strength and series termination resistors may need to be adjusted to meet the USB signal eye.
-On a Xilinx Artix-7 (and less well tested Spartan-7) part, setting the driver to the 8mA, FAST setting seems to work well with a 22R series termination (and with a 0R series termination).
-
-### FPGA Board Implementation With PMOD
-
-The interface was developed using the Digilent Nexys Video board with a PMOD card attached.
-A PMOD interface with direct connection to the SoC should be used (some PMOD interfaces include 100R series resistors which break the signal requirements for USB).
-The PMOD card includes two USB micro-B connectors and allows two USB interfaces to be used.
-The D+ and D- signals have 22R series resistors (in line with the USB spec) and there is a 1.5k pullup on D+ to the pullup enable signal.
-There is a resistive divider to set the sense pin at half of the VBUS voltage which enables detection on the FPGA without overvoltage on the pin.
-
-![PMOD Schematic](./doc/dualpmod-sch.svg)
-
-The PMOD PCB is [available from OSH Park](https://oshpark.com/shared_projects/xMKhTIHn).
-
-The PMOD design files for KiCad version 5 are in the [`usbdev/pmod`](https://github.com/lowRISC/opentitan/tree/master/hw/ip/usbdev/pmod) directory.
-The BOM can be filled by parts from Digikey.
-
-| Item | Qty | Reference(s) | Value | LibPart | Footprint | Datasheet | Category | DK_Datasheet_Link | DK_Detail_Page | Description | Digi-Key_PN | Family | MPN | Manufacturer | Status|
-|------|-----|--------------|-------|---------|-----------|-----------|----------|-------------------|----------------|-------------|-------------|--------|-----|--------------|-------|
-| 1 | 2 | J1, J2 | 10118193-0001LF | dualpmod-rescue:10118193-0001LF-dk_USB-DVI-HDMI-Connectors | digikey-footprints:USB_Micro_B_Female_10118193-0001LF | http://www.amphenol-icc.com/media/wysiwyg/files/drawing/10118193.pdf | Connectors, Interconnects | http://www.amphenol-icc.com/media/wysiwyg/files/drawing/10118193.pdf | /product-detail/en/amphenol-icc-fci/10118193-0001LF/609-4616-1-ND/2785380 | CONN RCPT USB2.0 MICRO B SMD R/A | 609-4616-1-ND | USB, DVI, HDMI Connectors | 10118193-0001LF | Amphenol ICC (FCI) | Active|
-| 2 | 1 | J3 | 68021-412HLF | dualpmod-rescue:68021-412HLF-dk_Rectangular-Connectors-Headers-Male-Pins | digikey-footprints:PinHeader_6x2_P2.54mm_Horizontal | https://cdn.amphenol-icc.com/media/wysiwyg/files/drawing/68020.pdf | Connectors, Interconnects | https://cdn.amphenol-icc.com/media/wysiwyg/files/drawing/68020.pdf | /product-detail/en/amphenol-icc-fci/68021-412HLF/609-3355-ND/1878558 | CONN HEADER R/A 12POS 2.54MM | 609-3355-ND | Rectangular Connectors - Headers, Male Pins | 68021-412HLF | Amphenol ICC (FCI) | Active|
-| 3 | 4 | R1, R2, R7, R8 | 5k1 | Device:R_Small_US | Resistor_SMD:R_0805_2012Metric_Pad1.15x1.40mm_HandSolder | ~ |  |  |  |  | A126379CT-ND |  |  |  | |
-| 4 | 4 | R3, R4, R5, R6 | 22R | Device:R_Small_US | Resistor_SMD:R_0805_2012Metric_Pad1.15x1.40mm_HandSolder | ~ |  |  |  |  | A126352CT-ND |  |  |  | |
-| 5 | 2 | R9, R10 | 1k5 | Device:R_Small_US | Resistor_SMD:R_0805_2012Metric_Pad1.15x1.40mm_HandSolder | ~ |  |  |  |  | A106057CT-ND |  |  |  | |
diff --git a/hw/ip/usbdev/doc/programmers_guide.md b/hw/ip/usbdev/doc/programmers_guide.md
new file mode 100644
index 0000000000000..756e91e18c908
--- /dev/null
+++ b/hw/ip/usbdev/doc/programmers_guide.md
@@ -0,0 +1,131 @@
+# Programmer's Guide
+
+
+## Initialization
+
+The basic hardware initialization is to (in any order) configure the physical interface for the implementation via the [`phy_config`](../data/usbdev.hjson#phy_config) register, fill the Available Buffer FIFO, enable IN and OUT endpoints with ID 0 (this is the control endpoint that the host will use to configure the interface), enable reception of SETUP and OUT packets on OUT Endpoint 0, and enable any required interrupts.
+Finally, the interface is enabled by setting the enable bit in the [`usbctrl`](../data/usbdev.hjson#usbctrl) register.
+Setting this bit causes the USB device to assert the pullup on the D+ line, which is used by the host to detect the device.
+There is no need to configure the device ID in [`usbctrl.device_address`](../data/usbdev.hjson#usbctrl) at this point -- the line remains in reset and the hardware forces the device ID to zero.
+
+The second stage of initialization is done under control of the host, which will use control transfers (always beginning with SETUP transactions) to Endpoint 0.
+Initially these will be sent to device ID 0.
+When a Set Address request is received, the device ID received must be stored in the [`usbctrl.device_address`](../data/usbdev.hjson#usbctrl) register.
+Note that device 0 is used for the entire control transaction setting the new device ID, so writing the new ID to the register should not be done until the ACK for the Status stage has been received (see [USB 2.0 specification](https://www.usb.org/document-library/usb-20-specification)).
+
+The host will then issue additional control transfers to Endpoint 0 to configure the device, now to the device's configured address.
+In response to the Set Configuration request, software should set up the rest of the endpoints for that configuration, including configuring the flow control behavior for OUT endpoints via the [`set_nak_out`](../data/usbdev.hjson#set_nak_out) register, configuring the endpoint type via the [`rxenable_setup`](../data/usbdev.hjson#rxenable_setup) register (for a control endpoint) and the [`out_iso`](../data/usbdev.hjson#out_iso) and [`in_iso`](../data/usbdev.hjson#in_iso) registers (for isochronous OUT and IN endpoints, respectively).
+Finally, software should enable the configured endpoints via the [`ep_out_enable`](../data/usbdev.hjson#ep_out_enable) and [`ep_in_enable`](../data/usbdev.hjson#ep_in_enable) registers.
+The status stage of the Set Configuration request should not be allowed to complete until all endpoints are set up.
+
+
+## Buffers
+
+Software needs to manage the buffers in the packet buffer (2 kB SRAM).
+Each buffer can hold the maximum length packet for a Full-Speed interface (64 bytes).
+Other than for data movement, the management is most likely done based on the buffer ID, which is a small integer between zero and (SRAM size in bytes)/(max packet size in bytes) - 1 (i.e., 0 to 31 for the 2 kB SRAM with 64-byte buffers).
+
+In order to avoid unintentionally deferring transactions, there must be buffers available when the host sends data to the device (an OUT or SETUP transaction).
+Software needs to ensure (1) there are always buffer IDs in the Available Buffer FIFO, and (2) the Received Buffer FIFO is not full.
+For OUT transactions, if the Available Buffer FIFO is empty or the Received Buffer FIFO is full when data is received, a NAK will be returned to the host, requesting the packet be retried later.
+For SETUP transactions under the same conditions, the request will be dropped and a handshake will not be sent, indicating an error to the host and provoking a retry.
+These conditions cause the bus to be busy and perform no work, lowering performance for this device and potentially others on the same bus.
+Timely management of buffers may have a significant impact on throughput.
+
+Keeping the Available Buffer FIFO full can be done with a simple loop, adding buffer IDs from the software-managed free pool until the FIFO is full.
+A simpler policy of just adding a buffer ID to the Available Buffer FIFO whenever a buffer ID is removed from the Received Buffer FIFO should work on average, but performance will be slightly worse when bursts of packets are received.
+
+Flow control (using NAKs) may be done on a per-endpoint basis using the [`rxenable_out`](../data/usbdev.hjson#rxenable_out) register.
+If this does not indicate OUT packet reception is enabled, then any OUT packet will receive a NAK to request a retry later.
+This should only be done for short durations or the host may time out the transaction.
+
+
+## Reception
+
+The host will send OUT or SETUP transactions when it wants to transfer data to the device.
+The data packets are directed to a particular endpoint, and the maximum packet size is set per-endpoint in its Endpoint Descriptor (this must be the same or smaller than the maximum packet size supported by the device).
+A pkt_received interrupt is raised whenever there are one or more packets in the Received Buffer FIFO.
+Software should pop the information from the Received Buffer FIFO by reading the [`rxfifo`](../data/usbdev.hjson#rxfifo) register, which gives (1) the buffer ID that the data was received in, (2) the data length received in bytes, (3) the endpoint to which the packet was sent, and (4) an indication if the packet was sent with an OUT or SETUP transaction.
+Note that the data length could be between zero and the maximum packet size -- in some situations a zero length packet is used as an acknowledgment or end of transfer.
+
+The data length does not include the packet CRC.
+(The CRC bytes are written to the buffer if they fit within the maximum buffer size.)
+Packets with a bad CRC will **not** be transferred to the Received Buffer FIFO; the hardware will drop the transaction without a handshake, indicating an error to the host.
+For non-isochronous endpoints, this typically results in the host retrying the transaction.
+
+
+## Transmission
+
+Data is transferred to the host based on the host requesting a transfer with an IN transaction.
+The host will only generate IN requests if the endpoint is declared as an IN endpoint in its Endpoint Descriptor (note that two descriptors are needed if the same endpoint is used for both IN and OUT transfers).
+The Endpoint Descriptor also includes a description of the frequency the endpoint should be polled (for isochronous and interrupt endpoints).
+
+Data is queued for transmission by writing the corresponding [`configin`](../data/usbdev.hjson#configin) register with the buffer ID containing the data, the length in bytes of data (0 to maximum packet length) and setting the rdy bit.
+This data (with the packet CRC) will be sent as a response to the next IN transaction on the corresponding endpoint.
+When the host ACKs the data, the rdy bit is cleared, the corresponding endpoint bit is set in the [`in_sent`](../data/usbdev.hjson#in_sent) register, and a pkt_sent interrupt is raised. If the host does not ACK the data, the packet will be retried.
+When the packet transmission has been noted by software, the corresponding endpoint bit should be cleared in the [`in_sent`](../data/usbdev.hjson#in_sent) register (by writing a 1 to this very bit).
+
+Note that the [`configin`](../data/usbdev.hjson#configin) for an endpoint is a single register, so no new data packet should be queued until the previous packet has been ACKed.
+If a SETUP transaction is received on a control endpoint that has a transmission pending, the hardware will **clear the rdy bit** and **set the pend bit** in the [`configin`](../data/usbdev.hjson#configin) register of that endpoint.
+Software must remember the pending transmission and, after the Control transaction is complete, write it back to the [`configin`](../data/usbdev.hjson#configin) register with the rdy bit set.
+
+
+## Stalling
+
+The [`out_stall`](../data/usbdev.hjson#out_stall) and [`in_stall`](../data/usbdev.hjson#in_stall) registers are used for endpoint stalling.
+There is one dedicated register per endpoint.
+Stalling is used to signal that the host should not retry a particular transmission or to signal certain error conditions (functional stall).
+Control endpoints also use a STALL to indicate unsupported requests (protocol stall).
+Unused endpoints can have their [`in_stall`](../data/usbdev.hjson#in_stall) or [`out_stall`](../data/usbdev.hjson#out_stall) register left clear, so in many cases there is no need to use the register.
+If the stall register is set for an enabled endpoint then the STALL response will be provided to all IN or OUT requests on that endpoint.
+
+In the case of a protocol stall, the device must send a STALL for all IN/OUT requests until the next SETUP token is received.
+To support this, software sets the [`in_stall`](../data/usbdev.hjson#in_stall) and [`out_stall`](../data/usbdev.hjson#out_stall) registers for an endpoint when the host requests an unsupported transfer.
+The hardware will then send a STALL response to all IN/OUT transactions until the next SETUP is received for this endpoint.
+Receiving the **SETUP token clears the [`in_stall`](../data/usbdev.hjson#in_stall) and [`out_stall`](../data/usbdev.hjson#out_stall) registers** for that endpoint.
+If either a control endpoint's [`set_nak_out`](../data/usbdev.hjson#set_nak_out) bit is set or software has cleared the [`rxenable_out`](../data/usbdev.hjson#rxenable_out) bit before this transfer began, the hardware will send NAKs to any IN/OUT requests until the software has decided what action to take for the new SETUP request.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../sw/device/lib/dif/dif_usbdev.h)
+
+## Register Table
+
+* [Register Table](../data/usbdev.hjson#registers)
+
+## Application to FPGAs
+
+### Differential Receivers
+
+For better receive sensitivity, lower transmit jitter and to be standard compliant, a dedicated, differential USB transceiver such as the [USB1T11A](https://www.mouser.com/datasheet/2/149/fairchild%20semiconductor_usb1t11a-320893.pdf) or the [USB1T20](https://www.onsemi.com/pub/Collateral/USB1T20-D.pdf) must be used (see Section 7.1.4.1 of the [USB 2.0 specification](https://www.usb.org/document-library/usb-20-specification)).
+Depending on the selected USB transceiver, either the dp/dn or d/se0 transmit paths can be used to interface the IP block with the transceiver.
+If the selected USB transceiver contains a differential receiver, its output may also be enabled and passed to the D input of the IP block.
+
+When prototyping on FPGAs the interface can be implemented with pseudo-differential 3.3V GPIO pins for D+ and D-. The receiver will oversample to recover the bitstream and clock alignment even if there is considerable timing skew between the signal paths.
+The full speed transmit always uses LVCMOS output drivers (see USB 2.0 spec Figure 7-1 and Figure 7-3) but there are two possible encodings: Either the D+ and D- values are directly driven from tx_dp and tx_dn, or there is a data value from tx_d and an indicator to force SE0 from tx_se0.
+External to the IP, these should be combined to drive the actual pins when transmit is enabled and receive otherwise.
+Using standard 3.3V IO pads allows use on most FPGAs although the drive strength and series termination resistors may need to be adjusted to meet the USB signal eye.
+On a Xilinx Artix-7 (and less well tested Spartan-7) part, setting the driver to the 8mA, FAST setting seems to work well with a 22R series termination (and with a 0R series termination).
+
+### FPGA Board Implementation With PMOD
+
+The interface was developed using the Digilent Nexys Video board with a PMOD card attached.
+A PMOD interface with direct connection to the SoC should be used (some PMOD interfaces include 100R series resistors which break the signal requirements for USB).
+The PMOD card includes two USB micro-B connectors and allows two USB interfaces to be used.
+The D+ and D- signals have 22R series resistors (in line with the USB spec) and there is a 1.5k pullup on D+ to the pullup enable signal.
+There is a resistive divider to set the sense pin at half of the VBUS voltage which enables detection on the FPGA without overvoltage on the pin.
+
+![PMOD Schematic](../doc/dualpmod-sch.svg)
+
+The PMOD PCB is [available from OSH Park](https://oshpark.com/shared_projects/xMKhTIHn).
+
+The PMOD design files for KiCad version 5 are in the [`usbdev/pmod`](https://github.com/lowRISC/opentitan/tree/master/hw/ip/usbdev/pmod) directory.
+The BOM can be filled by parts from Digikey.
+
+| Item | Qty | Reference(s) | Value | LibPart | Footprint | Datasheet | Category | DK_Datasheet_Link | DK_Detail_Page | Description | Digi-Key_PN | Family | MPN | Manufacturer | Status|
+|------|-----|--------------|-------|---------|-----------|-----------|----------|-------------------|----------------|-------------|-------------|--------|-----|--------------|-------|
+| 1 | 2 | J1, J2 | 10118193-0001LF | dualpmod-rescue:10118193-0001LF-dk_USB-DVI-HDMI-Connectors | digikey-footprints:USB_Micro_B_Female_10118193-0001LF | http://www.amphenol-icc.com/media/wysiwyg/files/drawing/10118193.pdf | Connectors, Interconnects | http://www.amphenol-icc.com/media/wysiwyg/files/drawing/10118193.pdf | /product-detail/en/amphenol-icc-fci/10118193-0001LF/609-4616-1-ND/2785380 | CONN RCPT USB2.0 MICRO B SMD R/A | 609-4616-1-ND | USB, DVI, HDMI Connectors | 10118193-0001LF | Amphenol ICC (FCI) | Active|
+| 2 | 1 | J3 | 68021-412HLF | dualpmod-rescue:68021-412HLF-dk_Rectangular-Connectors-Headers-Male-Pins | digikey-footprints:PinHeader_6x2_P2.54mm_Horizontal | https://cdn.amphenol-icc.com/media/wysiwyg/files/drawing/68020.pdf | Connectors, Interconnects | https://cdn.amphenol-icc.com/media/wysiwyg/files/drawing/68020.pdf | /product-detail/en/amphenol-icc-fci/68021-412HLF/609-3355-ND/1878558 | CONN HEADER R/A 12POS 2.54MM | 609-3355-ND | Rectangular Connectors - Headers, Male Pins | 68021-412HLF | Amphenol ICC (FCI) | Active|
+| 3 | 4 | R1, R2, R7, R8 | 5k1 | Device:R_Small_US | Resistor_SMD:R_0805_2012Metric_Pad1.15x1.40mm_HandSolder | ~ |  |  |  |  | A126379CT-ND |  |  |  | |
+| 4 | 4 | R3, R4, R5, R6 | 22R | Device:R_Small_US | Resistor_SMD:R_0805_2012Metric_Pad1.15x1.40mm_HandSolder | ~ |  |  |  |  | A126352CT-ND |  |  |  | |
+| 5 | 2 | R9, R10 | 1k5 | Device:R_Small_US | Resistor_SMD:R_0805_2012Metric_Pad1.15x1.40mm_HandSolder | ~ |  |  |  |  | A106057CT-ND |  |  |  | |
diff --git a/hw/ip/usbdev/doc/theory_of_operation.md b/hw/ip/usbdev/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..66ae830b643f2
--- /dev/null
+++ b/hw/ip/usbdev/doc/theory_of_operation.md
@@ -0,0 +1,223 @@
+# Theory of Operation
+
+A useful quick reference for USB Full-Speed is [USB Made Simple, Part 3 - Data Flow.](http://www.usbmadesimple.co.uk/ums_3.htm)
+
+The block diagram shows a high level view of the USB device including the main register access paths.
+
+![Block Diagram](../doc/usbdev_block.svg)
+
+
+## Clocking
+
+The USB Full-Speed interface runs at a data rate of 12 Mbps.
+The interface runs at four times this frequency and must be clocked from an accurate 48 MHz clock source.
+The USB specification for a Full-Speed device requires that the average bit rate be 12 Mbps +/- 0.25%, so the clock needs to support a maximum error of 2,500 ppm.
+The maximum allowable integrated jitter is +/- 1 ns over 1 to 7 bit periods.
+
+This module features the following output signals to provide a reference for synchronizing the 48 MHz clock source:
+- `usb_ref_pulse_o` indicates the reception of a start of frame (SOF) packet.
+  The host is required to send a SOF packet every 1 ms.
+- `usb_ref_val_o` serves as a valid signal for `usb_ref_pulse_o`.
+  It is set to one after the first SOF packet is received and remains high as long as `usb_ref_pulse_o` continues to behave as expected.
+  As soon as it is detected that SOF will not be received as expected (usually because the link is no longer active), `usb_ref_val_o` deasserts to zero until after the next `usb_ref_pulse_o`.
+
+Both these signals are synchronous to the 48 MHz clock.
+They can be forced to zero by setting [`phy_config.usb_ref_disable`](../data/usbdev.hjson#phy_config) to `1`.
+
+To successfully receive SOF packets without errors and thereby enable clock synchronization, the initial accuracy of the 48 MHz clock source should be within 3.2% or 32,000 ppm.
+This requirement comes from the fact that the SOF packet has a length of 24 bits (plus 8-bit sync field).
+The first 8 bits are used to transfer the SOF packet ID (8'b01011010).
+Internally, the USB device dynamically adjusts the sampling point based on observed line transitions.
+Assuming the last bit of the SOF packet ID is sampled in the middle of the eye, the drift over the remaining 16 bits of the packet must be lower than half a bit (10^6 * (0.5/16) = 32,000 ppm).
+
+To externally monitor the 48 MHz clock, the USB device supports an oscillator test mode which can be enabled by setting [`phy_config.tx_osc_test_mode`](../data/usbdev.hjson#phy_config) to `1`.
+In this mode, the device constantly transmits a J/K pattern but no longer receives SOF packets.
+Consequently, it does not generate reference pulses for clock synchronization.
+The clock might drift off.
+
+Control transfers pass through synchronous FIFOs or have a ready bit synchronized across the clock domain boundary.
+A dual-port synchronous buffer SRAM is used for data transfers, and the bus clock and USB clock come from the same 48 MHz input.
+The wake detection module is clocked by a separate clock, and a couple of registers are used to interface with it.
+Any bus-related clock domain crossings must happen outside the core, except for the transition between the 48 MHz clock and the wake detection module's clock.
+The 48 MHz clock must be enabled to reach the registers in `usbdev`.
+
+
+## USB Interface Pins
+
+Full-Speed USB uses a bidirectional serial interface as shown in Figure 7-24 of the [USB 2.0 Full-Speed specification](https://www.usb.org/document-library/usb-20-specification).
+For reasons of flexibility, this IP block features multiple transmit and receive paths for interfacing with various transceivers.
+
+The following sections describe how the various input/output signals relate to the USB interface pins for the different receive and transmit configurations.
+
+
+### Data Transmit
+
+The IP block supports two different encodings, driving out on separate TX interfaces.
+The default encoding looks like the USB bus, with D+ and D- values driven on usb_dp_o and usb_dn_o pins.
+The alternate encoding uses usb_se0_o to indicate a single-ended zero (SE0), and usb_d_o encodes K/J (when usb_se0_o is low).
+The TX mode can be selected by setting the `use_tx_d_se0` bit in [`phy_config`](../data/usbdev.hjson#phy_config) to either 1 (alternate, using d/se0) or 0 (default, using dp/dn).
+
+The following table summarizes how the different output signals relate to the USB interface pins.
+
+|  External Pins | Internal Signals | Notes |
+|----------------|------------------|-------|
+| D+, D-         | dp_o, dn_o       | Data output with an encoding like the USB bus, intended to go directly to pads for supported targets. On an FPGA, the components should be used with a USB transceiver, as the regular bidirectional I/O cells will likely not be USB compliant. |
+| [Alt TX Data]  | se0_o            | Signal Single-Ended Zero (SE0) link state to a USB transceiver. |
+| [Alt TX Data]  | d_o              | Data output used for encoding K and J, for interfacing with a USB transceiver. |
+|   [TX Mode]    | tx_use_d_se0_o   | Indicates the selected TX interface: use dp_o and dn_o (0) or use d_o and se0_o (1). |
+
+Note that according to the [Comportable guideline for peripheral functionality](../../../../doc/contributing/hw/comportability/README.md), every output signal `name_o` has a dedicated output enable `name_en_o`.
+For TX data, these separate signals `dp_en_o` and `dn_en_o` both correspond to the same TX or output enable signal (`OE` in the USB spec).
+The other signals listed are of the "intersignal" variety, and they do not go directly to pads or have dedicated output enable signals.
+
+
+### Data Receive
+
+The IP block supports recovery of the differential K and J symbols from the output of an external differential receiver or directly from the D+/D- pair.
+The RX mode can be selected to use a differential receiver's output by setting the `use_diff_rcvr` bit in [`phy_config`](../data/usbdev.hjson#phy_config).
+The D+/D- pair is always used to detect the single-ended zero (SE0) state.
+
+The following table summarizes how the different input signals relate to the USB interface pins.
+
+|  External Pins | Internal Signals | Notes |
+|----------------|------------------|-------|
+| D+, D-         | dp_i, dn_i       | D+ and D- signals passing into the IP single-ended, intended to go directly to pads for supported targets. These signals are used to detect the SE0 link state, and if a differential receiver is not present, they are also used for K and J symbols. On an FPGA, the components should be used with a USB transceiver, as the bidirectional regular IO cells will likely not be USB compliant. |
+| [Diff Rcvr Out]| d_i              | Data input for interfacing with a differential receiver, which is required for this input. |
+
+
+### Non-Data Pins
+
+The USB device features the following non-data pins.
+
+|  External Pins | Internal Signals         | Notes |
+|----------------|--------------------------|-------|
+| sense (VBUS)   | sense_i                  | The sense pin indicates the presence of VBUS from the USB host. |
+| [pullup]       | dp_pullup_o, dn_pullup_o | When dp_pullup_o or dn_pullup_o asserts, a 1.5k pullup resistor should be connected to D+ or D-, respectively. This can be done inside the chip or with an external pin. A permanently connected resistor could be used if the pin flip feature is not needed, but this is not recommended because there is then no way to force the device to appear to unplug. Only one of the pullup signals can be asserted at any time. The selection is based on the `pinflip` bit in [`phy_config`](../data/usbdev.hjson#phy_config). Because this is a Full-Speed device, the resistor must be on the D+ pin, so when `pinflip` is zero, dp_pullup_o is used. |
+| [suspend]      | suspend_o                | The suspend pin indicates to the USB transceiver that a constant idle has been detected on the link and the device is in the Suspend state (see Section 7.1.7.6 of the [USB 2.0 specification](https://www.usb.org/document-library/usb-20-specification)). |
+| [rx_enable]    | rx_enable_o              | The rx_enable pin turns on/off a differential receiver. It is enabled via a CSR and automatically disabled when the device suspends. |
+
+The USB host will identify itself to the device by enabling the 5V VBUS power.
+It may do a hard reset of a port by removing and reasserting VBUS (the Linux driver will do this when it finds a port in an inconsistent state or a port that generates errors during enumeration).
+The IP block detects VBUS through the sense pin.
+This pin is always an input and should be externally connected to detect the state of the VBUS.
+Note that this may require a resistor divider or (for USB-C where VBUS can be up to 20V) active level translation to an acceptable voltage for the input pin.
+
+A Full-Speed device identifies itself by providing a 1.5k pullup resistor (to 3.3V) on the D+ line.
+The IP block produces a signal `dp_pullup_o` that is asserted when this resistor should be presented.
+This signal will be asserted whenever the interface is enabled and VBUS is present.
+In an FPGA implementation, this signal can drive a 3.3V output pin that is driven high when the signal is asserted and set high impedance when the signal is deasserted, and the output pin used to drive a 1.5k resistor connected on the board to the D+ line.
+Alternatively, it can be used to enable an internal 1.5k pullup on the D+ pin.
+
+This USB device supports the flipping of D+/D-.
+If the `pinflip` bit in [`phy_config`](../data/usbdev.hjson#phy_config) is set, the data pins are flipped internally, meaning the 1.5k pullup resistor needs to be on the external D- line.
+To control the pullup on the D- line, this USB device features a `dn_pullup_o` signal.
+Of the two pullup signals `dp_pullup_o` and `dn_pullup_o`, only one can be enabled at any time.
+As this is a Full-Speed device, `dp_pullup_o`, i.e., the pullup on D+ is used by default (`pinflip` equals zero).
+
+## Hardware Interfaces
+
+* [Interface Tables](../data/usbdev.hjson#interfaces)
+
+
+## USB Link State
+
+The USB link has a number of states.
+These are detected and reported in [`usbstat.link_state`](../data/usbdev.hjson#usbstat) and state changes are reported using interrupts.
+The FSM implements a subset of the USB device state diagram shown in Figure 9-1 of the [USB 2.0 specification.](https://www.usb.org/document-library/usb-20-specification)
+
+|State| Description |
+|-----|-------------|
+|Disconnected | The link is disconnected. This is signaled when the VBUS is not driven by the host, which results in the sense input pin being low, or when the user has not connected the pull-up by enabling the interface. An interrupt is raised on entering this state.|
+|Powered| The device has been powered as VBUS is being driven by the host and the user has connected the pull-up, but the device has not been reset yet. The link is reset whenever the D+ and D- are both low (an SE0 condition) for an extended period. The host will assert reset for a minimum of 10 ms, but the USB specification allows the device to detect and respond to a reset after 2.5 us. The implementation here will report the reset state and raise an interrupt when the link is in SE0 for 3 us.|
+|Powered Suspended| The link is suspended when at idle (a J condition) for more than 3 ms. An interrupt is generated when the suspend is detected and a resume interrupt is generated when the link exits the suspend state. This state is entered if the device has not been reset yet.|
+|Active No SOF| The link has been reset and can begin receiving packets, but no Start-of-Frame packets have yet been seen.|
+|Active| The link is active when it is running normally. |
+|Suspended| Similar to 'Powered Suspended', but the device was in the active state before being suspended.|
+|Resuming| The link is awaiting the end of resume signaling before transitioning to the Active No SOF state.|
+
+|Link Events| Description |
+|-----------|-------------|
+|Disconnect| VBUS has been lost. |
+|Link Reset| The link has been in the SE0 state for 3 us.|
+|Link Suspend| The link has been in the J state for more than 3 ms, upon which we have to enter the Suspend state.|
+|Link Resume| The link has been driven to a non-J state after being in Suspend. For the case of resuming to active link states, the end of resume signaling has occurred.|
+|Host Lost| Signaled using an interrupt if the link is active but a start of frame (SOF) packet has not been received from the host in 4 frames. The host is required to send a SOF packet every 1 ms. This is not an expected condition.|
+
+
+## USB Protocol Engine
+
+The USB 2.0 Full-Speed Protocol Engine is provided by the common USB interface code and is, strictly speaking, not part of this USB device module.
+
+At the lowest level of the USB stack the transmit bitstream is serialized, converted to non-return-to-zero inverted (NRZI) encoding with bit-stuffing and sent to the transmitter.
+The received bitstream is recovered, clock aligned and decoded and has bit-stuffing removed.
+The recovered clock alignment is used for transmission.
+
+The higher level protocol engine forms the bitstream into packets, performs CRC checking and recognizes IN, OUT and SETUP transactions.
+These are presented to this module without buffering.
+This means the USB device module must accept or provide data when requested.
+The protocol engine may cancel a transaction because of a bad cyclic redundancy check (CRC) or request a retry if an acknowledgment (ACK) was not received.
+
+
+## Buffer Interface
+
+A 2 kB SRAM is used as a packet buffer to hold data between the system and the USB interface.
+This is divided up into 32 buffers each containing 64 bytes.
+This is an asynchronous dual-port SRAM with software accessing from the bus clock domain and the USB interface accessing from the USB 48 MHz clock domain.
+
+
+### Reception
+
+Software provides buffers for packet reception through a 4-entry Available Buffer FIFO.
+(More study needed but four seems about right: one just returned to software, one being filled, one ready to be filled, and one for luck.)
+The [`rxenable_out`](../data/usbdev.hjson#rxenable_out) and [`rxenable_setup`](../data/usbdev.hjson#rxenable_setup) registers are used to indicate which endpoints will accept data from the host using OUT or SETUP transactions, respectively.
+When a packet is transferred from the host to the device (using an OUT or SETUP transaction) and reception of that type of transaction is enabled for the requested endpoint, the next buffer ID is pulled from the Available Buffer FIFO.
+The packet data is written to the corresponding buffer in the packet buffer (the 2 kB SRAM).
+If the packet is correctly received, an ACK is returned to the host.
+In addition, the buffer ID, the packet size, an out/setup flag and the endpoint ID are passed back to software using the Received Buffer FIFO and a pkt_received interrupt is raised.
+
+Software should immediately provide a free buffer for future reception by writing the corresponding buffer ID to the Available Buffer FIFO.
+It can then process the packet and eventually return the received buffer to the free pool.
+This allows streaming on a single endpoint or across a number of endpoints.
+If the packets cannot be consumed at the rate they are received, software can implement selective flow control by clearing [`rxenable_out`](../data/usbdev.hjson#rxenable_out) for a particular endpoint, which will result in a request to that endpoint being NAKed (negative acknowledgment).
+In the unfortunate event that the Available Buffer FIFO is empty or the Received Buffer FIFO is full, all OUT transactions are NAKed and SETUP transactions are ignored.
+In that event, the host will retry the transaction (up to some maximum attempts or time).
+
+There are two options for a given OUT endpoint's flow control, controlled by the [`set_nak_out`](../data/usbdev.hjson#set_nak_out) register.
+If `set_nak_out` is 0 for the endpoint, it will accept packets as long as there are buffers available in the Available Buffer FIFO and space available in the Received Buffer FIFO.
+For timing, this option implies that software may not be able to affect the response to a given transaction, and buffer availability is the only needed factor.
+If `set_nak_out` is 1 for the endpoint, it will clear its corresponding bit in the [`rxenable_out`](../data/usbdev.hjson#rxenable_out) register, forcing NAK responses to OUT transactions to that endpoint until software can intervene.
+That option uses NAK to defer the host, and this enables software to implement features that require protocol-level control at transaction boundaries, such as when implementing the functional stall.
+
+
+### Transmission
+
+To send data to the host in response to an IN transaction, software first writes the data into a free buffer.
+Then, it writes the buffer ID, data length and rdy flag to the [`configin`](../data/usbdev.hjson#configin) register of the corresponding endpoint.
+When the host next does an IN transaction to that endpoint, the data will be sent from the buffer.
+On receipt of the ACK from the host, the rdy bit in the [`configin`](../data/usbdev.hjson#configin) register will be cleared, and the bit corresponding to the endpoint ID will be set in the [`in_sent`](../data/usbdev.hjson#in_sent) register causing a pkt_sent interrupt to be raised.
+Software can return the buffer to the free pool and write a 1 to clear the endpoint bit in the [`in_sent`](../data/usbdev.hjson#in_sent) register.
+Note that streaming can be achieved if the next buffer has been prepared and is written to the [`configin`](../data/usbdev.hjson#configin) register when the interrupt is received.
+
+A Control transfer requires one or more IN transactions, either during the data stage or the status stage.
+Therefore, when a SETUP transaction is received for an endpoint, any buffers that are waiting to be sent out to the host from that endpoint are canceled by clearing the rdy bit in the corresponding [`configin`](../data/usbdev.hjson#configin) register.
+To keep track of such canceled buffers, the pend bit in the same register is set.
+The transfer must be queued again after the Control transfer is completed.
+
+Similarly, a Link Reset cancels any waiting IN transactions by clearing the rdy bit in the [`configin`](../data/usbdev.hjson#configin) register of all endpoints.
+The pend bit in the [`configin`](../data/usbdev.hjson#configin) register is set for all endpoints with a pending IN transaction.
+
+
+### Buffer Count and Size
+
+Under high load, the 32 buffers of the packet buffer (2 kB SRAM) are allocated as follows:
+- 1 is being processed following reception,
+- 4 are in the Available Buffer FIFO, and
+- 12 (worst case) waiting transmissions in the [`configin`](../data/usbdev.hjson#configin) registers.
+This leaves 15 buffers for preparation of future transmissions (which would need 12 in the worst case of one per endpoint) and the free pool.
+
+The size of 64 bytes per buffer satisfies the maximum USB packet size for a Full-Speed interface for Control transfers (max may be 8, 16, 32 or 64 bytes), Bulk Transfers (max is 64 bytes) and Interrupt transfers (max is 64 bytes).
+It is small for Isochronous transfers (which have a max size of 1023 bytes).
+The interface will need extending for high rate isochronous use (a possible option would be to allow up to 8 or 16 64-byte buffers to be aggregated as the isochronous buffer).
+
+
+# Design Details
diff --git a/hw/ip_templates/alert_handler/README.md b/hw/ip_templates/alert_handler/README.md
index 69e388bdda458..808fd478f91b6 100644
--- a/hw/ip_templates/alert_handler/README.md
+++ b/hw/ip_templates/alert_handler/README.md
@@ -67,4 +67,3 @@ For each class a counter of alerts is kept, clearable by software.
 If that counter exceeds a programmable maximum value, then the escalation protocol for that class begins.
 
 The details for alert signaling, classification, and escalation are all given in the Theory of Operations section.
-
diff --git a/hw/ip_templates/rv_plic/doc/theory_of_operation.md b/hw/ip_templates/rv_plic/doc/theory_of_operation.md
index aebdf43bff497..99013956b2f94 100644
--- a/hw/ip_templates/rv_plic/doc/theory_of_operation.md
+++ b/hw/ip_templates/rv_plic/doc/theory_of_operation.md
@@ -116,4 +116,3 @@ raised.  The SW handles the interrupt and it drops at e. However a new interrupt
 quickly occurs at f. As complete hasn't been signaled yet `irq_o` isn't
 asserted. At g the interrupt is completed (by writing `i` to it's
 Claim/Complete register) so at h `irq_o` is asserted due to the new interrupt.
-
diff --git a/hw/top_earlgrey/ip/sensor_ctrl/README.md b/hw/top_earlgrey/ip/sensor_ctrl/README.md
index 9f0743ab07609..25a219dd98545 100644
--- a/hw/top_earlgrey/ip/sensor_ctrl/README.md
+++ b/hw/top_earlgrey/ip/sensor_ctrl/README.md
@@ -15,56 +15,3 @@ Long term, this is a module that can be absorbed directly into the `analog senso
 - Status readback for `analog sensor top`
 - Pad debug hook up for `analog sensor top`
 - Wakeup based on alert events
-
-# Theory of Operations
-
-## Block Diagram
-
-The diagram below shows how `sensor control` helps `analog sensor top` integration into the overall design.
-
-## Recoverable and Fatal Alerts
-
-The `analog sensor top` sends alert requests in independent, differential form to the `sensor control`.
-Each alert request consists of a pair of signals, one active high and one active low.
-The active polarity of each signal is independent, due to the imprecise sensor timing that drives the alert.
-This means that the `sensor control` recognizes an active alert as long as one of the lines is active, and not the pair of signals being in a particular state.
-Each signal in the differential pair is thus a separate dedicated alert indicator.
-
-Once an alert request is detected as active, the `sensor control` formulates a proper alert event through the `prim_alert_sender` and sends a notification to the `alert handler`.
-
-The `sensor control` can optionally generate alert acknowledgements back to the `analog sensor top`.
-
-For each incoming alert, it can be programmed as fatal or recoverable through [`FATAL_ALERT_EN`](data/sensor_ctrl.hjson#fatal_alert_en).
-If set to recoverable, an alert will be registered in [`RECOV_ALERT`](data/sensor_ctrl.hjson#recov_alert) and the original `analog sensor top` event acknowledged.
-The acknowledgement prevents alerts from constantly being sent.
-
-If set to fatal, an alert will be registered in [`FATAL_ALERT`](data/sensor_ctrl.hjson#fatal_alert) but the original `analog sensor top` event will not be acknowledged.
-This causes the alert to constantly send until the system escalates in some form.
-
-## Wakeup Requests
-
-In addition to forwarding events to the `alert handler`, incoming events can also be aggregated into a wakeup request to the system.
-The `sensor control` does not make assumptions about its power domains and thus it is up to the integrating system to decide which power modes allow alert event wakeups.
-
-As an example, if the `sensor control` is not placed in an always on domain, then it cannot send alert based wakeups if the system is in a deep low power state.
-It will only be able to send wakeups when the system is powered and the `clk_aon_i` input is available.
-
-## Hardware Interfaces
-
-### Signals
-
-* [Interface Tables](data/sensor_ctrl.hjson#interfaces)
-
-# Programmer's Guide
-
-Each available alert has a corresponding fatality configuration.
-If an alert event is set to 1 in [`FATAL_ALERT_EN`](data/sensor_ctrl.hjson#fatal_alert_en), `sensor control` treats it as a fatal event instead of a recoverable event.
-Fatal events are not acknowledged, and continuously send alert events in the system until some kind of escalation is seen.
-
-## Device Interface Functions (DIFs)
-
-- [Device Interface Functions](../../../sw/device/lib/dif/dif_sensor_ctrl.h)
-
-## Register Table
-
-* [Register Table](data/sensor_ctrl.hjson#register)
diff --git a/hw/top_earlgrey/ip/sensor_ctrl/doc/programmers_guide.md b/hw/top_earlgrey/ip/sensor_ctrl/doc/programmers_guide.md
new file mode 100644
index 0000000000000..12ed441eeaa3e
--- /dev/null
+++ b/hw/top_earlgrey/ip/sensor_ctrl/doc/programmers_guide.md
@@ -0,0 +1,13 @@
+# Programmer's Guide
+
+Each available alert has a corresponding fatality configuration.
+If an alert event is set to 1 in [`FATAL_ALERT_EN`](../data/sensor_ctrl.hjson#fatal_alert_en), `sensor control` treats it as a fatal event instead of a recoverable event.
+Fatal events are not acknowledged, and continuously send alert events in the system until some kind of escalation is seen.
+
+## Device Interface Functions (DIFs)
+
+- [Device Interface Functions](../../../../../sw/device/lib/dif/dif_sensor_ctrl.h)
+
+## Register Table
+
+* [Register Table](../data/sensor_ctrl.hjson#register)
diff --git a/hw/top_earlgrey/ip/sensor_ctrl/doc/theory_of_operation.md b/hw/top_earlgrey/ip/sensor_ctrl/doc/theory_of_operation.md
new file mode 100644
index 0000000000000..0fe81865a52c1
--- /dev/null
+++ b/hw/top_earlgrey/ip/sensor_ctrl/doc/theory_of_operation.md
@@ -0,0 +1,38 @@
+# Theory of Operation
+
+## Block Diagram
+
+The diagram below shows how `sensor control` helps `analog sensor top` integration into the overall design.
+
+## Recoverable and Fatal Alerts
+
+The `analog sensor top` sends alert requests in independent, differential form to the `sensor control`.
+Each alert request consists of a pair of signals, one active high and one active low.
+The active polarity of each signal is independent, due to the imprecise sensor timing that drives the alert.
+This means that the `sensor control` recognizes an active alert as long as one of the lines is active, and not the pair of signals being in a particular state.
+Each signal in the differential pair is thus a separate dedicated alert indicator.
+
+Once an alert request is detected as active, the `sensor control` formulates a proper alert event through the `prim_alert_sender` and sends a notification to the `alert handler`.
+
+The `sensor control` can optionally generate alert acknowledgements back to the `analog sensor top`.
+
+Each incoming alert can be programmed as fatal or recoverable through [`FATAL_ALERT_EN`](../data/sensor_ctrl.hjson#fatal_alert_en).
+If set to recoverable, an alert will be registered in [`RECOV_ALERT`](../data/sensor_ctrl.hjson#recov_alert) and the original `analog sensor top` event acknowledged.
+The acknowledgement prevents alerts from constantly being sent.
+
+If set to fatal, an alert will be registered in [`FATAL_ALERT`](../data/sensor_ctrl.hjson#fatal_alert) but the original `analog sensor top` event will not be acknowledged.
+This causes the alert to be sent constantly until the system escalates in some form.
+
+## Wakeup Requests
+
+In addition to forwarding events to the `alert handler`, incoming events can also be aggregated into a wakeup request to the system.
+The `sensor control` does not make assumptions about its power domains and thus it is up to the integrating system to decide which power modes allow alert event wakeups.
+
+As an example, if the `sensor control` is not placed in an always-on domain, then it cannot send alert-based wakeups if the system is in a deep low power state.
+It will only be able to send wakeups when the system is powered and the `clk_aon_i` input is available.
+
+## Hardware Interfaces
+
+### Signals
+
+* [Interface Tables](../data/sensor_ctrl.hjson#interfaces)
diff --git a/hw/top_earlgrey/ip_autogen/alert_handler/README.md b/hw/top_earlgrey/ip_autogen/alert_handler/README.md
index 69e388bdda458..808fd478f91b6 100644
--- a/hw/top_earlgrey/ip_autogen/alert_handler/README.md
+++ b/hw/top_earlgrey/ip_autogen/alert_handler/README.md
@@ -67,4 +67,3 @@ For each class a counter of alerts is kept, clearable by software.
 If that counter exceeds a programmable maximum value, then the escalation protocol for that class begins.
 
 The details for alert signaling, classification, and escalation are all given in the Theory of Operations section.
-
diff --git a/hw/top_earlgrey/ip_autogen/rv_plic/doc/theory_of_operation.md b/hw/top_earlgrey/ip_autogen/rv_plic/doc/theory_of_operation.md
index aebdf43bff497..99013956b2f94 100644
--- a/hw/top_earlgrey/ip_autogen/rv_plic/doc/theory_of_operation.md
+++ b/hw/top_earlgrey/ip_autogen/rv_plic/doc/theory_of_operation.md
@@ -116,4 +116,3 @@ raised.  The SW handles the interrupt and it drops at e. However a new interrupt
 quickly occurs at f. As complete hasn't been signaled yet `irq_o` isn't
 asserted. At g the interrupt is completed (by writing `i` to it's
 Claim/Complete register) so at h `irq_o` is asserted due to the new interrupt.
-