From 3ec7f427e467d149a5f79500aaecd94b07185e85 Mon Sep 17 00:00:00 2001 From: Christophe Favergeon Date: Mon, 26 Feb 2024 13:50:57 +0100 Subject: [PATCH 1/5] First release of the CMSIS-DSP++ C++ extension --- Documentation/Doxygen/dsp.dxy.in | 19 +- Documentation/Doxygen/src/building.md | 29 + Documentation/Doxygen/src/code_size.md | 14 + Documentation/Doxygen/src/dsppp_main.md | 18 + Documentation/Doxygen/src/fusion.md | 39 + Documentation/Doxygen/src/guidelines.md | 1 + Documentation/Doxygen/src/introduction.md | 64 + Documentation/Doxygen/src/mainpage.md | 17 +- Documentation/Doxygen/src/matrix.md | 168 ++ Documentation/Doxygen/src/memory_allocator.md | 87 + .../Doxygen/src/memory_static_dynamic.md | 35 + Documentation/Doxygen/src/template.md | 60 + Documentation/Doxygen/src/vector.md | 112 + Documentation/Doxygen/src/vectorop.md | 100 + dsppp/.gitignore | 13 + dsppp/Examples/dot_product.cpp | 52 + dsppp/Examples/matrix_op.cpp | 109 + dsppp/Examples/vector_op.cpp | 83 + dsppp/Include/dsppp/DSP/basic.hpp | 256 +++ dsppp/Include/dsppp/DSP/matrix_multiply.hpp | 367 ++++ dsppp/Include/dsppp/DSP/memory.hpp | 98 + dsppp/Include/dsppp/DSP/num_features.hpp | 14 + dsppp/Include/dsppp/DSP/q15.hpp | 238 +++ dsppp/Include/dsppp/DSP/q7.hpp | 264 +++ dsppp/Include/dsppp/Helium/basic.hpp | 223 ++ dsppp/Include/dsppp/Helium/float.hpp | 426 ++++ dsppp/Include/dsppp/Helium/half.hpp | 520 +++++ .../Include/dsppp/Helium/matrix_multiply.hpp | 335 +++ .../dsppp/Helium/matrix_multiply_f16.hpp | 404 ++++ .../dsppp/Helium/matrix_multiply_f32.hpp | 270 +++ .../dsppp/Helium/matrix_multiply_fixed.hpp | 613 ++++++ dsppp/Include/dsppp/Helium/num_features.hpp | 17 + dsppp/Include/dsppp/Helium/q15.hpp | 461 ++++ dsppp/Include/dsppp/Helium/q31.hpp | 345 +++ dsppp/Include/dsppp/Helium/q7.hpp | 463 ++++ dsppp/Include/dsppp/Neon/basic.hpp | 133 ++ dsppp/Include/dsppp/Neon/float.hpp | 105 + dsppp/Include/dsppp/Neon/num_features.hpp | 5 + dsppp/Include/dsppp/Scalar/basic.hpp | 189 ++ .../Include/dsppp/Scalar/matrix_multiply.hpp | 134 ++ .../dsppp/Scalar/matrix_multiply_fixed.hpp | 124 ++ .../dsppp/Scalar/matrix_multiply_float.hpp | 119 ++ dsppp/Include/dsppp/algorithms.hpp | 269 +++ dsppp/Include/dsppp/arch.hpp | 64 + dsppp/Include/dsppp/arch_detection.hpp | 281 +++ dsppp/Include/dsppp/common.hpp | 73 + dsppp/Include/dsppp/fixed_point.hpp | 1036 +++++++++ dsppp/Include/dsppp/forward.hpp | 149 ++ dsppp/Include/dsppp/fusion.hpp | 760 +++++++ dsppp/Include/dsppp/fusion_ops.hpp | 358 ++++ dsppp/Include/dsppp/matrix.hpp | 647 ++++++ dsppp/Include/dsppp/matrix_impl.hpp | 612 ++++++ dsppp/Include/dsppp/matrix_view.hpp | 751 +++++++ dsppp/Include/dsppp/memory_pool.hpp | 259 +++ dsppp/Include/dsppp/num_features/double.hpp | 63 + dsppp/Include/dsppp/num_features/float.hpp | 77 + dsppp/Include/dsppp/num_features/group.hpp | 171 ++ dsppp/Include/dsppp/num_features/half.hpp | 76 + dsppp/Include/dsppp/num_features/q15.hpp | 66 + dsppp/Include/dsppp/num_features/q31.hpp | 65 + dsppp/Include/dsppp/num_features/q7.hpp | 57 + dsppp/Include/dsppp/number.hpp | 190 ++ dsppp/Include/dsppp/unroll.hpp | 247 +++ dsppp/Include/dsppp/vec.hpp | 442 ++++ dsppp/Include/dsppp/vector_impl.hpp | 576 +++++ dsppp/Include/dsppp/vector_view.hpp | 449 ++++ dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct | 80 + .../ARMCM0P/ARMCM0plus_ac6.sct.base@1.0.0 | 80 + dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld | 263 +++ .../ARMCM0P/ARMCM0plus_gcc.ld.base@2.2.0 | 263 +++ .../RTE/Device/ARMCM0P/ac6_linker_script.sct | 111 + .../RTE/Device/ARMCM0P/clang_linker_script.ld | 353 
++++ dsppp/RTE/Device/ARMCM0P/gcc_linker_script.ld | 294 +++ dsppp/RTE/Device/ARMCM0P/regions_ARMCM0P.h | 60 + dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c | 146 ++ .../ARMCM0P/startup_ARMCM0plus.c.base@3.0.0 | 146 ++ dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c | 69 + .../ARMCM0P/system_ARMCM0plus.c.base@2.0.0 | 69 + dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct | 80 + .../Device/ARMCM4/ARMCM4_ac6.sct.base@1.0.0 | 80 + dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld | 263 +++ .../Device/ARMCM4/ARMCM4_gcc.ld.base@2.2.0 | 263 +++ .../RTE/Device/ARMCM4/clang_linker_script.ld | 353 ++++ dsppp/RTE/Device/ARMCM4/regions_ARMCM4.h | 60 + dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c | 150 ++ .../Device/ARMCM4/startup_ARMCM4.c.base@3.0.0 | 150 ++ dsppp/RTE/Device/ARMCM4/system_ARMCM4.c | 79 + .../Device/ARMCM4/system_ARMCM4.c.base@2.0.0 | 79 + dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h | 84 + .../SSE-300-MPS3/RTE_Device.h.base@1.1.0 | 84 + .../Device/SSE-300-MPS3/cmsis_driver_config.h | 25 + .../cmsis_driver_config.h.base@1.1.1 | 25 + dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h | 149 ++ .../SSE-300-MPS3/device_cfg.h.base@1.1.3 | 149 ++ .../SSE-300-MPS3/linker_SSE300MPS3_secure.ld | 242 +++ .../linker_SSE300MPS3_secure.ld.base@1.0.0 | 242 +++ .../SSE-300-MPS3/linker_SSE300MPS3_secure.sct | 62 + .../linker_SSE300MPS3_secure.sct.base@1.1.0 | 62 + dsppp/RTE/Device/SSE-300-MPS3/region_defs.h | 44 + .../SSE-300-MPS3/region_defs.h.base@1.0.0 | 44 + dsppp/RTE/Device/SSE-300-MPS3/region_limits.h | 45 + .../SSE-300-MPS3/region_limits.h.base@1.0.0 | 45 + .../Device/SSE-300-MPS3/startup_SSE300MPS3.c | 375 ++++ .../startup_SSE300MPS3.c.base@1.1.1 | 375 ++++ .../Device/SSE-300-MPS3/system_SSE300MPS3.c | 93 + .../system_SSE300MPS3.c.base@1.1.1 | 93 + .../Device/SSE_300_MPS3/ac6_linker_script.sct | 111 + .../SSE_300_MPS3/clang_linker_script.ld | 353 ++++ .../Device/SSE_300_MPS3/gcc_linker_script.ld | 294 +++ .../regions_V2M_MPS3_SSE_300_FVP.h | 400 ++++ dsppp/RTE/_Release_IPSS_M0P/RTE_Components.h | 20 + dsppp/RTE/_Release_IPSS_M4/RTE_Components.h | 20 + .../RTE_Components.h | 23 + .../RTE_Components.h | 25 + .../RTE_Components.h | 20 + dsppp/RTE/_Release_VHT-M0P/RTE_Components.h | 20 + dsppp/RTE/_Release_VHT-M4/RTE_Components.h | 20 + dsppp/RTE/_Release_VHT_M0P/RTE_Components.h | 20 + dsppp/RTE/_Release_VHT_M4/RTE_Components.h | 20 + dsppp/allocator.cpp | 98 + dsppp/allocator.h | 124 ++ dsppp/cdefault.yml | 142 ++ dsppp/clang_sse300.c | 65 + dsppp/example.cproject.yml | 120 ++ dsppp/fvp_configs/VHT-Corstone-300.txt | 9 + dsppp/fvp_configs/VHT-M0P.txt | 3 + dsppp/fvp_configs/VHT-M4.txt | 3 + dsppp/getserial.py | 28 + dsppp/linker_scripts/ARMCM0P/region_defs.h | 60 + dsppp/linker_scripts/ARMCM4/region_defs.h | 60 + .../linker_scripts/SSE-300-MPS3/region_defs.h | 44 + .../SSE-300-MPS3/region_limits.h | 45 + dsppp/linker_scripts/ac6_m0p_mps3_s.sct | 111 + dsppp/linker_scripts/ac6_m4_mps3_s.sct | 111 + dsppp/linker_scripts/ac6_sse300_mps3_s.sct | 79 + dsppp/linker_scripts/ac6_sse310_mps3_s.sct | 60 + dsppp/linker_scripts/clang_m0p_mps3.ld | 353 ++++ dsppp/linker_scripts/clang_m4_mps3.ld | 353 ++++ dsppp/linker_scripts/clang_sse300_mps3.sct | 364 ++++ dsppp/linker_scripts/clang_sse310_mps3.sct | 363 ++++ dsppp/linker_scripts/gcc_m0p_mps3.ld | 294 +++ dsppp/linker_scripts/gcc_m4_mps3.ld | 294 +++ dsppp/linker_scripts/gcc_sse300_mps3.ld | 295 +++ dsppp/linker_scripts/gcc_sse310_mps3_s.ld | 295 +++ dsppp/main.c | 93 + dsppp/mps3run.py | 78 + dsppp/process.py | 137 ++ dsppp/run_all.py | 390 ++++ dsppp/test.cbuild-pack.yml | 17 + 
dsppp/test.cproject.yml | 146 ++ dsppp/test.csolution.yml | 108 + dsppp/test_config.h | 13 + dsppp/tests/bench.c | 3 + dsppp/tests/bench.h | 60 + dsppp/tests/cmsis_tests.h | 699 +++++++ dsppp/tests/cmsisdsp.cpp | 1146 ++++++++++ dsppp/tests/col_test.cpp | 112 + dsppp/tests/common_tests.cpp | 48 + dsppp/tests/common_tests.h | 282 +++ dsppp/tests/debug_mat.h | 738 +++++++ dsppp/tests/debug_test.cpp | 45 + dsppp/tests/debug_test_external.cpp | 56 + dsppp/tests/dot_test.cpp | 213 ++ dsppp/tests/filter_test.cpp | 657 ++++++ dsppp/tests/fusion_test.cpp | 247 +++ dsppp/tests/matrix_test.cpp | 1863 +++++++++++++++++ dsppp/tests/matrix_utils.h | 640 ++++++ dsppp/tests/row_test.cpp | 204 ++ dsppp/tests/test.h | 19 + dsppp/tests/vector_test.cpp | 219 ++ 170 files changed, 33958 insertions(+), 4 deletions(-) create mode 100644 Documentation/Doxygen/src/building.md create mode 100644 Documentation/Doxygen/src/code_size.md create mode 100644 Documentation/Doxygen/src/dsppp_main.md create mode 100644 Documentation/Doxygen/src/fusion.md create mode 100644 Documentation/Doxygen/src/guidelines.md create mode 100644 Documentation/Doxygen/src/introduction.md create mode 100644 Documentation/Doxygen/src/matrix.md create mode 100644 Documentation/Doxygen/src/memory_allocator.md create mode 100644 Documentation/Doxygen/src/memory_static_dynamic.md create mode 100644 Documentation/Doxygen/src/template.md create mode 100644 Documentation/Doxygen/src/vector.md create mode 100644 Documentation/Doxygen/src/vectorop.md create mode 100644 dsppp/.gitignore create mode 100644 dsppp/Examples/dot_product.cpp create mode 100644 dsppp/Examples/matrix_op.cpp create mode 100644 dsppp/Examples/vector_op.cpp create mode 100644 dsppp/Include/dsppp/DSP/basic.hpp create mode 100644 dsppp/Include/dsppp/DSP/matrix_multiply.hpp create mode 100644 dsppp/Include/dsppp/DSP/memory.hpp create mode 100644 dsppp/Include/dsppp/DSP/num_features.hpp create mode 100644 dsppp/Include/dsppp/DSP/q15.hpp create mode 100644 dsppp/Include/dsppp/DSP/q7.hpp create mode 100644 dsppp/Include/dsppp/Helium/basic.hpp create mode 100644 dsppp/Include/dsppp/Helium/float.hpp create mode 100644 dsppp/Include/dsppp/Helium/half.hpp create mode 100644 dsppp/Include/dsppp/Helium/matrix_multiply.hpp create mode 100644 dsppp/Include/dsppp/Helium/matrix_multiply_f16.hpp create mode 100644 dsppp/Include/dsppp/Helium/matrix_multiply_f32.hpp create mode 100644 dsppp/Include/dsppp/Helium/matrix_multiply_fixed.hpp create mode 100644 dsppp/Include/dsppp/Helium/num_features.hpp create mode 100644 dsppp/Include/dsppp/Helium/q15.hpp create mode 100644 dsppp/Include/dsppp/Helium/q31.hpp create mode 100644 dsppp/Include/dsppp/Helium/q7.hpp create mode 100644 dsppp/Include/dsppp/Neon/basic.hpp create mode 100644 dsppp/Include/dsppp/Neon/float.hpp create mode 100644 dsppp/Include/dsppp/Neon/num_features.hpp create mode 100644 dsppp/Include/dsppp/Scalar/basic.hpp create mode 100644 dsppp/Include/dsppp/Scalar/matrix_multiply.hpp create mode 100644 dsppp/Include/dsppp/Scalar/matrix_multiply_fixed.hpp create mode 100644 dsppp/Include/dsppp/Scalar/matrix_multiply_float.hpp create mode 100644 dsppp/Include/dsppp/algorithms.hpp create mode 100644 dsppp/Include/dsppp/arch.hpp create mode 100644 dsppp/Include/dsppp/arch_detection.hpp create mode 100644 dsppp/Include/dsppp/common.hpp create mode 100644 dsppp/Include/dsppp/fixed_point.hpp create mode 100644 dsppp/Include/dsppp/forward.hpp create mode 100644 dsppp/Include/dsppp/fusion.hpp create mode 100644 dsppp/Include/dsppp/fusion_ops.hpp 
create mode 100644 dsppp/Include/dsppp/matrix.hpp create mode 100644 dsppp/Include/dsppp/matrix_impl.hpp create mode 100644 dsppp/Include/dsppp/matrix_view.hpp create mode 100644 dsppp/Include/dsppp/memory_pool.hpp create mode 100644 dsppp/Include/dsppp/num_features/double.hpp create mode 100644 dsppp/Include/dsppp/num_features/float.hpp create mode 100644 dsppp/Include/dsppp/num_features/group.hpp create mode 100644 dsppp/Include/dsppp/num_features/half.hpp create mode 100644 dsppp/Include/dsppp/num_features/q15.hpp create mode 100644 dsppp/Include/dsppp/num_features/q31.hpp create mode 100644 dsppp/Include/dsppp/num_features/q7.hpp create mode 100644 dsppp/Include/dsppp/number.hpp create mode 100644 dsppp/Include/dsppp/unroll.hpp create mode 100644 dsppp/Include/dsppp/vec.hpp create mode 100644 dsppp/Include/dsppp/vector_impl.hpp create mode 100644 dsppp/Include/dsppp/vector_view.hpp create mode 100644 dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct create mode 100644 dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct.base@1.0.0 create mode 100644 dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld create mode 100644 dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld.base@2.2.0 create mode 100644 dsppp/RTE/Device/ARMCM0P/ac6_linker_script.sct create mode 100644 dsppp/RTE/Device/ARMCM0P/clang_linker_script.ld create mode 100644 dsppp/RTE/Device/ARMCM0P/gcc_linker_script.ld create mode 100644 dsppp/RTE/Device/ARMCM0P/regions_ARMCM0P.h create mode 100644 dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c create mode 100644 dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c.base@3.0.0 create mode 100644 dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c create mode 100644 dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c.base@2.0.0 create mode 100644 dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct create mode 100644 dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct.base@1.0.0 create mode 100644 dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld create mode 100644 dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld.base@2.2.0 create mode 100644 dsppp/RTE/Device/ARMCM4/clang_linker_script.ld create mode 100644 dsppp/RTE/Device/ARMCM4/regions_ARMCM4.h create mode 100644 dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c create mode 100644 dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c.base@3.0.0 create mode 100644 dsppp/RTE/Device/ARMCM4/system_ARMCM4.c create mode 100644 dsppp/RTE/Device/ARMCM4/system_ARMCM4.c.base@2.0.0 create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h.base@1.1.0 create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/cmsis_driver_config.h create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/cmsis_driver_config.h.base@1.1.1 create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h.base@1.1.3 create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.ld create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.ld.base@1.0.0 create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.sct create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.sct.base@1.1.0 create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/region_defs.h create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/region_defs.h.base@1.0.0 create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/region_limits.h create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/region_limits.h.base@1.0.0 create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/startup_SSE300MPS3.c create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/startup_SSE300MPS3.c.base@1.1.1 create mode 100644 
dsppp/RTE/Device/SSE-300-MPS3/system_SSE300MPS3.c create mode 100644 dsppp/RTE/Device/SSE-300-MPS3/system_SSE300MPS3.c.base@1.1.1 create mode 100644 dsppp/RTE/Device/SSE_300_MPS3/ac6_linker_script.sct create mode 100644 dsppp/RTE/Device/SSE_300_MPS3/clang_linker_script.ld create mode 100644 dsppp/RTE/Device/SSE_300_MPS3/gcc_linker_script.ld create mode 100644 dsppp/RTE/Device/SSE_300_MPS3/regions_V2M_MPS3_SSE_300_FVP.h create mode 100644 dsppp/RTE/_Release_IPSS_M0P/RTE_Components.h create mode 100644 dsppp/RTE/_Release_IPSS_M4/RTE_Components.h create mode 100644 dsppp/RTE/_Release_LLVM-Corstone-300/RTE_Components.h create mode 100644 dsppp/RTE/_Release_MPS3-Corstone-300/RTE_Components.h create mode 100644 dsppp/RTE/_Release_VHT-Corstone-300/RTE_Components.h create mode 100644 dsppp/RTE/_Release_VHT-M0P/RTE_Components.h create mode 100644 dsppp/RTE/_Release_VHT-M4/RTE_Components.h create mode 100644 dsppp/RTE/_Release_VHT_M0P/RTE_Components.h create mode 100644 dsppp/RTE/_Release_VHT_M4/RTE_Components.h create mode 100644 dsppp/allocator.cpp create mode 100644 dsppp/allocator.h create mode 100644 dsppp/cdefault.yml create mode 100644 dsppp/clang_sse300.c create mode 100644 dsppp/example.cproject.yml create mode 100644 dsppp/fvp_configs/VHT-Corstone-300.txt create mode 100644 dsppp/fvp_configs/VHT-M0P.txt create mode 100644 dsppp/fvp_configs/VHT-M4.txt create mode 100644 dsppp/getserial.py create mode 100644 dsppp/linker_scripts/ARMCM0P/region_defs.h create mode 100644 dsppp/linker_scripts/ARMCM4/region_defs.h create mode 100644 dsppp/linker_scripts/SSE-300-MPS3/region_defs.h create mode 100644 dsppp/linker_scripts/SSE-300-MPS3/region_limits.h create mode 100644 dsppp/linker_scripts/ac6_m0p_mps3_s.sct create mode 100644 dsppp/linker_scripts/ac6_m4_mps3_s.sct create mode 100644 dsppp/linker_scripts/ac6_sse300_mps3_s.sct create mode 100644 dsppp/linker_scripts/ac6_sse310_mps3_s.sct create mode 100644 dsppp/linker_scripts/clang_m0p_mps3.ld create mode 100644 dsppp/linker_scripts/clang_m4_mps3.ld create mode 100644 dsppp/linker_scripts/clang_sse300_mps3.sct create mode 100644 dsppp/linker_scripts/clang_sse310_mps3.sct create mode 100644 dsppp/linker_scripts/gcc_m0p_mps3.ld create mode 100644 dsppp/linker_scripts/gcc_m4_mps3.ld create mode 100644 dsppp/linker_scripts/gcc_sse300_mps3.ld create mode 100644 dsppp/linker_scripts/gcc_sse310_mps3_s.ld create mode 100644 dsppp/main.c create mode 100644 dsppp/mps3run.py create mode 100644 dsppp/process.py create mode 100644 dsppp/run_all.py create mode 100644 dsppp/test.cbuild-pack.yml create mode 100644 dsppp/test.cproject.yml create mode 100644 dsppp/test.csolution.yml create mode 100644 dsppp/test_config.h create mode 100644 dsppp/tests/bench.c create mode 100644 dsppp/tests/bench.h create mode 100644 dsppp/tests/cmsis_tests.h create mode 100644 dsppp/tests/cmsisdsp.cpp create mode 100644 dsppp/tests/col_test.cpp create mode 100644 dsppp/tests/common_tests.cpp create mode 100644 dsppp/tests/common_tests.h create mode 100644 dsppp/tests/debug_mat.h create mode 100644 dsppp/tests/debug_test.cpp create mode 100644 dsppp/tests/debug_test_external.cpp create mode 100644 dsppp/tests/dot_test.cpp create mode 100644 dsppp/tests/filter_test.cpp create mode 100644 dsppp/tests/fusion_test.cpp create mode 100644 dsppp/tests/matrix_test.cpp create mode 100644 dsppp/tests/matrix_utils.h create mode 100644 dsppp/tests/row_test.cpp create mode 100644 dsppp/tests/test.h create mode 100644 dsppp/tests/vector_test.cpp diff --git a/Documentation/Doxygen/dsp.dxy.in 
b/Documentation/Doxygen/dsp.dxy.in
index 73c175731..f514addf5 100644
--- a/Documentation/Doxygen/dsp.dxy.in
+++ b/Documentation/Doxygen/dsp.dxy.in
@@ -573,14 +573,14 @@ HIDE_UNDOC_MEMBERS = YES
 # if EXTRACT_ALL is enabled.
 # The default value is: NO.
-HIDE_UNDOC_CLASSES = NO
+HIDE_UNDOC_CLASSES = YES
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
 # declarations. If set to NO, these declarations will be included in the
 # documentation.
 # The default value is: NO.
-HIDE_FRIEND_COMPOUNDS = NO
+HIDE_FRIEND_COMPOUNDS = YES
 # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
 # documentation blocks found inside the body of a function. If set to NO, these
@@ -919,11 +919,24 @@ WARN_LOGFILE =
 # Note: If this tag is empty the current directory is searched.
 INPUT = ./src/mainpage.md \
+ ./src/dsppp_main.md \
+ ./src/introduction.md \
+ ./src/template.md \
+ ./src/guidelines.md \
+ ./src/vectorop.md \
+ ./src/memory_allocator.md \
+ ./src/memory_static_dynamic.md \
+ ./src/code_size.md \
+ ./src/fusion.md \
+ ./src/vector.md \
+ ./src/matrix.md \
+ ./src/building.md \
 ./src/history.md \
 ./src/history.txt \
 ../../Examples/ARM \
 ../../Include/ \
- ../../Source/ \
+ ../../Source/ \
+ ../../dsppp/Include
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/Documentation/Doxygen/src/building.md b/Documentation/Doxygen/src/building.md
new file mode 100644
index 000000000..0cdf6b41b
--- /dev/null
+++ b/Documentation/Doxygen/src/building.md
@@ -0,0 +1,29 @@
+# Building and running examples {#dsppp_building}
+
+## To build
+
+First time:
+
+```shell
+cbuild -O cprj test.csolution.yml --toolchain AC6 -c example.Release+VHT-Corstone-300 -p -r --update-rte
+
+```
+
+Other times:
+
+```shell
+cbuild -O cprj test.csolution.yml --toolchain AC6 -c example.Release+VHT-Corstone-300
+```
+
+If you want to select another test, edit the file `example.cproject.yml` and uncomment the test.
+
+## To run
+
+If the tools have been installed with `vcpkg`:
+
+```
+FVP_Corstone_SSE-300_Ethos-U55.exe -f fvp_configs/VHT-Corstone-300.txt -a cpu0=cprj\out\example\VHT-Corstone-300\Release\example.axf
+```
+
+Otherwise, you'll need to use the path to your FVP installation.
+
diff --git a/Documentation/Doxygen/src/code_size.md b/Documentation/Doxygen/src/code_size.md
new file mode 100644
index 000000000..f06cf696b
--- /dev/null
+++ b/Documentation/Doxygen/src/code_size.md
@@ -0,0 +1,14 @@
+# Code size {#dsppp_code_size}
+
+It was explained in previous sections that the types `Vector<T,NB1>` and `Vector<T,NB2>` are considered as different types if `NB1` and `NB2` are different.
+
+A template algorithm is like a code generator that will generate different code for different values of the template arguments : the types.
+
+If you use a template algorithm with two different vector datatypes, it will generate different code for those two datatypes. The generated code will be specialized for the specific datatypes used and thus is likely to be more efficient.
+
+But it also means you get several implementations, and so more code size.
+
+If you have a lot of different sizes in your system, then you're likely to get too much code size and in that case it may be better to use dynamic objects instead of static ones.
+
+Dynamic objects are less efficient, so it is a trade-off between code size and speed.
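+
+As a minimal illustration of this trade-off (a sketch only; the `clear_first` helper and the header name are assumptions, not part of the library), two different static lengths are two different types, so a templated routine is instantiated once per static length, whereas the dynamic variant is compiled only once:
+
+```cpp
+#include <dsppp/matrix.hpp>   // assumed header name from the Include/dsppp folder added by this patch
+
+using namespace arm_cmsis_dsp;
+
+template<typename V>
+void clear_first(V &v)
+{
+    v[0] = 0.0f;   // same source, but one copy of the generated code per distinct type V
+}
+
+void code_size_example()
+{
+    Vector<float32_t, 16> a16;       // static length 16: its own type
+    Vector<float32_t, 32> a32;       // static length 32: another type, more generated code
+    Vector<float32_t>     ad(64);    // dynamic length: one shared implementation
+
+    clear_first(a16);                // instantiation #1
+    clear_first(a32);                // instantiation #2
+    clear_first(ad);                 // instantiation #3, shared by every dynamic length
+}
+```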
+
diff --git a/Documentation/Doxygen/src/dsppp_main.md b/Documentation/Doxygen/src/dsppp_main.md
new file mode 100644
index 000000000..df5325dbc
--- /dev/null
+++ b/Documentation/Doxygen/src/dsppp_main.md
@@ -0,0 +1,18 @@
+# DSP++ extension {#dsppp_main}
+
+C++ extensions to CMSIS-DSP using C++ template meta-programming (headers only).
+
+The headers are not yet part of the CMSIS-DSP pack since they are experimental. You can get them from the [CMSIS-DSP GitHub](https://github.com/ARM-software/CMSIS-DSP/dsppp/Include). There is nothing to build. Just include the headers when you want to use this framework.
+
+* @subpage dsppp_intro "Introduction"
+* @subpage dsppp_template "C++ template for C programmer"
+* @subpage dsppp_vector_example "Vector operation example"
+* @subpage dsppp_memory_allocator "Memory allocation"
+* @subpage dsppp_memory_static_dynamic "Static / Dynamic objects"
+* @subpage dsppp_code_size "Code size"
+* @subpage dsppp_fusion "Fusion mechanism"
+* @subpage dsppp_vector "Vector operators"
+* @subpage dsppp_matrix "Matrix operators"
+* @subpage dsppp_building "Building and running examples"
+* @subpage dsppp_guidelines "Usage guidelines"
+
diff --git a/Documentation/Doxygen/src/fusion.md b/Documentation/Doxygen/src/fusion.md
new file mode 100644
index 000000000..cec2f4f89
--- /dev/null
+++ b/Documentation/Doxygen/src/fusion.md
@@ -0,0 +1,39 @@
+# Fusion {#dsppp_fusion}
+
+```cpp
+Vector<float32_t,NB> d = a + b * c;
+```
+
+With this line of code, there is loop fusion : instead of having one loop per operator there is one loop for the whole computation.
+
+It is important to have some idea of how it works to avoid some mistakes in the use of the library.
+
+In the above code, `a + b * c` is not computing anything!
+`a + b * c` is creating a representation of the expression : an abstract syntax tree (AST) at build time.
+
+When this AST is assigned to the variable `d`, it is evaluated.
+The evaluation forces the inlining of the expression operators in one loop. The generated code thus contains only one loop with a fusion of all the operators : `+` and `*`.
+
+The library supports virtual vectors. They are a view on an existing part of a vector. You can use a virtual vector, for instance, to read or write some samples with a stride. A virtual vector does not own its memory.
+
+If you write:
+```cpp
+d = a;
+```
+
+and `d` and `a` are virtual vectors, then nothing will be written to `d`!
+
+`d` will become `a` and `a` will no longer be valid.
+
+If you want to copy a virtual vector you need to make an expression and write:
+
+```cpp
+d = copy(a);
+```
+
+Note that this problem occurs only for virtual vectors that do not own their memory.
+
+For real vectors, a copy would occur. But since there is no overhead in adding `copy`, it is better to do it to avoid problems.
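+
+The following stand-alone sketch (illustration only, not the library's actual classes) shows the principle that makes this fusion possible: the operator builds a lightweight expression object, and the single loop only runs when the expression is assigned to a destination:
+
+```cpp
+#include <cstddef>
+
+// Expression representing "lhs + rhs"; nothing is computed when it is built.
+template<typename LHS, typename RHS>
+struct AddExpr {
+    const LHS &lhs;
+    const RHS &rhs;
+    // Evaluating element i evaluates both operands at i: no temporary vector.
+    float operator[](std::size_t i) const { return lhs[i] + rhs[i]; }
+};
+
+struct Vec {
+    float data[8];
+    float  operator[](std::size_t i) const { return data[i]; }
+    float &operator[](std::size_t i)       { return data[i]; }
+
+    // Assignment from any expression: the single loop where the work happens.
+    template<typename E>
+    Vec &operator=(const E &e)
+    {
+        for (std::size_t i = 0; i < 8; i++) { data[i] = e[i]; }
+        return *this;
+    }
+};
+
+// Building the expression does not compute anything yet.
+inline AddExpr<Vec, Vec> operator+(const Vec &a, const Vec &b) { return {a, b}; }
+```
+
+With such a scheme, `d = a + b;` first builds an `AddExpr<Vec,Vec>` and the loop only runs inside `operator=`. The library generalizes this idea to all its operators, virtual vectors and matrixes.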
+
+
+
diff --git a/Documentation/Doxygen/src/guidelines.md b/Documentation/Doxygen/src/guidelines.md
new file mode 100644
index 000000000..bd8163784
--- /dev/null
+++ b/Documentation/Doxygen/src/guidelines.md
@@ -0,0 +1 @@
+# Guidelines {#dsppp_guidelines}
diff --git a/Documentation/Doxygen/src/introduction.md b/Documentation/Doxygen/src/introduction.md
new file mode 100644
index 000000000..4814dab6b
--- /dev/null
+++ b/Documentation/Doxygen/src/introduction.md
@@ -0,0 +1,64 @@
+## Introduction {#dsppp_intro}
+
+### Dot product example
+
+If you want to compute the dot product:
+
+\f[
+
+<scale*(\overrightarrow{a}+\overrightarrow{b}),\overrightarrow{c}*\overrightarrow{d}>
+
+\f]
+
+with CMSIS-DSP, you would write:
+
+```c
+arm_add_f32(a,b,tmp1,NB);
+arm_scale_f32(tmp1,scale,tmp2,NB);
+arm_mult_f32(c,d,tmp3,NB);
+arm_dot_prod_f32(tmp2,tmp3,NB,&r);
+```
+
+There are several limitations with this way of writing the code:
+
+1. The code needs to be rewritten and the `_f32` suffix changed if the developer wants to use another datatype
+
+2. Temporary buffers need to be allocated and managed (`tmp1`,`tmp2`,`tmp3`)
+
+3. The four function calls are four different loops. It is not good for data locality and caches. The computation is not done in one pass
+
+4. Each loop contains a small number of instructions. For instance, for the `arm_add_f32`, two loads, an add instruction and a store. It is not enough to enable the compiler to reorder the instructions to improve the performance
+
+With this new C++ template library, you can write:
+
+
+```cpp
+r = dot(scale*(a+b),c*d);
+```
+
+The code generated by this line computes the dot product in one pass with all the operators (`+`, `*`) included in the loop.
+There are no longer any temporary buffers.
+
+### Vector operations
+
+Let's look at another example:
+
+\f[
+
+\overrightarrow{d} = \overrightarrow{a} + \overrightarrow{b} * \overrightarrow{c}
+
+\f]
+
+With the C++ library, it can be written as:
+
+
+```cpp
+Vector<float32_t,NB> d = a + b * c;
+```
+
+Here again, all the vector operations (`+`,`*`) are done in one pass with one loop. There is no longer any temporary buffer.
+
+If you're coming from C and do not know anything about C++ templates, we have a very quick introduction: @ref dsppp_template "The minimum you need to know about C++ template to use this library".
+
+You can also jump directly to an @ref dsppp_vector_example "example with vector operations".
+
diff --git a/Documentation/Doxygen/src/mainpage.md b/Documentation/Doxygen/src/mainpage.md
index f33a65b9e..d081f0a3b 100644
--- a/Documentation/Doxygen/src/mainpage.md
+++ b/Documentation/Doxygen/src/mainpage.md
@@ -1,5 +1,7 @@
 # Overview {#mainpage}
 
+## Introduction
+
 This user manual describes the CMSIS DSP software library, a suite of common compute processing functions for use on Cortex-M and Cortex-A processor based devices.
 
 The library is divided into a number of functions each covering a specific category:
 
@@ -26,9 +28,21 @@ The library is providing vectorized versions of most algorithms for Helium and o
 When using a vectorized version, provide a little bit of padding after the end of a buffer (3 words) because the vectorized code may read a little bit after the end of a buffer. You don't have to modify your buffers but just ensure that the end of buffer + padding is not outside of a memory region.
 
+## Related projects
+
+### Python wrapper
+
 A Python wrapper is also available with a Python API as close as possible to the C one. It can be used to start developing and testing an algorithm with NumPy and SciPy before writing the C version.
 It is available on [PyPI.org](https://pypi.org/project/cmsisdsp/). It can be installed with: `pip install cmsisdsp`.
 
-## Using the Library {#using}
+### Experimental C++ template extension
+
+This extension is a set of C++ headers. They just need to be included to start using the features.
+
+Those headers are not yet part of the pack and you need to get them from the [GitHub repository](https://github.com/ARM-software/CMSIS-DSP/tree/main/Include).
+
+More documentation about the @ref dsppp_main "DSP++" extension.
+
+## Using the CMSIS-DSP Library {#using}
 
 The library is released in source form. It is strongly advised to compile the library using `-Ofast` optimization to have the best performances.
 
@@ -56,6 +70,7 @@ The table below explains the content of **ARM::CMSIS-DSP** pack.
 
 📂 Include | Include files for using and building the lib
 📂 PrivateInclude | Private include files for building the lib
 📂 Source | Source files
+ 📂 dsppp | Experimental C++ template extension
 📄 ARM.CMSIS-DSP.pdsc | CMSIS-Pack description file
 📄 LICENSE | License Agreement (Apache 2.0)
diff --git a/Documentation/Doxygen/src/matrix.md b/Documentation/Doxygen/src/matrix.md
new file mode 100644
index 000000000..c3c983b0d
--- /dev/null
+++ b/Documentation/Doxygen/src/matrix.md
@@ -0,0 +1,168 @@
+# Matrix {#dsppp_matrix}
+
+Matrixes can be used similarly to vectors:
+
+```cpp
+Matrix<float32_t,ROWS,COLS> a;
+Matrix<float32_t,ROWS,COLS> b;
+```
+
+If the dimensions of the matrixes are not known at build time, you would instead write:
+
+```
+Matrix<float32_t> a(rows,cols);
+Matrix<float32_t> b(rows,cols);
+```
+
+Once you have matrixes, you need to initialize them. A matrix is also a vector, so you can initialize it by indexing into the vector:
+
+```cpp
+for(std::size_t i=0;i result = a * a + b;
+```
+
+The operators `+` and `*` are merged into the loop. `*` is the element-wise multiply. For the vector / matrix products you should use the operator `dot`.
+
+Note that fusion of operators will not work with `dot(Matrix, Matrix)`. It is only supported with vectors : `dot(Vector,Vector)` or `dot(Matrix,Vector)`.
+
+## VectorView
+
+We can create virtual vectors which are views of some slices of the matrix.
+
+### Row vector
+
+To set the second row to `0.0f`, you can do:
+
+```
+result.row(1) = 0.0f;
+```
+
+To set the odd elements of the 3rd row to `0.0f` we can do:
+
+```
+result.row<2>(2,1) = 0.0f;
+```
+
+The first argument `2` is the row number (starting from `0`).
+
+The second argument `1` is where the view starts in the row: element `1`.
+
+`<2>` is the stride, known at build time.
+
+The `row` API is:
+
+```cpp
+template
+VectorView row(const index_t i,const index_t start=0,const index_t stop=C)
+
+```
+
+`stop` is the index of the first element **after** the end of the view.
+
+`i` is the row index.
+
+### Column vector
+
+There is a similar API for columns.
+
+Let's set the odd elements of column 3 to `5.0f`:
+
+```
+result.col<2>(2,1) = 5.0f;
+```
+
+## MatrixView
+
+It is also possible to create a virtual matrix : a view onto a subset of the matrix.
+
+Let's add the bottom right corner of the matrix to itself:
+
+```cpp
+result.sub(4,8,4,8) = result.sub(4,8,4,8) + result.sub(4,8,4,8);
+```
+
+The API is:
+
+```cpp
+MatrixView sub(const index_t rs,
+               const index_t re,
+               const index_t cs,
+               const index_t ce)
+```
+
+You specify the row start and row end, then column start and column end.
+
+Note that the end is the first index **after** the end of your rows or columns.
+
+No stride is supported for matrix view in this version of the library.
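+
+As a short recap of those view APIs (a sketch, assuming `result` is an `8x8` matrix as suggested by the indexes used above):
+
+```cpp
+result.row(1) = 0.0f;           // whole 2nd row
+result.row<2>(2,1) = 0.0f;      // 3rd row, stride 2, starting at element 1: the odd elements
+result.col<2>(2,1) = 5.0f;      // 3rd column, stride 2, starting at element 1
+
+// Rows 4..7 and columns 4..7 (the end indexes 8 are exclusive).
+result.sub(4,8,4,8) = result.sub(4,8,4,8) + result.sub(4,8,4,8);
+```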
+ +## Matrix operations + +In addition to the vector operations `+`,`-` and `*`, matrixes are supporting more operations: + +* `dot` for vector / matrix products +* `diagonal` to create a diagonal matrix from a vector. +* `identity` to create an identity matrix +* `tranpose` to create the transposed matrix +* `outer` for the outer product of two vectors + +### dot + +```cpp +result = dot(a,b); +``` + +The compiler may use the move semantic to copy the temporary result of the `dot` function to `result`. + +In this case, no copy would occur and `result` after the assignment would be a vector allocated by `dot` so using the `TMP_ALLOC` . + +### diagonal + +```cpp +result = Matrix::diagonal(c); +``` + +### identity + +```cpp +result = Matrix::identity(); +``` + +### transpose + +```cpp +result = a.transpose(); +``` + +or + +```cpp +transposeTo(result,a); +``` + +### outer product + +```cpp +result = outer(c,c); +``` + diff --git a/Documentation/Doxygen/src/memory_allocator.md b/Documentation/Doxygen/src/memory_allocator.md new file mode 100644 index 000000000..a539a3109 --- /dev/null +++ b/Documentation/Doxygen/src/memory_allocator.md @@ -0,0 +1,87 @@ +# Memory allocation {#dsppp_memory_allocator} + +By default, `malloc` is used. + +```cpp +Vector +``` + +is allocating a vector of dimension `NB` (known at build time) and datatype `float32_t`. + +The definition of the `Vector` template is: + +```cpp +template typename Allocator = TMP_ALLOC> +struct Vector:Vector_Base

+```
+
+It means that by default the memory allocator is `TMP_ALLOC`.
+
+This `TMP_ALLOC` `#define` can be changed if you define it before including any header from the library.
+
+An allocator should implement a template like:
+
+```cpp
+template<int L>
+struct malloc_allocator {
+   /* Dynamic dimension allocations (L<0) */
+   static char* allocate ( vector_length_t sz) noexcept;
+
+   /* Dimension L known at build time (L > 0) */
+   static char* allocate ( ) noexcept;
+
+   static void destroy ( char* ptr ) noexcept;
+
+};
+```
+
+It has no state because in practice we observed that compilers were generating more efficient code without state in the memory allocator template.
+
+If you don't want to use a `malloc` based memory allocator, you can replace it with your own memory allocator and implement an API like the one just shown in `malloc_allocator`.
+
+For instance, often in DSP pipelines, the dimensions of the vectors and matrixes are fixed and known at build time.
+In that case, you could replace the memory allocator by one using memory pools.
+
+With memory pools, allocation is nearly cost free and there is no fragmentation.
+
+The test framework of the library is providing an example in `allocator.h` and `allocator.cpp`.
+
+There are two memory allocators:
+
+1. `stat_allocator` is a `malloc` based allocator that gathers statistics about the memory allocations and how many buffers of each dimension are required
+
+2. `pool_allocator` can use the data generated by `stat_allocator` to pre-allocate memory pools that will then be used for the memory allocations. The memory pools are also creating aligned buffers.
+
+It is no more difficult (and in fact less difficult) than allocating temporary buffers in CMSIS-DSP.
+
+You could define the `TMP_ALLOC` with:
+
+```cpp
+#if defined(POOL_ALLOCATOR)
+#define TMP_ALLOC pool_allocator
+#else
+#define TMP_ALLOC stat_allocator
+#endif
+```
+
+You use `stat_allocator` by default. When your code is working, you switch to `pool_allocator` to get better performance and determinism.
+
+Another possibility is to use different vector types:
+
+```cpp
+template<typename T,int L>
+using PVector = Vector<T,L,pool_allocator>;
+```
+
+Note that you cannot avoid using `TMP_ALLOC` because some functions in the library are creating temporary objects. For instance, if you want to make an identity matrix, you can use `mk_identity`, which will make a memory allocation using `TMP_ALLOC`.
+
+Also note that if you create a vector with:
+
+```cpp
+Vector<float32_t> v(NB);
+```
+
+then the dimension `NB` is a runtime parameter. The memory pool allocator given as example in this library is only working with dimensions known at build time. For runtime dimensions, it is still using a `malloc`.
+
diff --git a/Documentation/Doxygen/src/memory_static_dynamic.md b/Documentation/Doxygen/src/memory_static_dynamic.md
new file mode 100644
index 000000000..a1d911814
--- /dev/null
+++ b/Documentation/Doxygen/src/memory_static_dynamic.md
@@ -0,0 +1,35 @@
+# Static / dynamic {#dsppp_memory_static_dynamic}
+
+As we have seen in the previous sections, there are two kinds of vectors:
+
+* `Vector<T>` with a dimension known at runtime
+* `Vector<T,NB>` with a dimension known at build time
+
+The former vectors are called "dynamic" in this library. The latter are called "static".
+
+This naming "static" / "dynamic" refers to the dimension. With "dynamic" vectors the same code can, at runtime, create vectors of different lengths based on a runtime length.
+
+With "static" vectors, the length is fixed at build time and will never change at runtime.
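+
+For example (a sketch with `float32_t` elements, where `nb_samples` stands for any value only known at runtime):
+
+```cpp
+Vector<float32_t> dyn_vec(nb_samples);   // "dynamic": the length is a runtime value
+Vector<float32_t, 256> static_vec;       // "static": the length 256 is part of the type
+```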
+
+Note that the library also has "static" / "dynamic" matrixes. So, we are going to use "objects" to cover both cases.
+
+# Static objects
+
+The advantage of static objects is that the dimension is known at build time. The compiler can then generate an algorithm that is specialized for those dimensions and thus more efficient.
+
+With static objects it is also possible to use a different memory allocator with better performance and determinism.
+
+But, with static objects, objects of different dimensions are considered as different types. The compiler will generate different implementations, so it will have an impact on the code size.
+
+If you need lots of objects of different dimensions, or if the dimensions are not known at build time, then you need to use dynamic objects.
+
+# Dynamic objects
+
+With dynamic objects, the dimension is known at runtime. So objects of different dimensions have the same datatype and the compiler generates only one implementation for all those objects. It cannot generate specialized implementations based on the dimension. It is better for code size, but the implementations will be less efficient.
+
+Also, when the dimension is not known at build time, some instruction selection made by the C++ library at build time is no longer possible. It has an effect on performance since at runtime one must decide what's possible or not. It mostly impacts matrixes, where stride information is needed.
+
+With vector instructions one can use scatter / gather instructions and they require a stride. But there are constraints depending on the datatype, and when the stride is too big for those instructions, they cannot be used. This check has to be done at runtime for dynamic objects.
+
+Finally, with dynamic objects, memory allocation can be an issue. You can mitigate the problem by reusing temporaries in your algorithms instead of re-allocating them. But it makes the implementation more difficult. See the section about @ref dsppp_guidelines.
+
diff --git a/Documentation/Doxygen/src/template.md b/Documentation/Doxygen/src/template.md
new file mode 100644
index 000000000..16b4994ac
--- /dev/null
+++ b/Documentation/Doxygen/src/template.md
@@ -0,0 +1,60 @@
+# What you need to know about C++ templates {#dsppp_template}
+
+## What is a template useful for?
+
+In CMSIS-DSP, you have functions like:
+
+* `arm_add_f32`
+* `arm_add_f64`
+
+Without unrolling, the scalar implementation is the same but is duplicated (two different source files to maintain although they are nearly the same).
+
+One could try to reuse the same source for both functions using the C preprocessor. But we would still have two different functions with different names at the end (both generated from the same C + C preprocessor macros).
+
+With C++ templates, we can achieve the same result in a better way since the C++ compiler will check the templates and typecheck them. In addition to that, both functions can share the same name.
+
+With C++ templates, we could have a *generic* function `arm_add` taking as argument a pointer `T *pSrc` where `T` is a type variable!
+
+When the function is used with a `float32_t *`, the compiler would generate code for a function using `float32_t`.
+
+And if the function is used with a `float64_t *`, the compiler would generate code for a function using `float64_t`.
+
+The generic `arm_add` source code is a template used to generate different implementations. It is like a code generator.
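+
+For instance, such a generic function could look like the following sketch (an illustration, not the actual CMSIS-DSP source):
+
+```cpp
+#include <cstddef>
+
+// Hypothetical generic version of arm_add_f32 / arm_add_f64.
+// The compiler generates one specialization per element type T.
+template<typename T>
+void arm_add(const T *pSrcA, const T *pSrcB, T *pDst, std::size_t blockSize)
+{
+    for (std::size_t i = 0; i < blockSize; i++)
+    {
+        pDst[i] = pSrcA[i] + pSrcB[i];   // requires T to support operator+
+    }
+}
+```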
+ +And if the compiler is unable to generate an implementation because the type variable `T` is replaced by a type with no addition operator, then it would be detected by the compiler. + +## Templates for datatypes + +C++ templates also apply to structs and classes. + +For instance, we could have a template `Vector` and thus different types `Vector`, `Vector` ... + +There is another aspect of C++ templates that may be surprising : the types can contain numbers. + +For instance, one could have a type +`Vector` for a vector of `float` and of length `10`. The length being known at build time. + +The types `Vector` and `Vector` should be considered as different types because they have different lengths. The length is part of the type. + +What we said above for code generation applies. For a template algorithm using any kind of vector, the compiler would generate different code for different vector types. The code for a template algorithm using `Vector` would be different from the code for `Vector` because those two types are different. + + +## Implicit parameters + +A template can also have implicit parameters. + +For instance one could use `Vector` or `Vector`. + +In the first case, the length is an implicit parameter with a default value and it is equivalent to writing `Vector` where `DYNAMIC` could be a special value (negative for instance) used to tell the compiler that the length of the vector is not known at build time but only at runtime. + +Both variants may use totally different implementations. The `DYNAMIC` variant may contain a `length` field in the `struct` definition whereas other variants do not need this field since the length is known at build time. + +## How to use templates ? + +A template is just a C++ header. You only need to include this header to start using the template. There is nothing to build. + +## Example + +Now you can look at an @ref dsppp_vector_example "example with vector operations" showing how to use the library + + diff --git a/Documentation/Doxygen/src/vector.md b/Documentation/Doxygen/src/vector.md new file mode 100644 index 000000000..546338fee --- /dev/null +++ b/Documentation/Doxygen/src/vector.md @@ -0,0 +1,112 @@ +# Vector {#dsppp_vector} + +The use of vectors has been explained in @ref dsppp_vector_example "example with vector operations" and focusing on `float32_t`. + +The vector template is defined as: + +```cpp +template typename Allocator = TMP_ALLOC> +struct Vector:Vector_Base

+``` + +* `P` is the datatype of vector elements +* `L` is the static length of the vector (length known at build time). `L<0` when the length is dynamic and not known at build time. It is the default value. +* `Allocator` is the memory allocator. By default it is `TMP_ALLOC` that you can redefine since it is a macro +* `Vector_Base

` is providing the storage. A vector owns its storage buffer. + +## Q15 example + +Example with `Q15` is very similar: + +The vectors are defined: + +```cpp +Vector aQ15; +Vector bQ15; +Vector cQ15; +``` + +They are initialized: + +```cpp +for(int i = 0;i< NB;i++) +{ + aQ15[i] = bQ15[i] = cQ15[i] = Q15(i); +} +``` + +Here, the `Q15` value is initialized from the int value `i` and thus represents \f$ i/2^{15} \f$ + +Some computation is done + +```cpp +Vector dQ15 = aQ15 + bQ15 * cQ15; +``` + +The result is displayed: + +```cpp +std::cout << "Result = " << dQ15 ; +``` + +## VectorView + +A vector view is a virtual vector : a view of a vector. + +One can define a `VectorView` with: + +```cpp +auto subD = d.sub(2); +``` + +This is creating a virtual vector starting at index `2` (3rd element) of vector `d`. + +You can then operate with this virtual vector: + +```cpp +subD = subD + 2.0f; +``` + +If you display the vector `d`, you'll see that `2.0f` has been added to all elements starting from the 2rd one. + +`VectorView` do not own their memory. It is owned by the original vector. + +If you write: + +```cpp +x = y +``` + +and `x` and `y` are `VectorView`, no copy will occur. `x` will just reference the same data as `y`. If you want to copy you have to be explicit and write: + +```cpp +x = copy(y) +``` + +It is advised to always use the `copy` operator (even with normal vectors). + +Virtual vectors can have a stride: + +```cpp +d.sub<2>(1) = 0.0f; +``` + +This line sets the odd elements of the vector to `0.0f`. It is creating a vvirtual vector with stride `2` and starting at index `1` of first vector. + +Then, all elements of this virtual vector are set to `0.0f`. + +The `sub` API is: + +```cpp +template +VectorView sub(const index_t start=0,const index_t stop=L) +``` + +You can define: + +* The stride `S` : statically known and by default `1`. +* The start of the view (`0` by default) +* The end of the view (`L` by default : the length known at build time). Note that it is the first index **after** the end of the vector. + diff --git a/Documentation/Doxygen/src/vectorop.md b/Documentation/Doxygen/src/vectorop.md new file mode 100644 index 000000000..aed42944a --- /dev/null +++ b/Documentation/Doxygen/src/vectorop.md @@ -0,0 +1,100 @@ +# Vector operation example {#dsppp_vector_example} + +To compute: + +\f[ + +\overrightarrow{d} = \overrightarrow{a} + \overrightarrow{b} * \overrightarrow{c} + +\f] + +we need to: +1. Include the right header files +2. allocate the vectors +3. initialize the vectors +4. make the computation. + +# Include the headers + +The headers are not yet part of the CMSIS-DSP packs since they are experimental. You can get them from the [CMSIS-DSP github](https://github.com/ARM-software/CMSIS-DSP/CPP) + +```cpp +#include +#include + +using namespace arm_cmsis_dsp; +``` + +If fixed point datatypes are required, `#include ` should be used before `` + +Fixed point requires the use of CMSIS-DSP. + +# Creation of the vectors + +To create a vector `a` you would write: + +```cpp +constexpr int NB = 32; + +Vector a; +Vector b; +Vector c; +``` + +`Vector` is creating a vector of dimension `NB` (known at build time) and datatype `float32_t`. This creation is requiring some memory allocation and by default it is done with a `malloc`. + +It is possible to change the memory allocator for the vectors (and it is advised) to avoid using `malloc` and instead have deterministic allocation without fragmentation. + +See section @ref dsppp_memory_allocator "Memory allocation". 
+ +Vectors of different dimensions are considered as being different types. + +If you don't know the dimension at build time, you can use a different type of vector with: + +```cpp +Vector a(NB); +``` + +For the trade-off between vector with build time dimension or runtime dimension please see the section @ref dsppp_memory_static_dynamic . + +# Initialization of the vectors + +You can index the vectors as normal C arrays. + +```cpp +for(int i = 0;i< NB;i++) +{ + a[i] = b[i] = c[i] = i; +} +``` + +# Computation + +The computation can be written normally as : + +```cpp +Vector d = a + b * c; +``` + +Note that the computation can be parametrized with template arguments so the same computation could be used with any datatype or length. In that case you would have to define a template (and not just a normal function) and inside you would use something like: + +```cpp +Vector d = a + b * c; +``` + +where `T` is a type variable coming from the template. + +The operators `+`, `*` are computed in one pass with one loop : we have loop fusion and instead of having a loop per operator we have a loop for the whole computation. + +To understand fusion and how to extend it with new operators, see section @ref dsppp_fusion . + +For an overview of vector operators, see section @ref dsppp_vector . +For an overview of matrix operators, see section @ref dsppp_matrix . + +# Displaying the result + +The vectors can be displayed on `stdout` for debug purpose. + +```cpp +std::cout << "Result = " << d ; +``` diff --git a/dsppp/.gitignore b/dsppp/.gitignore new file mode 100644 index 000000000..0cd7f9a89 --- /dev/null +++ b/dsppp/.gitignore @@ -0,0 +1,13 @@ +build_* +allocation/* +out/ +tmp/ +__pycache__/ +**.DS_Store +*.cprj +cprj/*.cbuild*.yml +dump_* +run_*.bat +ac6_results/ +gcc_results/ +clang_results/ diff --git a/dsppp/Examples/dot_product.cpp b/dsppp/Examples/dot_product.cpp new file mode 100644 index 000000000..c1ee8146f --- /dev/null +++ b/dsppp/Examples/dot_product.cpp @@ -0,0 +1,52 @@ + +#include "RTE_Components.h" +#include CMSIS_device_header + +#if defined(MPS3) +#include "cmsis_driver_config.h" +#include "stdout_USART.h" +#endif + +#include + +#include +#include + +using namespace arm_cmsis_dsp; + + +int main(void) +{ +#if defined(MPS3) + stdout_init(); +#endif + + std::cout << "Dot product example\r\n"; + + constexpr int NB = 32; + + Vector a; + Vector b; + Vector c; + Vector d; + + float32_t scale = 0.5; + + for(int i = 0;i< NB;i++) + { + a[i] = b[i] = c[i] = d[i] = i; + } + + float32_t r; + + r = dot(scale*(a+b),c*d); + + std::cout << "Result = " << r << "\r\n"; + + +#if defined(MPS3) + while(1); +#endif +} + + diff --git a/dsppp/Examples/matrix_op.cpp b/dsppp/Examples/matrix_op.cpp new file mode 100644 index 000000000..f9fa12318 --- /dev/null +++ b/dsppp/Examples/matrix_op.cpp @@ -0,0 +1,109 @@ + +#include "RTE_Components.h" +#include CMSIS_device_header + +#if defined(MPS3) +#include "cmsis_driver_config.h" +#include "stdout_USART.h" +#endif + +#include + +#include +#include +#include + +using namespace arm_cmsis_dsp; + + +int main(void) +{ +#if defined(MPS3) + stdout_init(); +#endif + + std::cout << "Matrix operation examples\r\n"; + + constexpr int ROWS = 8; + constexpr int COLS = 8; + + Matrix a; + Matrix b; + + for(std::size_t i=0;i result = a * a + b; + + std::cout << "Result = " << std::endl << result ; + + // Vector views + + // Rows + result.row(1) = 0.0f; + std::cout << "Result = " << std::endl << result ; + + // Row with stride + // setting odd elements of 3rd row to 0 + 
result.row<2>(2,1) = 0.0f; + std::cout << "Result = " << std::endl << result ; + + // Column with stride + result.col<2>(2,1) = 5.0f; + std::cout << "Result = " << std::endl << result ; + + // Matrix view + result.sub(4,8,4,8) = result.sub(4,8,4,8) + result.sub(4,8,4,8); + std::cout << "Result = " << std::endl << result ; + + // operators + // dot + result = dot(a,b); + std::cout << "Result = " << std::endl << result ; + + // diagonal + Vector c; + + for(int i = 0;i< ROWS;i++) + { + c[i] = i; + } + result = Matrix::diagonal(c); + + std::cout << "Result = " << std::endl << result ; + + // identity matrix + result = Matrix::identity(); + + std::cout << "Result = " << std::endl << result ; + + // transpose matrix + result = a.transpose(); + + std::cout << "Result = " << std::endl << result ; + + transposeTo(result,a); + + std::cout << "Result = " << std::endl << result ; + + // outer product + result = outer(c,c); + std::cout << "Result = " << std::endl << result ; + + +#if defined(MPS3) + while(1); +#endif +} + + diff --git a/dsppp/Examples/vector_op.cpp b/dsppp/Examples/vector_op.cpp new file mode 100644 index 000000000..6964fc3b0 --- /dev/null +++ b/dsppp/Examples/vector_op.cpp @@ -0,0 +1,83 @@ + +#include "RTE_Components.h" +#include CMSIS_device_header + +#if defined(MPS3) +#include "cmsis_driver_config.h" +#include "stdout_USART.h" +#endif + +#include + +#include +#include +#include + +using namespace arm_cmsis_dsp; + + +int main(void) +{ +#if defined(MPS3) + stdout_init(); +#endif + + std::cout << "Vector operation examples\r\n"; + + constexpr int NB = 32; + + // float32 example + + Vector a; + Vector b; + Vector c; + + for(int i = 0;i< NB;i++) + { + a[i] = b[i] = c[i] = i; + } + + + Vector d = a + b * c; + + + std::cout << "Result = " << d ; + + // Vector view example 1 + auto subD = d.sub(2); + subD = subD + 2.0f; + + // d vector has been modified starting from the 3rd element + // (index 2) + std::cout << "Result = " << d ; + + // Now we set all odd elements to 0. 
+ d.sub<2>(1) = 0.0f; + std::cout << "Result = " << d ; + + + // Q15 example + Vector aQ15; + Vector bQ15; + Vector cQ15; + + for(int i = 0;i< NB;i++) + { + aQ15[i] = bQ15[i] = cQ15[i] = Q15(i); + } + + + Vector dQ15 = aQ15 + bQ15 * cQ15; + + + std::cout << "Result = " << dQ15 ; + + + + +#if defined(MPS3) + while(1); +#endif +} + + diff --git a/dsppp/Include/dsppp/DSP/basic.hpp b/dsppp/Include/dsppp/DSP/basic.hpp new file mode 100644 index 000000000..9032412e4 --- /dev/null +++ b/dsppp/Include/dsppp/DSP/basic.hpp @@ -0,0 +1,256 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_DSP +#undef ARM_MATH_MVEI +#undef ARM_MATH_MVEF +#undef ARM_MATH_NEON +#endif + +/** \addtogroup ARCHALG + * \addtogroup DSPALG DSP Extension specific algorithm + * \ingroup ARCHALG + * @{ + */ + +#if defined(ARM_MATH_DSP) +#if !defined(ARM_MATH_MVEI) && !defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON) + +#define DSP_UNROLL 1 + +template() && + IsVector::value && + SameElementType::value,bool>::type = true> +inline void _Fill(DST &v, + const T val, + vector_length_t l, + const DSP* = nullptr) +{ + constexpr int nb_lanes = vector_traits::nb_lanes; + index_t i; + + for(i=0 ; i <= l-(nb_lanes<() && + must_use_matrix_idx() && + SameElementType::value,bool>::type = true> +inline void _Fill2D(DST &v, + const T val, + const vector_length_t rows, + const vector_length_t cols, + const DSP* = nullptr) +{ + constexpr int nb_lanes = vector_traits::nb_lanes; + index_t row=0; + + for(; row <= rows-(1<() && + vector_idx_pair(),bool>::type = true> +inline void eval(DA &v, + const DB& other, + const vector_length_t l, + const DSP* = nullptr) +{ + using T = typename traits::Scalar; + constexpr int nb_lanes = vector_traits::nb_lanes; + constexpr unsigned int U = DSP_UNROLL; + index_t i; + + for(i=0 ; i <= l-(nb_lanes<() && + must_use_matrix_idx_pair(),bool>::type = true> +inline void eval2D(DA &v, + const DB& other, + const vector_length_t rows, + const vector_length_t cols, + const DSP* = nullptr) +{ + using T = typename traits::Scalar; + constexpr int nb_lanes = vector_traits::nb_lanes; + index_t row=0; + + for(; row <= rows-(1<() && + vector_idx_pair(),bool>::type = true> +inline DotResult _dot(const DA& a, + const DB& b, + const vector_length_t l, + const DSP* = nullptr) +{ + using Acc = DotResult; + using T = typename traits::Scalar; + using Temp = typename vector_traits::temp_accumulator; + constexpr int nb_lanes = vector_traits::nb_lanes; + constexpr unsigned int U = DSP_UNROLL; + index_t i; + + Acc acc = Acc{}; + Temp vacc = vector_traits::temp_acc_zero(); + + for(i=0 ; i <= l-(nb_lanes<() && + vector_idx_pair(),bool>::type = true> +inline void _swap(DA&& a, + DB&& b, + const vector_length_t l, + const DSP* = nullptr) +{ + using Scalar = typename ElementType::type; + using Vector = typename vector_traits::vector; + + constexpr int nb_lanes = vector_traits::type>::nb_lanes; + index_t i=0; + Vector tmpa,tmpb; + + for(i=0 ; i <= l-nb_lanes; i += nb_lanes) + { + tmpa = a.vector_op(i); + tmpb = b.vector_op(i); + b.vector_store(i,tmpa); + a.vector_store(i,tmpb); + } + + for(;i::Scalar,Q15>::value && + number_traits::Scalar>::is_fixed,bool>::type = true> +__STATIC_INLINE void _arm_mat_trans( + const MA &src, + MB &dst, + const DSP* = nullptr) +{ + using T = typename traits::Scalar; + using VEC = typename vector_traits::vector; + constexpr int nb_lanes = vector_traits::nb_lanes; + + T *pIn = src.ptr(); /* input data matrix pointer */ + T *pOut = dst.ptr(); /* output data matrix pointer */ + uint16_t 
nRows = src.rows(); /* number of rows */ + uint16_t nCols = src.columns(); /* number of columns */ + uint32_t col, row = nRows, i = 0U; /* Loop counters */ + + VEC in; /* variable to hold temporary output */ + + /* Matrix transpose by exchanging the rows with columns */ + /* row loop */ + do + { + /* Pointer pOut is set to starting address of column being processed */ + pOut = dst.ptr() + i; + + + /* Loop unrolling: Compute 4 outputs at a time */ + col = nCols / (2*nb_lanes); + + while (col > 0U) /* column loop */ + { + /* Read two elements from row */ + in = inner::vload1<1>(pIn); + pIn += nb_lanes; + + /* Unpack and store one element in destination */ + *pOut = Q15(in.v); + /* Update pointer pOut to point to next row of transposed matrix */ + pOut += dst.stride(); + + /* Unpack and store second element in destination */ + *pOut = Q15((in.v & (q31_t) 0xffff0000) >> 16); + /* Update pointer pOut to point to next row of transposed matrix */ + pOut += dst.stride(); + + /* Read two elements from row */ + in = inner::vload1<1>(pIn); + pIn += nb_lanes; + + /* Unpack and store one element in destination */ + *pOut = Q15(in.v); + /* Update pointer pOut to point to next row of transposed matrix */ + pOut += dst.stride(); + + /* Unpack and store second element in destination */ + *pOut = Q15((in & (q31_t) 0xffff0000) >> 16); + /* Update pointer pOut to point to next row of transposed matrix */ + pOut += dst.stride(); + + /* Decrement column loop counter */ + col--; + } + + /* Loop unrolling: Compute remaining outputs */ + col = nCols & (2*nb_lanes-1); + while (col > 0U) + { + /* Read and store input element in destination */ + *pOut = *pIn++; + + /* Update pointer pOut to point to next row of transposed matrix */ + pOut += dst.stride(); + + /* Decrement column loop counter */ + col--; + } + + pIn += (src.stride()-nCols); + + i ++; + + /* Decrement row loop counter */ + row--; + + } while (row > 0U); /* row loop end */ + + + +} + + +template::Scalar,Q31>::value && + number_traits::Scalar>::is_fixed,bool>::type = true> +inline void _dot_m_v(RES &res, + const M&m,const V&v, + const DSP* = nullptr) +{ + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + constexpr int nb_lanes = vector_traits::nb_lanes; + + uint32_t numRows = m.rows(); + uint32_t numCols = m.columns(); + const T *pSrcA = m.ptr(); + const T *pInA1; /* input data matrix pointer A of Q15 type */ + const T *pInA2; /* input data matrix pointer A of Q15 type */ + const T *pInA3; /* input data matrix pointer A of Q15 type */ + const T *pInA4; /* input data matrix pointer A of Q15 type */ + T *px; /* Temporary output data matrix pointer */ + uint16_t i, row; /* loop counters */ + int16_t colCnt; + VEC matData, matData2, vecData, vecData2; + T tmpData; + + + /* Process 4 rows at a time */ + row = numRows >> 2; + i = 0u; + px = res.ptr(); + + /* The following loop performs the dot-product of each row in pSrcA with the vector */ + /* row loop */ + while (row > 0) { + /* Initialize accumulators */ + ACC sum1 = ACC{}; + ACC sum2 = ACC{}; + ACC sum3 = ACC{}; + ACC sum4 = ACC{}; + + /* For every row wise process, the pInVec pointer is set + ** to the starting address of the vector */ + + /* Loop unrolling: process 2 columns per iteration */ + + /* Initialize pointers to the starting address of the column being processed */ + pInA1 = pSrcA + i; + pInA2 = pInA1 + m.stride(); + pInA3 = pInA2 + m.stride(); + pInA4 = pInA3 + m.stride(); + + // Main loop: matrix-vector 
multiplication + for(colCnt = 0 ; colCnt <= (int16_t)numCols - nb_lanes; colCnt += nb_lanes) + { + // Read 2 values from vector + vecData = v.vector_op(colCnt); + + // Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate + matData = inner::vload1<1> (pInA1); + pInA1 += nb_lanes; + sum1 = inner::vmacc(sum1, matData, vecData); + + matData = inner::vload1<1> (pInA2); + pInA2 += nb_lanes; + sum2 = inner::vmacc(sum2, matData, vecData); + + matData = inner::vload1<1> (pInA3); + pInA3 += nb_lanes; + sum3 = inner::vmacc(sum3, matData, vecData); + + matData = inner::vload1<1> (pInA4); + pInA4 += nb_lanes; + sum4 = inner::vmacc(sum4, matData, vecData); + + // Decrement the loop counter + } + + /* process any remaining columns */ + + for(; colCnt < (int16_t)numCols; colCnt ++) + { + tmpData = v[colCnt]; + sum1 = inner::mac(sum1,*pInA1++ , tmpData); + sum2 = inner::mac(sum2,*pInA2++ , tmpData); + sum3 = inner::mac(sum3,*pInA3++ , tmpData); + sum4 = inner::mac(sum4,*pInA4++ , tmpData); + } + + /* Saturate and store the result in the destination buffer */ + *px++ = inner::from_accumulator(sum1); + *px++ = inner::from_accumulator(sum2); + *px++ = inner::from_accumulator(sum3); + *px++ = inner::from_accumulator(sum4); + + i = i + m.stride() * 4; + + /* Decrement the row loop counter */ + row--; + } + + /* process any remaining rows */ + row = numRows & 3u; + while (row > 0) { + + ACC sum = ACC{}; + pInA1 = pSrcA + i; + + // loop unrolling - process 4 elements at a time + + for(colCnt = 0 ; colCnt <= (int16_t)numCols - 2*nb_lanes; colCnt += 2*nb_lanes) + { + vecData = v.vector_op(colCnt); + vecData2 = v.vector_op(colCnt+nb_lanes); + + matData = inner::vload1<1>(pInA1); + pInA1 += nb_lanes; + matData2 = inner::vload1<1>(pInA1); + pInA1 += nb_lanes; + sum = inner::vmacc(sum, matData, vecData); + sum = inner::vmacc(sum, matData2, vecData2); + } + + // process remainder of row + for(; colCnt < (int16_t)numCols; colCnt ++) + { + + sum = inner::mac(sum, *pInA1++ , v[colCnt]); + } + *px++ = inner::from_accumulator(sum); + i = i + m.stride(); + row--; + } +} + +template::Scalar,Q31>::value && + number_traits::Scalar>::is_fixed,bool>::type = true> +__STATIC_INLINE void _dot_m_m(const MA&pSrcA,const MB&pSrcB, + RES &&pDst, + const TMP &BT, + const DSP* = nullptr) +{ + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + constexpr int nb_lanes = vector_traits::nb_lanes; + + ACC sum; /* Accumulator */ + + + T *pSrcBT = BT.ptr(); /* Input data matrix pointer for transpose */ + T *pInA = pSrcA.ptr(); /* Input data matrix pointer A of Q15 type */ + T *pInB = pSrcB.ptr(); /* Input data matrix pointer B of Q15 type */ + T *px; /* Temporary output data matrix pointer */ + uint16_t numRowsA = pSrcA.rows(); /* Number of rows of input matrix A */ + uint16_t numColsB = pSrcB.columns(); /* Number of columns of input matrix B */ + uint16_t numColsA = pSrcA.columns(); /* Number of columns of input matrix A */ + uint16_t numRowsB = pSrcB.rows(); /* Number of rows of input matrix B */ + uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */ + + VEC inA1, inB1, inA2, inB2; + + + /* Reset variables for usage in following multiplication process */ + row = numRowsA; + i = 0U; + px = pDst.ptr(); + + /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ + /* row loop */ + do + { + /* For every row wise process, column loop counter is to be initiated */ + col = 
numColsB; + + /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */ + pInB = pSrcBT; + + /* column loop */ + do + { + /* Set variable sum, that acts as accumulator, to zero */ + sum = ACC{}; + + /* Initiate pointer pInA to point to starting address of column being processed */ + pInA = pSrcA.ptr() + i; + + /* Apply loop unrolling and compute 2 MACs simultaneously. */ + colCnt = numColsA / (2*nb_lanes); + + /* matrix multiplication */ + while (colCnt > 0U) + { + /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */ + + /* read real and imag values from pSrcA and pSrcB buffer */ + inA1 = inner::vload1<1> (pInA); + pInA += nb_lanes; + inB1 = inner::vload1<1> (pInB); + pInB += nb_lanes; + + inA2 = inner::vload1<1> (pInA); + pInA += nb_lanes; + inB2 = inner::vload1<1> (pInB); + pInB += nb_lanes; + + /* Multiply and Accumulates */ + sum = inner::vmacc(sum, inA1, inB1); + sum = inner::vmacc(sum, inA2, inB2); + + /* Decrement loop counter */ + colCnt--; + } + + /* process remaining column samples */ + colCnt = numColsA & (2*nb_lanes-1); + + while (colCnt > 0U) + { + /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */ + sum = inner::mac(sum ,*pInA++ , *pInB++); + + /* Decrement loop counter */ + colCnt--; + } + + /* Saturate and store result in destination buffer */ + *px = inner::from_accumulator(sum); + px++; + + /* Decrement column loop counter */ + col--; + + } while (col > 0U); + + i = i + pSrcA.stride(); + + /* Decrement row loop counter */ + row--; + + } while (row > 0U); + +} +#endif +#endif + +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/DSP/memory.hpp b/dsppp/Include/dsppp/DSP/memory.hpp new file mode 100644 index 000000000..6aa190579 --- /dev/null +++ b/dsppp/Include/dsppp/DSP/memory.hpp @@ -0,0 +1,98 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_DSP +#undef ARM_MATH_MVEI +#undef ARM_MATH_MVEF +#undef ARM_MATH_NEON +#endif + + +namespace arm_cmsis_dsp { + + +/** \addtogroup DSPALG + * @{ + */ + +#define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) << 0) & (int32_t)0x000000FF) | \ + (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \ + (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \ + (((int32_t)(v3) << 24) & (int32_t)0xFF000000) ) + + +__STATIC_FORCEINLINE int32_t read_q15x2 ( + Q15 const * pQ15) +{ + int32_t val; + const int16_t *p=reinterpret_cast(pQ15); + +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (&val, p, 4); +#else + val = (p[1] << 16) | (p[0] & 0x0FFFF) ; +#endif + + return (val); +}; + + + +__STATIC_FORCEINLINE void write_q15x2 ( + Q15 * pQ15, + int32_t value) +{ + int32_t val = value; + int16_t *p=reinterpret_cast(pQ15); + +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (p, &val, 4); +#else + p[0] = (int16_t)(val & 0x0FFFF); + p[1] = (int16_t)(val >> 16); +#endif +}; + + +__STATIC_FORCEINLINE int32_t read_q7x4 ( + Q7 const * pQ7) +{ + int32_t val; + const int8_t *p=reinterpret_cast(pQ7); + +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (&val, p, 4); +#else + val =((p[3] & 0x0FF) << 24) | ((p[2] & 0x0FF) << 16) | ((p[1] & 0x0FF) << 8) | (p[0] & 0x0FF); +#endif + return (val); +}; + + + + + + +__STATIC_FORCEINLINE void write_q7x4 ( + Q7 *& pQ7, + int32_t value) +{ + int8_t *p=reinterpret_cast(pQ7); + int32_t val = value; +#ifdef __ARM_FEATURE_UNALIGNED + memcpy (p, &val, 4); +#else + p[0] = (q7_t)(val & 0x0FF); + p[1] = (q7_t)((val >> 8) & 0x0FF); + p[2] = (q7_t)((val >> 16) & 0x0FF); + p[3] = (q7_t)((val >> 24) & 0x0FF); + +#endif +}; + +/*! 
@} */ + +} + diff --git a/dsppp/Include/dsppp/DSP/num_features.hpp b/dsppp/Include/dsppp/DSP/num_features.hpp new file mode 100644 index 000000000..e13f1a922 --- /dev/null +++ b/dsppp/Include/dsppp/DSP/num_features.hpp @@ -0,0 +1,14 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/* + +vreduce is going from vector accumulator to scalar accumulator +from_accumulator is going from scalar accumulator to scalar datatype + + +*/ + +#include "q7.hpp" +#include "q15.hpp" diff --git a/dsppp/Include/dsppp/DSP/q15.hpp b/dsppp/Include/dsppp/DSP/q15.hpp new file mode 100644 index 000000000..f10d2c140 --- /dev/null +++ b/dsppp/Include/dsppp/DSP/q15.hpp @@ -0,0 +1,238 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_DSP +#undef ARM_MATH_MVEI +#undef ARM_MATH_MVEF +#undef ARM_MATH_NEON +#endif + +/** \addtogroup DSPNumber DSP extension specific number definitions + * \ingroup NUMBER + * @{ + * \addtogroup DSPQ15Number Q15 + * \ingroup DSPNumber + * @{ + */ + +#if defined(ARM_MATH_DSP) +#if !defined(ARM_MATH_MVEI) && !defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON) + + +struct Q15DSPVector { + Q15DSPVector():v(0){}; + explicit Q15DSPVector(int32_t val):v(val){}; + operator int32_t(){return v;}; + +int32_t v; +}; + +template<> +struct vector_traits::type> +{ + typedef Q15 type; + typedef type::value_type storage_type; + typedef Q15DSPVector vector; + typedef Q<33,30> temp_accumulator; + + /* + + The evaluators are not using any predication and instead + use additional code after the loop to manage the tail. + + So, no inner function with predicate_t is required. + + Fusion operators still have call to inner operator with + predicate but they are not called in this context. + + */ + typedef uint32_t predicate_t; + + + static constexpr bool has_vector = true; + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + static constexpr bool has_predicate = false; + + static constexpr int nb_lanes = 2; + + static Q<33,30> temp_acc_zero() + { + return(Q<33,30>()); + } + + static constexpr int16_t zero_lane() {return 0;}; + + static constexpr int16_t lane_value(const Q15 x) {return x.v;}; + + +}; + + + +namespace inner { + + /* Needed to build but not used */ + template<> + struct vctpq{ + static uint32_t mk(uint32_t v) + { + return(v); + }; + }; + + __STATIC_FORCEINLINE Q15DSPVector vconst(Q15 val) + { + return(Q15DSPVector(__PKHBT(val.v, val.v, 16))); + } + + + __STATIC_FORCEINLINE Q15DSPVector vneg(const Q15DSPVector a) + { + return(Q15DSPVector(__QSUB16(0, a.v))); + }; + + __STATIC_FORCEINLINE Q15DSPVector vadd(const Q15DSPVector a, + const Q15DSPVector b) + { + return(Q15DSPVector(__QADD16(a.v,b.v))); + }; + + __STATIC_FORCEINLINE Q15DSPVector vadd(const Q15DSPVector a, + const Q15 b) + { + return(Q15DSPVector(__QADD16(a.v,vconst(b).v))); + }; + + __STATIC_FORCEINLINE Q15DSPVector vadd(const Q15 a, + const Q15DSPVector b) + { + return(Q15DSPVector(__QADD16(vconst(a).v,b.v))); + }; + + __STATIC_FORCEINLINE Q15DSPVector vsub(const Q15DSPVector a, + const Q15DSPVector b) + { + return(Q15DSPVector(__QSUB16(a.v,b.v))); + }; + + __STATIC_FORCEINLINE Q15DSPVector vsub(const Q15DSPVector a, + const Q15 b) + { + return(Q15DSPVector(__QSUB16(a.v,vconst(b).v))); + }; + + __STATIC_FORCEINLINE Q15DSPVector vsub(const Q15 a, + const Q15DSPVector b) + { + return(Q15DSPVector(__QSUB16(vconst(a).v,b.v))); + }; + + __STATIC_FORCEINLINE Q15DSPVector vmul(const Q15DSPVector a, + const Q15DSPVector b) + { + q31_t mul1,mul2; + q15_t out1,out2; + + mul1 = (q31_t) 
((q15_t) (a.v ) * (q15_t) (b.v )); + mul2 = (q31_t) ((q15_t) (a.v >> 16) * (q15_t) (b.v >> 16)); + + out1 = (q15_t) __SSAT(mul1 >> 15, 16); + out2 = (q15_t) __SSAT(mul2 >> 15, 16); + return(Q15DSPVector(__PKHBT(out1, out2, 16))); + }; + + + __STATIC_FORCEINLINE Q15DSPVector vmul(const Q15DSPVector a, + const Q15 b) + { + return(vmul(a,vconst(b))); + }; + + __STATIC_FORCEINLINE Q15DSPVector vmul(const Q15 a, + const Q15DSPVector b) + { + return(vmul(vconst(a),b)); + }; + + + template::type = true> + inline Q15DSPVector vload1(const Q15 *p) + { + return(Q15DSPVector(read_q15x2(p))); + }; + + + template1),bool>::type = true> + inline Q15DSPVector vload1(const Q15 *p) + { + Q15 a = p[0]; + Q15 b = p[S]; + + return(Q15DSPVector(__PKHBT(a.v, b.v, 16))); + }; + + + // Dynamic stride + inline Q15DSPVector vload1(const Q15 *p,index_t stride) + { + Q15 a = p[0]; + Q15 b = *(p+stride); + + return(Q15DSPVector(__PKHBT(a.v, b.v, 16))); + } + + template::type = true> + inline void vstore1(Q15 *p,const Q15DSPVector val) + { + write_q15x2 (p, val.v); + }; + + template1),bool>::type = true> + inline void vstore1(Q15 *p,const Q15DSPVector val) + { + p[0] = Q15(val.v & 0x0FFFF); + p[S] = Q15(val.v >> 16); + }; + + // dynamic stride + inline void vstore1(Q15 *p,const index_t stride, + const Q15DSPVector val) + { + p[0] = Q15(val.v & 0x0FFFF); + *(p+stride) = Q15(val.v >> 16); + } + + __STATIC_FORCEINLINE Q<33,30> vmacc(const Q<33,30> sum, + const Q15DSPVector vala, + const Q15DSPVector valb) + { + return(Q<33,30>(__SMLALD(vala.v,valb.v,sum.v))); + }; + + __STATIC_FORCEINLINE Q<33,30> vmacc(const Q15DSPVector vala, + const Q15DSPVector valb) + { + return(Q<33,30>(__SMLALD(vala.v,valb.v,0))); + }; + + __STATIC_FORCEINLINE Q<33,30> vreduce(const Q<33,30> sum) + { + return(sum); + }; + + +}; + + +#endif +#endif + +/*! @} */ +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/DSP/q7.hpp b/dsppp/Include/dsppp/DSP/q7.hpp new file mode 100644 index 000000000..7c218294e --- /dev/null +++ b/dsppp/Include/dsppp/DSP/q7.hpp @@ -0,0 +1,264 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_DSP +#undef ARM_MATH_MVEI +#undef ARM_MATH_MVEF +#undef ARM_MATH_NEON +#endif + +/** \addtogroup DSPNumber DSP extension specific number definitions + * \ingroup NUMBER + * @{ + * \addtogroup DSPQ7Number Q7 + * \ingroup DSPNumber + * @{ + */ + +#if defined(ARM_MATH_DSP) +#if !defined(ARM_MATH_MVEI) && !defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON) + + +struct Q7DSPVector { + Q7DSPVector():v(0){}; + explicit Q7DSPVector(int32_t val):v(val){}; + operator int32_t(){return v;}; +int32_t v; +}; + +template<> +struct vector_traits::type> +{ + typedef Q7 type; + typedef type::value_type storage_type; + typedef Q7DSPVector vector; + typedef Q<17,14> temp_accumulator; + + /* + + The evaluators are not using any predication and instead + use additional code after the loop to manage the tail. + + So, no inner function with predicate_t is required. + + Fusion operators still have call to inner operator with + predicate but they are not called in this context. 
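+
+   As a rough illustration (not code from this header), a 4-lane Q7 evaluator
+   follows the pattern:
+
+       index_t i;
+       for (i = 0; i <= l - 4; i += 4)   // packed body, 4 Q7 values per int32_t
+          ... vector_op / vector_store ...
+       for (; i < l; i++)                // scalar tail, no predicate needed
+          ... scalar operation ...
+
+   so predicate_t is only declared here to satisfy the generic interface.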
+ + */ + typedef uint32_t predicate_t; + + + static constexpr bool has_vector = true; + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + static constexpr bool has_predicate = false; + + static constexpr int nb_lanes = 4; + + static Q<17,14> temp_acc_zero() + { + return(Q<17,14>()); + } + + static constexpr int8_t zero_lane() {return 0;}; + + static constexpr int8_t lane_value(const Q7 x) {return x.v;}; + + +}; + + + +namespace inner { + + /* Needed to build but not used */ + template<> + struct vctpq{ + static uint32_t mk(uint32_t v) + { + return(v); + }; + }; + + __STATIC_FORCEINLINE Q7DSPVector vconst(Q7 val) + { + return(Q7DSPVector(__PACKq7(val.v, val.v, val.v, val.v))); + } + + + __STATIC_FORCEINLINE Q7DSPVector vneg(const Q7DSPVector a) + { + return(Q7DSPVector(__QSUB8(0, a.v))); + }; + + __STATIC_FORCEINLINE Q7DSPVector vadd(const Q7DSPVector a, + const Q7DSPVector b) + { + return(Q7DSPVector(__QADD8(a.v,b.v))); + }; + + __STATIC_FORCEINLINE Q7DSPVector vadd(const Q7DSPVector a, + const Q7 b) + { + return(Q7DSPVector(__QADD8(a.v,vconst(b).v))); + }; + + __STATIC_FORCEINLINE Q7DSPVector vadd(const Q7 a, + const Q7DSPVector b) + { + return(Q7DSPVector(__QADD8(vconst(a).v,b.v))); + }; + + __STATIC_FORCEINLINE Q7DSPVector vsub(const Q7DSPVector a, + const Q7DSPVector b) + { + return(Q7DSPVector(__QSUB8(a.v,b.v))); + }; + + __STATIC_FORCEINLINE Q7DSPVector vsub(const Q7DSPVector a, + const Q7 b) + { + return(Q7DSPVector(__QSUB8(a.v,vconst(b).v))); + }; + + __STATIC_FORCEINLINE Q7DSPVector vsub(const Q7 a, + const Q7DSPVector b) + { + return(Q7DSPVector(__QSUB8(vconst(a).v,b.v))); + }; + + __STATIC_FORCEINLINE Q7DSPVector vmul(const Q7DSPVector a, + const Q7DSPVector b) + { + q7_t out1, out2, out3, out4; + q15_t mul1,mul2,mul3,mul4; + + mul1 = (q15_t) ((q7_t) (a.v ) * (q7_t) (b.v )); + mul2 = (q15_t) ((q7_t) (a.v >> 8) * (q7_t) (b.v >> 8)); + mul3 = (q15_t) ((q7_t) (a.v >> 16) * (q7_t) (b.v >> 16)); + mul4 = (q15_t) ((q7_t) (a.v >> 24) * (q7_t) (b.v >> 24)); + + out1 = (q7_t) __SSAT(mul1 >> 7, 8); + out2 = (q7_t) __SSAT(mul2 >> 7, 8); + out3 = (q7_t) __SSAT(mul3 >> 7, 8); + out4 = (q7_t) __SSAT(mul4 >> 7, 8); + return(Q7DSPVector(__PACKq7(out1,out2,out3,out4))); + }; + + + __STATIC_FORCEINLINE Q7DSPVector vmul(const Q7DSPVector a, + const Q7 b) + { + return(vmul(a,vconst(b))); + }; + + __STATIC_FORCEINLINE Q7DSPVector vmul(const Q7 a, + const Q7DSPVector b) + { + return(vmul(vconst(a),b)); + }; + + + template::type = true> + inline Q7DSPVector vload1(const Q7 *p) + { + return(Q7DSPVector(read_q7x4(p))); + }; + + + template1),bool>::type = true> + inline Q7DSPVector vload1(const Q7 *p) + { + Q7 a = p[0]; + Q7 b = p[S]; + Q7 c = p[2*S]; + Q7 d = p[3*S]; + + return(Q7DSPVector(__PACKq7(a.v, b.v, c.v,d.v))); + }; + + + // Dynamic stride + inline Q7DSPVector vload1(const Q7 *p,index_t stride) + { + Q7 a = p[0]; + Q7 b = *(p+stride); + Q7 c = *(p+2*stride); + Q7 d = *(p+3*stride); + + return(Q7DSPVector(__PACKq7(a.v, b.v, c.v,d.v))); + } + + template::type = true> + inline void vstore1(Q7 *p,const Q7DSPVector val) + { + write_q7x4 (p, val.v); + }; + + template1),bool>::type = true> + inline void vstore1(Q7 *p,const Q7DSPVector val) + { + p[0] = Q7(val.v & 0x0FF); + p[S] = Q7(val.v >> 8); + p[2*S] = Q7(val.v >> 16); + p[3*S] = Q7(val.v >> 24); + }; + + // dynamic stride + inline void vstore1(Q7 *p,const index_t stride, + const Q7DSPVector val) + { + p[0] = Q7(val.v & 0x0FF); + *(p+stride) = Q7(val.v >> 8); + *(p+2*stride) = Q7(val.v >> 16); + *(p+3*stride) = 
Q7(val.v >> 24); + } + + __STATIC_FORCEINLINE Q<17,14> vmacc(const Q<17,14> sum, + const Q7DSPVector vala, + const Q7DSPVector valb) + { + q31_t inA1, inA2, inB1, inB2; + q31_t s; + inA1 = __SXTB16(__ROR(vala.v, 8)); + /* extract reminaing two samples */ + inA2 = __SXTB16(vala.v); + /* extract two q7_t samples to q15_t samples */ + inB1 = __SXTB16(__ROR(valb.v, 8)); + /* extract reminaing two samples */ + inB2 = __SXTB16(valb.v); + + /* multiply and accumulate two samples at a time */ + s = __SMLAD(inA1, inB1, sum.v); + s = __SMLAD(inA2, inB2, s); + + return(Q<17,14>(s)); + }; + + __STATIC_FORCEINLINE Q<17,14> vmacc(const Q7DSPVector vala, + const Q7DSPVector valb) + { + return(vmacc(Q<17,14>(0),vala,valb)); + }; + + __STATIC_FORCEINLINE Q<17,14> vreduce(const Q<17,14> sum) + { + return(sum); + }; + + +}; + + +#endif +#endif + + +/*! @} */ +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/Helium/basic.hpp b/dsppp/Include/dsppp/Helium/basic.hpp new file mode 100644 index 000000000..ac0529fce --- /dev/null +++ b/dsppp/Include/dsppp/Helium/basic.hpp @@ -0,0 +1,223 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#include +#include +#include + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#endif + +/** \addtogroup ARCHALG + * \addtogroup HELIUMALG Helium specific algorithm + * \ingroup ARCHALG + * @{ + */ + +#if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF) +template() && + IsVector::value && + SameElementType::value,bool>::type = true> +inline void _Fill(DST &v, + const T val, + const vector_length_t l, + const Helium* = nullptr) +{ + constexpr int nb_lanes = vector_traits::nb_lanes; + index_t i=0; + UNROLL_LOOP + for(i=0;i < l; i += nb_lanes) + { + v.vector_store_tail(i,l-i,inner::vconst_tail(val,inner::vctpq::mk(l-i))); + } +} + +template() && + must_use_matrix_idx() && + SameElementType::value,bool>::type = true> +inline void _Fill2D(DST &v, + const T val, + const vector_length_t rows, + const vector_length_t cols, + const Helium* = nullptr) +{ + constexpr int nb_lanes = vector_traits::nb_lanes; + + // Outer unroll factor in case inner loop does not have + // enough arithmetic instructions. 
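+  // (With the current setting U == 1 the outer loop below fills one row per
+  // iteration and the scalar row loop after it never has any work left to do;
+  // larger values of U would interleave the predicated stores of U rows.)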
+ // In future version this may be estimated from the + // complexity of the AST to evaluate + constexpr int U = 1; + index_t row=0; + + UNROLL_LOOP + for(; row <= rows-U;row += U) + { + + UNROLL_LOOP + for(index_t col=0; col < cols;col += nb_lanes) + { + for(int k=0;k::mk(cols-col))); + } + } + } + + for(; row < rows;row ++) + { + + UNROLL_LOOP + for(index_t col=0; col < cols;col += nb_lanes) + { + v.matrix_store_tail(row,col,cols-col,inner::vconst_tail(val,inner::vctpq::mk(cols-col))); + } + } +} + +template() && + vector_idx_pair(),bool>::type = true> +inline void eval(DA &v, + const DB& other, + const vector_length_t l, + const Helium* = nullptr) +{ + using T = typename traits::Scalar; + constexpr int nb_lanes = vector_traits::nb_lanes; + + index_t i=0; + + UNROLL_LOOP + for(i=0;i < l; i += nb_lanes) + { + v.vector_store_tail(i,l-i,other.vector_op_tail(i,l-i)); + } +} + + +template() && + must_use_matrix_idx_pair(),bool>::type = true> +inline void eval2D(DA &v, + const DB& other, + const vector_length_t rows, + const vector_length_t cols, + const Helium* = nullptr) +{ + using T = typename traits::Scalar; + constexpr int nb_lanes = vector_traits::nb_lanes; + // Attempt at computing the unrolling factor + // depending on the complexity of the AST + // (will have to rework this estimation) + constexpr int RU = 5 - Complexity::value; + constexpr int U = (RU <= 0) || (RU>=5) ? 1 : RU; + index_t row=0; + + UNROLL_LOOP + for(; row <= rows-U;row += U) + { + + UNROLL_LOOP + for(index_t col=0; col < cols;col += nb_lanes) + { + for(int k=0;k +void printt(const TupType& _tup, std::index_sequence) +{ + std::cout << "("; + (..., (std::cout << (I == 0? "" : ", ") << std::get(_tup))); + std::cout << ")\n"; +} + +template +void printt (const std::tuple& _tup) +{ + printt(_tup, std::make_index_sequence()); +} + +template() && + vector_idx_pair(),bool>::type = true> +inline DotResult _dot(const DA& a, + const DB& b, + const vector_length_t l, + const Helium* = nullptr) +{ + //using Res = DotResult; + // Vector scalar datatype + + using T = typename traits::Scalar; + using Temp = typename vector_traits::temp_accumulator; + + constexpr int nb_lanes = vector_traits::nb_lanes; + + Temp acc = vector_traits::temp_acc_zero(); + + UNROLL_LOOP + for(index_t i=0; i::mk(l-i)); + } + + return(inner::vreduce(acc)); +} + +template() && + vector_idx_pair(),bool>::type = true> +inline void _swap(DA&& a, + DB&& b, + const vector_length_t l, + const Helium* = nullptr) +{ + using Scalar = typename ElementType::type; + using Vector = typename vector_traits::vector; + + constexpr int nb_lanes = vector_traits::type>::nb_lanes; + index_t i=0; + Vector tmpa,tmpb; + + UNROLL_LOOP + for(i=0;i < l; i += nb_lanes) + { + tmpa = a.vector_op_tail(i,l-i); + tmpb = b.vector_op_tail(i,l-i); + b.vector_store_tail(i,l-i,tmpa); + a.vector_store_tail(i,l-i,tmpb); + } +} +#endif + +/*! 
@} */ + diff --git a/dsppp/Include/dsppp/Helium/float.hpp b/dsppp/Include/dsppp/Helium/float.hpp new file mode 100644 index 000000000..6f7861508 --- /dev/null +++ b/dsppp/Include/dsppp/Helium/float.hpp @@ -0,0 +1,426 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#endif + +/** \addtogroup HeliumNumber Helium specific number definitions + * \ingroup NUMBER + * @{ + * \addtogroup HeliumFloatNumber Float + * \ingroup HeliumNumber + * @{ + */ + +/****************** + * + * Helium + * + */ + +#if defined(ARM_MATH_MVEF) + +/* + + +Arch is deriving from Helium + +*/ +template +struct vector_traits::value>::type > +{ + typedef float type; + typedef float storage_type; + typedef float32x4_t vector; + typedef float32x4_t temp_accumulator; + typedef mve_pred16_t predicate_t; + static constexpr bool has_vector = true; + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + static constexpr bool has_predicate = true; + + static constexpr int nb_lanes = 4; + + static float32x4_t temp_acc_zero() + { + return(vdupq_n_f32(0.0f)); + } + + static constexpr float zero_lane() {return 0.0f;}; + // Useful in fixed point since lane value is an int and not a Q something + static constexpr float lane_value(const float x) {return x;}; + +}; + + + +namespace inner { + + template<> + struct vctpq { + static mve_pred16_t mk(uint32_t v) + { + return(vctp32q(v)); + }; + }; + + __STATIC_FORCEINLINE float32x4_t vconst(const float v) + { + return(vdupq_n_f32(v)); + } + + __STATIC_FORCEINLINE float32x4_t vconst_tail(const float v, + const mve_pred16_t p0) + { + return(vdupq_x_n_f32(v,p0)); + } + + __STATIC_FORCEINLINE float32x4_t vneg(const float32x4_t a) + { + return(vnegq(a)); + }; + + __STATIC_FORCEINLINE float32x4_t vneg(const float32x4_t a, + const mve_pred16_t p0) + { + return(vnegq_x(a,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vadd(const float32x4_t a,const float32x4_t b) + { + return(vaddq(a,b)); + }; + + + __STATIC_FORCEINLINE float32x4_t vadd(const float32x4_t a,const float b) + { + return(vaddq_n_f32(a,b)); + }; + + __STATIC_FORCEINLINE float32x4_t vadd(const float a,const float32x4_t b) + { + return(vaddq_n_f32(b,a)); + }; + + __STATIC_FORCEINLINE float32x4_t vadd(const float32x4_t a,const float32x4_t b, + const mve_pred16_t p0) + { + return(vaddq_x(a,b,p0)); + }; + + + __STATIC_FORCEINLINE float32x4_t vadd(const float32x4_t a,const float b, + const mve_pred16_t p0) + { + return(vaddq_x_n_f32(a,b,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vadd(const float a,const float32x4_t b, + const mve_pred16_t p0) + { + return(vaddq_x_n_f32(b,a,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vsub(const float32x4_t a,const float32x4_t b) + { + return(vsubq(a,b)); + }; + + __STATIC_FORCEINLINE float32x4_t vsub(const float32x4_t a,const float b) + { + return(vsubq_n_f32(a,b)); + }; + + __STATIC_FORCEINLINE float32x4_t vsub(const float a,const float32x4_t b) + { + return(vsubq_n_f32(b,a)); + }; + + __STATIC_FORCEINLINE float32x4_t vsub(const float32x4_t a,const float32x4_t b, + const mve_pred16_t p0) + { + return(vsubq_x(a,b,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vsub(const float32x4_t a,const float b, + const mve_pred16_t p0) + { + return(vsubq_x_n_f32(a,b,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vsub(const float a,const float32x4_t b, + const mve_pred16_t p0) + { + return(vsubq_x_n_f32(b,a,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vmul(const float32x4_t a,const float32x4_t b) + { + return(vmulq(a,b)); 
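+  // The scalar-operand overloads of vmul below map directly to the _n_ forms of the
+  // intrinsics (vmulq_n_f32 and, under predication, vmulq_x_n_f32), so mixed
+  // vector/scalar products do not need an explicit vconst.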
+ }; + + __STATIC_FORCEINLINE float32x4_t vmul(const float32x4_t a,const float b) + { + return(vmulq_n_f32(a,b)); + }; + + __STATIC_FORCEINLINE float32x4_t vmul(const float a,const float32x4_t b) + { + return(vmulq_n_f32(b,a)); + }; + + __STATIC_FORCEINLINE float32x4_t vmul(const float32x4_t a,const float32x4_t b, + const mve_pred16_t p0) + { + return(vmulq_x(a,b,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vmul(const float32x4_t a,const float b, + const mve_pred16_t p0) + { + return(vmulq_x_n_f32(a,b,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vmul(const float a,const float32x4_t b, + const mve_pred16_t p0) + { + return(vmulq_x_n_f32(b,a,p0)); + }; + + __STATIC_FORCEINLINE float32x4_t vmacc(const float32x4_t acc,const float32x4_t a,const float32x4_t b) + { + return(vfmaq(acc,a,b)); + }; + + __STATIC_FORCEINLINE float32x4_t vmacc(const float32x4_t acc,const float32x4_t a,const float_t b) + { + return(vfmaq(acc,a,b)); + }; + + __STATIC_FORCEINLINE float32x4_t vmacc(const float32x4_t acc,const float32x4_t a,const float32x4_t b, + const mve_pred16_t p0) + { + return(vfmaq_m(acc,a,b,p0)); + }; + + + + __STATIC_FORCEINLINE float vreduce(const float32x4_t in) + { + float acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) + + vgetq_lane(in, 2) + vgetq_lane(in, 3); + return(acc); + }; + + + + + template::type = true> + inline float32x4_t vload1(const float32_t *p) + { + return(vld1q(p)); + }; + + template1),bool>::type = true> + inline float32x4_t vload1(const float32_t *p) + { + constexpr uint32x4_t offset={0*S,1*S,2*S,3*S}; + return(vldrwq_gather_shifted_offset_f32(p,offset)); + }; + + + // With dynamic stride + inline float32x4_t vload1(const float32_t *p,const index_t stride) + { + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,stride); + return(vldrwq_gather_shifted_offset_f32(p,offset)); + }; + + + + template::type = true> + inline float32x4_t vload1_z(const float32_t *p,const std::size_t nb,const mve_pred16_t p0) + { + (void)nb; + return(vld1q_z(p,p0)); + }; + + template1),bool>::type = true> + inline float32x4_t vload1_z(const float32_t *p,const std::size_t nb,const mve_pred16_t p0) + { + (void)nb; + //uint32x4_t offset={0,1,2,3}; + //uint32x4_t offset = vidupq_u32((uint32_t)0,1); + //offset = vmulq_n_u32(offset,S); + constexpr uint32x4_t offset={0*S,1*S,2*S,3*S}; + return(vldrwq_gather_shifted_offset_z_f32(p,offset,p0)); + }; + + // With dynamic stride + inline float32x4_t vload1_z(const float32_t *p,const index_t stride,const std::size_t nb,const mve_pred16_t p0) + { + (void)nb; + //uint32x4_t offset={0,1,2,3}; + //uint32x4_t offset = vidupq_u32((uint32_t)0,1); + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,stride); + return(vldrwq_gather_shifted_offset_z_f32(p,offset,p0)); + }; + + /* Generalized stride */ + template + struct vload1_gen_stride + { + static float32x4_t run(const float32_t *p) + { + constexpr uint32x4_t offset={S...}; + return(vldrwq_gather_shifted_offset_f32(p,offset)); + }; + }; + + template<> + struct vload1_gen_stride<0,1,2,3> + { + inline float32x4_t run(const float32_t *p) + { + return(vld1q(p)); + }; + }; + + /* Generalized stride */ + template + struct vload1_gen_stride_z + { + inline float32x4_t run(const float32_t *p,const std::size_t nb,const mve_pred16_t p0) + { + constexpr uint32x4_t offset={S...}; + (void)nb; + return(vldrwq_gather_shifted_offset_z_f32(p,offset,p0)); + }; + }; + + template<> + struct vload1_gen_stride_z<0,1,2,3> + { + inline float32x4_t run(const float32_t *p,const std::size_t 
nb,const mve_pred16_t p0) + { + (void)nb; + return(vld1q_z(p,p0)); + }; + }; + + template::type = true> + inline void vstore1(float32_t *p,const float32x4_t val) + { + vst1q(p,val); + }; + + template1),bool>::type = true> + inline void vstore1(float32_t *p,const float32x4_t val) + { + //uint32x4_t offset={0,1,2,3}; + //uint32x4_t offset = vidupq_u32((uint32_t)0,1); + //offset = vmulq_n_u32(offset,S); + constexpr uint32x4_t offset={0*S,1*S,2*S,3*S}; + vstrwq_scatter_shifted_offset_f32(p,offset,val); + }; + + // with dynamic stride + inline void vstore1(float32_t *p,const index_t stride,const float32x4_t val) + { + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,stride); + vstrwq_scatter_shifted_offset_f32(p,offset,val); + }; + + + template::type = true> + inline void vstore1_z(float32_t *p,const float32x4_t val,const std::size_t nb,const mve_pred16_t p0) + { + (void)nb; + vstrwq_p(p,val,p0); + }; + + template1),bool>::type = true> + inline void vstore1_z(float32_t *p,const float32x4_t val,const std::size_t nb,const mve_pred16_t p0) + { + (void)nb; + //uint32x4_t offset={0,1,2,3}; + //uint32x4_t offset = vidupq_u32((uint32_t)0,1); + //offset = vmulq_n_u32(offset,S); + constexpr uint32x4_t offset={0*S,1*S,2*S,3*S}; + vstrwq_scatter_shifted_offset_p_f32(p,offset,val,p0); + }; + + // with dynamic stride + inline void vstore1_z(float32_t *p,const index_t stride,const float32x4_t val,const std::size_t nb,const mve_pred16_t p0) + { + (void)nb; + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,stride); + vstrwq_scatter_shifted_offset_p_f32(p,offset,val,p0); + }; + + // Generalized stride + template + struct vstore1_gen_stride + { + static void run(float32_t *p,const float32x4_t val) + { + constexpr uint32x4_t offset={S...}; + vstrwq_scatter_shifted_offset_f32(p,offset,val); + }; + }; + + template<> + struct vstore1_gen_stride<0,1,2,3> + { + static void run(float32_t *p,const float32x4_t val) + { + vst1q(p,val); + }; + }; + + template + struct vstore1_gen_stride_z + { + static void vstore1_z(float32_t *p,const float32x4_t val,const std::size_t nb,const mve_pred16_t p0) + { + constexpr uint32x4_t offset={S...}; + (void)nb; + vstrwq_scatter_shifted_offset_p_f32(p,offset,val,p0); + } + }; + + template<> + struct vstore1_gen_stride_z<0,1,2,3> + { + static void vstore1_z(float32_t *p,const float32x4_t val,const std::size_t nb,const mve_pred16_t p0) + { + (void)nb; + vstrwq_p(p,val,p0); + } + + }; + + + +}; + +#endif + +/*! @} */ +/*! 
@} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/Helium/half.hpp b/dsppp/Include/dsppp/Helium/half.hpp new file mode 100644 index 000000000..0df7f2418 --- /dev/null +++ b/dsppp/Include/dsppp/Helium/half.hpp @@ -0,0 +1,520 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#define ARM_MATH_MVE_FLOAT16 +#endif + +/** \addtogroup HeliumNumber Helium specific number definitions + * \ingroup NUMBER + * @{ + * \addtogroup HeliumHalfNumber Half + * \ingroup HeliumNumber + * @{ + */ + +#if defined(ARM_MATH_MVE_FLOAT16) +template +struct vector_traits::value>::type> +{ + typedef float16_t type; + typedef float16_t storage_type; + typedef float16x8_t vector; + typedef float16x8_t temp_accumulator; + typedef mve_pred16_t predicate_t; + + static constexpr bool has_vector = true; + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + static constexpr bool has_predicate = true; + + static constexpr int nb_lanes = 8; + + static float16x8_t temp_acc_zero() + { + return(vdupq_n_f16(0.0f)); + } + + static constexpr float16_t zero_lane() {return 0.0f;}; + // Useful in fixed point since lane value is an int and not a Q something + static constexpr float16_t lane_value(const float16_t x) {return x;}; + +}; + + +namespace inner { + + + template<> + struct vctpq{ + static mve_pred16_t mk(uint32_t v) + + { + return(vctp16q(v)); + }; + }; + + __STATIC_FORCEINLINE float16x8_t vconst(float16_t v) + { + return(vdupq_n_f16(v)); + } + + __STATIC_FORCEINLINE float16x8_t vconst_tail(const float16_t v, + const mve_pred16_t p0) + { + return(vdupq_x_n_f16(v,p0)); + } + + __STATIC_FORCEINLINE float16x8_t vneg(const float16x8_t a) + { + return(vnegq(a)); + }; + + __STATIC_FORCEINLINE float16x8_t vneg(const float16x8_t a, + const mve_pred16_t p0) + { + return(vnegq_x(a,p0)); + }; + + /* + + ADD + + */ + + __STATIC_FORCEINLINE float16x8_t vadd(const float16x8_t a, + const float16x8_t b) + { + return(vaddq(a,b)); + }; + + __STATIC_FORCEINLINE float16x8_t vadd(const float16x8_t a, + const float16_t b) + { + return(vaddq_n_f16(a,b)); + }; + + __STATIC_FORCEINLINE float16x8_t vadd(const float16_t a, + const float16x8_t b) + { + return(vaddq_n_f16(b,a)); + }; + + __STATIC_FORCEINLINE float16x8_t vadd(const float16x8_t a, + const float16x8_t b, + const mve_pred16_t p0) + { + return(vaddq_x(a,b,p0)); + }; + + __STATIC_FORCEINLINE float16x8_t vadd(const float16x8_t a, + const float16_t b, + const mve_pred16_t p0) + { + return(vaddq_x_n_f16(a,b,p0)); + }; + + __STATIC_FORCEINLINE float16x8_t vadd(const float16_t a, + const float16x8_t b, + const mve_pred16_t p0) + { + return(vaddq_x_n_f16(b,a,p0)); + }; + + /* + + SUB + + */ + + __STATIC_FORCEINLINE float16x8_t vsub(const float16x8_t a, + const float16x8_t b) + { + return(vsubq(a,b)); + }; + + __STATIC_FORCEINLINE float16x8_t vsub(const float16x8_t a, + const float16_t b) + { + return(vsubq_n_f16(a,b)); + }; + + __STATIC_FORCEINLINE float16x8_t vsub(const float16_t a, + const float16x8_t b) + { + return(vsubq_n_f16(b,a)); + }; + + __STATIC_FORCEINLINE float16x8_t vsub(const float16x8_t a, + const float16x8_t b, + const mve_pred16_t p0) + { + return(vsubq_x(a,b,p0)); + }; + + __STATIC_FORCEINLINE float16x8_t vsub(const float16x8_t a, + const float16_t b, + const mve_pred16_t p0) + { + return(vsubq_x_n_f16(a,b,p0)); + }; + + __STATIC_FORCEINLINE float16x8_t vsub(const float16_t a, + const float16x8_t b, + const mve_pred16_t p0) + { + return(vsubq_x_n_f16(b,a,p0)); + }; + + /* + + 
MUL + + */ + + __STATIC_FORCEINLINE float16x8_t vmul(const float16x8_t a, + const float16x8_t b) + { + return(vmulq(a,b)); + }; + + __STATIC_FORCEINLINE float16x8_t vmul(const float16x8_t a, + const float16_t b) + { + return(vmulq_n_f16(a,b)); + }; + + __STATIC_FORCEINLINE float16x8_t vmul(const float16_t a, + const float16x8_t b) + { + return(vmulq_n_f16(b,a)); + }; + + __STATIC_FORCEINLINE float16x8_t vmul(const float16x8_t a, + const float16x8_t b, + const mve_pred16_t p0) + { + return(vmulq_x(a,b,p0)); + }; + + __STATIC_FORCEINLINE float16x8_t vmul(const float16x8_t a, + const float16_t b, + const mve_pred16_t p0) + { + return(vmulq_x_n_f16(a,b,p0)); + }; + + __STATIC_FORCEINLINE float16x8_t vmul(const float16_t a, + const float16x8_t b, + const mve_pred16_t p0) + { + return(vmulq_x_n_f16(b,a,p0)); + }; + + /* + + vmacc + + */ + + __STATIC_FORCEINLINE float16x8_t vmacc(const float16x8_t acc, + const float16x8_t a, + const float16x8_t b) + { + return(vfmaq(acc,a,b)); + }; + + __STATIC_FORCEINLINE float16x8_t vmacc(const float16x8_t acc, + const float16x8_t a, + const float16_t b) + { + return(vfmaq(acc,a,b)); + }; + + __STATIC_FORCEINLINE float16x8_t vmacc(const float16x8_t acc, + const float16x8_t a, + const float16x8_t b, + const mve_pred16_t p0) + { + return(vfmaq_m(acc,a,b,p0)); + }; + + + + __STATIC_FORCEINLINE float16_t vreduce(float16x8_t in) + { + float16x8_t tmpVec; + _Float16 acc; + + tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in); + in = vaddq_f16(tmpVec, in); + tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in); + in = vaddq_f16(tmpVec, in); + acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4); + + return acc; + }; + + /* + + Load + + */ + + template::type = true> + inline float16x8_t vload1(const float16_t *p) + { + return(vld1q(p)); + }; + + template1) && (S<=65535),bool>::type = true> + inline float16x8_t vload1(const float16_t *p) + { + constexpr uint16x8_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S}; + return(vldrhq_gather_shifted_offset_f16(p,offset)); + }; + + template65535),bool>::type = true> + inline float16x8_t vload1(const float16_t *p) + { + float16x8_t res; + for(std::size_t i=0;i<8;i++) + { + res[i] = *p; + p += S; + } + + return(res); + }; + + // With dynamic stride + inline float16x8_t vload1(const float16_t *p,const index_t stride) + { + if (stride <= 65535) + { + uint16x8_t offset = vidupq_u16((uint32_t)0,1); + offset = vmulq_n_u16(offset,stride); + return(vldrhq_gather_shifted_offset_f16(p,offset)); + } + else + { + float16x8_t res; + for(std::size_t i=0;i<8;i++) + { + res[i] = *p; + p += stride; + } + return(res); + } + }; + + + + template::type = true> + inline float16x8_t vload1_z(const float16_t *p, + const std::size_t nb, + const mve_pred16_t p0) + { + (void)nb; + return(vld1q_z(p,p0)); + }; + + template1)&& (S<=65535),bool>::type = true> + inline float16x8_t vload1_z(const float16_t *p, + const std::size_t nb, + const mve_pred16_t p0) + { + (void)nb; + constexpr uint16x8_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S}; + return(vldrhq_gather_shifted_offset_z_f16(p,offset,p0)); + }; + + template65535),bool>::type = true> + inline float16x8_t vload1_z(const float16_t *p,std::size_t nb,mve_pred16_t p0) + { + (void)p0; + float16x8_t res; + std::size_t i=0; + for(;i::type = true> + inline void vstore1(float16_t *p,const float16x8_t val) + { + vst1q(p,val); + }; + + template1) && (S<=65535),bool>::type = true> + inline void vstore1(float16_t *p,const float16x8_t val) + { + constexpr uint16x8_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S}; + 
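+    // The offsets are element indices: the "shifted offset" scatter scales each one
+    // by the element size (2 bytes for f16) before adding it to p, so lane k of val
+    // is written to p[k*S].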
vstrhq_scatter_shifted_offset_f16(p,offset,val); + }; + + template65535),bool>::type = true> + inline void vstore1(float16_t *p,const float16x8_t val) + { + for(std::size_t i=0;i<8;i++) + { + *p = val[i]; + p += S; + } + + }; + + // dynamic stride + inline void vstore1(float16_t *p, + const index_t stride, + const float16x8_t val) + { + if (stride <=65535) + { + uint16x8_t offset = vidupq_u16((uint32_t)0,1); + offset = vmulq_n_u16(offset,stride); + vstrhq_scatter_shifted_offset_f16(p,offset,val); + } + else + { + for(std::size_t i=0;i<8;i++) + { + *p = val[i]; + p += stride; + } + } + } + + template::type = true> + inline void vstore1_z(float16_t *p, + const float16x8_t val, + std::size_t nb, + mve_pred16_t p0) + { + (void)nb; + vstrhq_p(p,val,p0); + }; + + template1) && (S<=65535),bool>::type = true> + inline void vstore1_z(float16_t *p, + const float16x8_t val, + std::size_t nb, + mve_pred16_t p0) + { + (void)nb; + + constexpr uint16x8_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S}; + vstrhq_scatter_shifted_offset_p_f16(p,offset,val,p0); + }; + + + template65535),bool>::type = true> + inline void vstore1_z(float16_t *p, + const float16x8_t val, + std::size_t nb, + mve_pred16_t p0) + { + (void)p0; + for(std::size_t i=0;i +inline void _dot_m_v(RES &res, + const M&m,const V&v, + const Helium* = nullptr) +{ + + const vector_length_t nb_rows=m.rows(); + constexpr int U = 4; + + index_t row=0; + + DISABLE_LOOP_UNROLL + for(; row<=nb_rows-U; row += U) + { + results([&res,&row](index_t k){return &res[row+k];}) = + inner::from_accumulator(dot(unroll( + [&row,&m](index_t k){return m.row(row+k);}), + replicate(v) + )); + } + + switch (nb_rows-row) + { + case 3: + results<3>([&res,row](index_t k){return &res[row+k];}) = + inner::from_accumulator(dot(unroll<3>( + [row,&m](index_t k){return m.row(row+k);}), + replicate<3>(v) + )); + break; + case 2: + results<2>([&res,row](index_t k){return &res[row+k];}) = + inner::from_accumulator(dot(unroll<2>( + [row,&m](index_t k){return m.row(row+k);}), + replicate<2>(v) + )); + break; + case 1: + res[row] = inner::from_accumulator(dot(m.row(row),v)); + break; + } + +} + +#define MATRIX_DIM2 2 +#define MATRIX_DIM3 3 +#define MATRIX_DIM4 4 + +#if defined(ARM_MATH_MVEI) + +/* Fixed point specific cases*/ +#include "matrix_multiply_fixed.hpp" + +#endif + +#if defined(ARM_MATH_MVEF) + +/* Datatype specific cases*/ +#include "matrix_multiply_f16.hpp" +#include "matrix_multiply_f32.hpp" + +/* Generic float */ +template() && + number_traits::Scalar>::is_float,bool>::type = true> +__STATIC_INLINE void _dot_m_m(const MA&pSrcA,const MB&pSrcB, + RES &&pDst, + const Helium* = nullptr) + { + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + constexpr int nb_lanes = vector_traits::nb_lanes; + + T *pInB = pSrcB.ptr(); /* input data matrix pointer B */ + T *pInA = pSrcA.ptr(); /* input data matrix pointer A */ + T *pOut = pDst.ptr(); /* output data matrix pointer */ + int numRowsA = pSrcA.rows(); /* number of rows of input matrix A */ + int numColsB = pSrcB.columns(); /* number of columns of input matrix B */ + int numColsA = pSrcA.columns(); /* number of columns of input matrix A */ + uint32_t blkCnt; /* loop counters */ + uint32_t i; + + { + /* small squared matrix specialized routines */ + if(numRowsA == numColsB && numColsB == numColsA) { + if (numRowsA == 1) + { + pDst(0,0)= pSrcA(0,0) * pSrcB(0,0); + return; + } + else if(numRowsA == 2) + return _arm_mat_mult_2x2_mve(pSrcA, pSrcB, 
std::forward(pDst)); + else if(numRowsA == 3) + return _arm_mat_mult_3x3_mve(pSrcA, pSrcB, std::forward(pDst)); + else if(numRowsA == 4) + return _arm_mat_mult_4x4_mve(pSrcA, pSrcB, std::forward(pDst)); + } + + /* main loop process 4 rows */ + i = numRowsA >> 2; + while (i > 0U) + { + T *pInA0, *pInA1, *pInA2, *pInA3; + T *pInB0; + T *pOut0, *pOut1, *pOut2, *pOut3; + ACC vecMac0, vecMac1, vecMac2, vecMac3; + VEC vecInB; + + /* pointers to 4 consecutive output rows */ + pOut0 = pOut; + pOut1 = pOut0 + pDst.stride(); + pOut2 = pOut1 + pDst.stride(); + pOut3 = pOut2 + pDst.stride(); + pInB0 = pInB; + + uint32_t k = numColsB / nb_lanes; + while (k > 0U) + { + /* pointers to 4 consecutive Matrix A rows */ + pInA0 = pInA; + pInA1 = pInA0 + pSrcA.stride(); + pInA2 = pInA1 + pSrcA.stride(); + pInA3 = pInA2 + pSrcA.stride(); + + vecMac0 = vector_traits::temp_acc_zero(); + vecMac1 = vector_traits::temp_acc_zero(); + vecMac2 = vector_traits::temp_acc_zero(); + vecMac3 = vector_traits::temp_acc_zero(); + + blkCnt = numColsA; + + while (blkCnt > 0U) + { + /* + * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3} + */ + vecInB = inner::vload1<1>(pInB0); /* vldrwq_f32(pInB0, 0); */ + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + vecMac1 = inner::vmacc(vecMac1, vecInB, *pInA1++); + vecMac2 = inner::vmacc(vecMac2, vecInB, *pInA2++); + vecMac3 = inner::vmacc(vecMac3, vecInB, *pInA3++); + + pInB0 = pInB0 + pSrcB.stride(); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* Store the results (4 x 4 block) in the destination buffer */ + inner::vstore1<1>(pOut0, vecMac0); + pOut0 += nb_lanes; + inner::vstore1<1>(pOut1, vecMac1); + pOut1 += nb_lanes; + inner::vstore1<1>(pOut2, vecMac2); + pOut2 += nb_lanes; + inner::vstore1<1>(pOut3, vecMac3); + pOut3 += nb_lanes; + + /* + * rewind + */ + pInB0 -= (pSrcB.stride() * numColsA) - nb_lanes; + k--; + } + + int colBLeft = numColsB & (nb_lanes - 1); + if (colBLeft) + { + pInA0 = pInA; + pInA1 = pInA0 + pSrcA.stride(); + pInA2 = pInA1 + pSrcA.stride(); + pInA3 = pInA2 + pSrcA.stride(); + + mve_pred16_t p0 = inner::vctpq::mk(colBLeft); + + vecMac0 = vector_traits::temp_acc_zero(); + vecMac1 = vector_traits::temp_acc_zero(); + vecMac2 = vector_traits::temp_acc_zero(); + vecMac3 = vector_traits::temp_acc_zero(); + + blkCnt = numColsA; + + while (blkCnt > 0U) + { + /* + * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3} + */ + vecInB = inner::vload1_z<1>(pInB0, colBLeft,p0); + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + vecMac1 = inner::vmacc(vecMac1, vecInB, *pInA1++); + vecMac2 = inner::vmacc(vecMac2, vecInB, *pInA2++); + vecMac3 = inner::vmacc(vecMac3, vecInB, *pInA3++); + + pInB0 = pInB0 + pSrcB.stride(); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* Store the results (4 x colBLeft block) in the destination buffer */ + inner::vstore1_z<1>(pOut0, vecMac0, colBLeft,p0); + inner::vstore1_z<1>(pOut1, vecMac1, colBLeft,p0); + inner::vstore1_z<1>(pOut2, vecMac2, colBLeft,p0); + inner::vstore1_z<1>(pOut3, vecMac3, colBLeft,p0); + } + + /* move to next rows */ + pInA += 4 * pSrcA.stride(); + pOut += 4 * pDst.stride(); + i--; + } + + /* + * non multiple of 4 rows for Matrix A + * process single row + */ + if (numRowsA & 3) + { + i = numRowsA & 3; + while (i > 0U) + { + T *pInA0; + T *pInB0; + T *pOut0; + VEC vecInB; + ACC vecMac0; + + pOut0 = pOut; + pInB0 = pInB; + + uint32_t k = numColsB / nb_lanes; + while (k > 0U) + { + pInA0 = pInA; + + vecMac0 = vector_traits::temp_acc_zero(); + blkCnt = numColsA; + while (blkCnt > 0U) 
+ { + /* + * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3} + */ + vecInB = inner::vload1<1>(pInB0); /* vldrwq_f32(pInB0, 0); */ + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + + pInB0 = pInB0 + pSrcB.stride(); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* Store the results (1 x 4 block) in the destination buffer */ + inner::vstore1<1>(pOut0, vecMac0); + pOut0 += nb_lanes; + + /* + * rewind + */ + pInB0 -= (pSrcB.stride() * numColsA) - nb_lanes; + k--; + } + + int colBLeft = numColsB & (nb_lanes-1); + if (colBLeft) + { + pInA0 = pInA; + mve_pred16_t p0 = inner::vctpq::mk(colBLeft); + + vecMac0 = vector_traits::temp_acc_zero(); + blkCnt = numColsA; + while (blkCnt > 0U) + { + /* + * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3} + */ + vecInB = inner::vload1_z<1>(pInB0, colBLeft,p0); + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + + pInB0 = pInB0 + pSrcB.stride(); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* Store the results (1 x colBLeft block) in the destination buffer */ + inner::vstore1_z<1>(pOut0, vecMac0, colBLeft,p0); + } + + /* move to next row */ + pInA += 1 * pSrcA.stride(); + pOut += 1 * pDst.stride(); + i--; + } + + } + +} + +} + + +#undef MATRIX_DIM2 +#undef MATRIX_DIM3 +#undef MATRIX_DIM4 + +#endif + +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/Helium/matrix_multiply_f16.hpp b/dsppp/Include/dsppp/Helium/matrix_multiply_f16.hpp new file mode 100644 index 000000000..3671160f9 --- /dev/null +++ b/dsppp/Include/dsppp/Helium/matrix_multiply_f16.hpp @@ -0,0 +1,404 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#define ARM_MATH_MVE_FLOAT16 +#endif + +/** \addtogroup HELIUMALG + * @{ + */ + +#if defined(ARM_MATH_MVE_FLOAT16) + +/* + +This can't be used with stride bigger than 21845 +which for embedded is acceptable. + +No check is done at runtime or build time that the stride is not +too big. 
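+
+(The gather/scatter offsets are built in uint16_t lanes and the kernels below step
+them by up to three strides, so roughly 3*stride must fit in 16 bits:
+65535/3 = 21845.)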
+ +*/ + +template() && + SameElementType::value,bool>::type = true> +__STATIC_INLINE void _arm_mat_mult_2x2_mve( + const MA &pSrcA, + const MB &pSrcB, + RES &&pDst) +{ + using T = typename traits::Scalar; + //using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + + const uint16_t offsetA[8] = { 0, 0, (uint16_t)pSrcA.stride(), (uint16_t)pSrcA.stride(), + 0, 0, (uint16_t)pSrcA.stride(), (uint16_t)pSrcA.stride() }; + /* offsetB allows to read and duplicate 1 row of B */ + const uint16_t offsetB[8] = { 0, 1, 0, 1, 0, 1, 0, 1 }; + + /* {d00, d01, d10, d11} */ + const uint16_t offsetD[8] = { 0, 1, (uint16_t)pDst.stride(), (uint16_t)(pDst.stride()+1), + 0,0,0,0 }; + + uint16x8_t vecOffsA, vecOffsB,vecOffsD; + VEC vecInA, vecInB, vecDst; + T *pOut = pDst.ptr(); /* output data matrix pointer */ + + /* + * load initial offsets + */ + vecOffsA = vldrhq_u16((uint16_t const *) offsetA); + vecOffsB = vldrhq_u16((uint16_t const *) offsetB); + /* + * load {a00 a00 a10 a10 x x x x } + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * load {b00 b01 b00 b01 x x x x } + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + /* + * { a00 b00 a00 b01 + * a10 b00 a10 b01 + * x x + * x x } + */ + vecDst = vmulq(vecInA, vecInB); + /* + * move to 2nd column of matrix A + */ + vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1); + /* + * load {a01 a01 a11 a11 x x x x} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * move to next B row + */ + vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) pSrcB.stride()); + /* + * load {b10, b11, b10, b11, x x x x } + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + /* + * { a00 b00 + a01 b10 a00 b01 + a01 b11 + * a10 b00 + a11 b10 a10 b01 + a11 b11 + * x x + * x x } + */ + vecDst = vfmaq(vecDst, vecInA, vecInB); + + mve_pred16_t p0 = vctp16q(2*2); + /* + * Store the result in the destination buffer + * (lower half of the vector) + */ + + vecOffsD = vldrhq_u16((uint16_t const *) offsetD); + + vstrhq_scatter_shifted_offset_p(pOut,vecOffsD,vecDst,p0); + +} + + +template() && + SameElementType::value,bool>::type = true> +__STATIC_INLINE void _arm_mat_mult_3x3_mve( + const MA &pSrcA, + const MB &pSrcB, + RES &&pDst) +{ + const uint16_t offsetA[8] = { 0, 0, 0, + (uint16_t)pSrcA.stride(), (uint16_t)pSrcA.stride(), (uint16_t)pSrcA.stride(), + (uint16_t)(2U*pSrcA.stride()), (uint16_t)(2U*pSrcA.stride()) }; + /* offsetB allows to read and duplicate 1 row of B */ + const uint16_t offsetB[8] = { 0, 1, 2, 0, 1, 2, 0, 1 }; + const uint16_t offsetD[8] = { 0, 1, 2, + (uint16_t)(0+pDst.stride()), (uint16_t)(1+pDst.stride()), + (uint16_t)(2+pDst.stride()), + (uint16_t)(0+2*pDst.stride()), + (uint16_t)(1+2*pDst.stride()) }; + + uint16x8_t vecOffsA, vecOffsB,vecOffsD; + float16x8_t vecInA, vecInB, vecDst; + float16_t *pOut = pDst.ptr(); /* output data matrix pointer */ + + /* + * load initial offsets + */ + vecOffsA = vldrhq_u16((uint16_t const *) offsetA); + vecOffsB = vldrhq_u16((uint16_t const *) offsetB); + + /* + * load {a00 a00 a00 a10 a10 a10 a20 a20} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * load {b00 b01 b02 b00 b01 b02 b00 b01} + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + /* + * { a00 b00 a00 b01 a00 b02 + * a10 b00 a10 b01 a10 b02 + * a20 b00 a20 b01} + */ + vecDst = vmulq(vecInA, vecInB); + + /* + * move to 2nd column of matrix A + */ + vecOffsA = vaddq_n_u16(vecOffsA, 
(uint16_t) 1); + /* + * load {a01 a01 a01 a11 a11 a11 a21 a21} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * move to next B row + */ + vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) pSrcB.stride()); + /* + * load {b10, b11, b12, b10, b11, b12, b10, b11} + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + /* + * { a00 b00 + a01 b10 a00 b01 + a01 b11 a00 b02 + a01 b12 + * a10 b00 + a11 b10 a10 b01 + a11 b11 a10 b02 + a11 b12 + * a20 b00 + a21 b10 a20 b01 + a21 b11 } + */ + vecDst = vfmaq(vecDst, vecInA, vecInB); + /* + * move to 3rd column of matrix A + */ + vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1); + /* + * load {a02 a02 a02 a12 a12 a12 a22 a22} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * move to next B row + */ + vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) pSrcB.stride()); + /* + * load {b20, b21, b22, b20, b21, b22, b20, b21} + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + /* + * {a00 b00 + a01 b10 + a02 b20 a00 b01 + a01 b11 + a02 b21 a00 b02 + a01 b12 + a02 b22}, + * a10 b00 + a11 b10 + a12 b20 a10 b01 + a11 b11 + a12 b21 a10 b02 + a11 b12 + a12 b22}, + * a20 b00 + a21 b10 + a22 b20 a20 b01 + a21 b11 + a22 b21 } + */ + vecDst = vfmaq(vecDst, vecInA, vecInB); + + /* + * Store the result in the destination buffer + */ + vecOffsD = vldrhq_u16((uint16_t const *) offsetD); + + vstrhq_scatter_shifted_offset(pOut,vecOffsD,vecDst); + + pOut += 2*pDst.stride()+2; + + /* last element computed in scalar mode + * a20 b02 + a21 b12 + a22 b22 + */ + + const _Float16 * pA = (const _Float16 *)pSrcA.const_ptr(); + const _Float16 * pB = (const _Float16 *)pSrcB.const_ptr(); + const index_t sa =pSrcA.stride(); + const index_t sb =pSrcB.stride(); + *pOut = pA[2*sa] * pB[2] + pA[1+2*sa] * pB[2+sb] + pA[2+2*sa] * pB[2+2*sb]; + +} + + + +template() && + SameElementType::value,bool>::type = true> +__STATIC_INLINE void _arm_mat_mult_4x4_mve( + const MA &pSrcA, + const MB &pSrcB, + RES &&pDst) +{ + /* offsetA allows to read and duplicate 2 successive column elements of A */ + const uint16_t offsetA[8] = { 0, 0, 0, 0, + (uint16_t)pSrcA.stride(), (uint16_t)pSrcA.stride(), (uint16_t)pSrcA.stride(), (uint16_t)pSrcA.stride() }; + /* offsetB allows to read and duplicate 1 row of B */ + const uint16_t offsetB[8] = { 0, 1, 2, 3, 0, 1, 2, 3 }; + + const uint16_t offsetD[8] = { 0, 1, 2, 3, + (uint16_t)(0+pDst.stride()), (uint16_t)(1+pDst.stride()), + (uint16_t)(2+pDst.stride()), (uint16_t)(3+pDst.stride()) }; + + uint16x8_t vecOffsA, vecOffsB,vecOffsD; + float16x8_t vecInA, vecInB, vecDst0, vecDst1; + float16_t *pOut = pDst.ptr(); /* output data matrix pointer */ + + /* + * load initial offsets + */ + vecOffsA = vldrhq_u16((uint16_t const *) offsetA); + vecOffsB = vldrhq_u16((uint16_t const *) offsetB); + + /* + * load {a00 a00 a00 a00 a10 a10 a10 a10} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * load {b00 b01 b02 b03 b00 b01 b02 b03} + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + + /* + * { a00 b00 a00 b01 a00 b02 a00 b03 + * a10 b00 a10 b01 a10 b02 a10 b03 } + */ + vecDst0 = vmulq(vecInA, vecInB); + /* + * jump 2 x A rows (2nd half of matrix) + */ + vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) pSrcA.stride()*2); + /* + * load {a20 a20 a20 a20 a30 a30 a30 a30} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * { a20 b00 a20 b01 a20 b02 a20 b03 + * a30 b00 a30 b01 a30 b02 + a31 b12 } + */ + 
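+   // Each 8-lane f16 vector holds two rows of the 4x4 result (4 columns each):
+   // vecDst0 accumulates rows 0-1 and vecDst1 rows 2-3, so the full product is
+   // built with two accumulators and four passes over the columns of A.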
vecDst1 = vmulq(vecInA, vecInB); + /* + * rewind back to top half of the A matrix (2nd column) + */ + vecOffsA = vsubq(vecOffsA, (uint16_t) (2*pSrcA.stride()-1)); + /* + * load {a01 a01 a01 a01 a11 a11 a11 a11} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + + /* + * move to next B row + */ + vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) pSrcB.stride()); + /* + * load {b10, b11, b12, b13, b10, b11, b12, b13} + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + /* + * { a00 b00 + a01 b10 a00 b01 + a01 b11 a00 b02 + a01 b12 a00 b03 + a01 b13 + * a10 b00 + a11 b10 a10 b01 + a11 b11 a10 b02 + a11 b12 a10 b03 + a11 b13 } + */ + vecDst0 = vfmaq(vecDst0, vecInA, vecInB); + /* + * jump 2 x A rows (2nd half of matrix) + */ + vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) pSrcA.stride()*2); + /* + * load {a21 a21 a21 a21 a31 a31 a31 a31} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * {a20 b00 + a21 b10 a20 b01 + a21 b11 a20 b02 + a21 b12 a20 b03 + a21 b13 + * a30 b00 + a31 b10 a30 b01 + a31 b11 a30 b02 + a31 b12 a30 b03 + a31 b13 } + */ + vecDst1 = vfmaq(vecDst1, vecInA, vecInB); + + /* + * rewind back to top half of the A matrix (3rd column) + */ + vecOffsA = vsubq(vecOffsA, (uint16_t) (2*pSrcA.stride()-1)); + /* + * load {a02 a02 a02 a02 a12 a12 a12 a12} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * move to next B row + */ + vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) pSrcB.stride()); + /* + * load {b20, b21, b22, b23, b20, b21, b22, b23} + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + /* + * { a00 b00 + a01 b10 + a02 b20 a00 b01 + a01 b11 + a02 b21 a00 b02 + a01 b12 + a02 b22 a00 b03 + a01 b13 + a02 b23 + * a10 b00 + a11 b10 + a12 b20 a10 b01 + a11 b11 + a12 b21 a10 b02 + a11 b12 + a12 b22 a10 b03 + a11 b13 + a12 b23 } + */ + vecDst0 = vfmaq(vecDst0, vecInA, vecInB); + /* + * jump 2 x A rows + */ + vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 2*pSrcA.stride()); + + /* + * load {a22 a22 a22 a22 a32 a32 a32 a32} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * {a20 b00 + a21 b10 + a22 b20 a20 b01 + a21 b11 + a22 b21 a20 b02 + a21 b12 + a22 b22 a20 b03 + a21 b13 + a22 b23 + * a30 b00 + a31 b10 + a32 b20 a30 b01 + a31 b11 + a32 b21 a30 b02 + a31 b12 + a32 b22 a30 b03 + a31 b13 + a32 b23 } + */ + vecDst1 = vfmaq(vecDst1, vecInA, vecInB); + + /* + * rewind back to top half of the A matrix (4th column) + */ + vecOffsA = vsubq(vecOffsA, (uint16_t) (2*pSrcA.stride()-1)); + /* + * load {a03 a03 a03 a03 a13 a13 a13 a13} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * move to next B row + */ + vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) pSrcB.stride()); + /* + * load {b30, b31, b32, b33, b30, b31, b32, b33} + */ + vecInB = vldrhq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + /* + * { a00 b00 +...+ a03 b30, a00 b01 +...+ a03 b31, a00 b02 +...+ a03 b32, a00 b03 +...+ a03 b33 + * a10 b00 +...+ a13 b30, a10 b01 +...+ a13 b31, a10 b02 +...+ a13 b32, a10 b03 +...+ a13 b33 } + */ + vecDst0 = vfmaq(vecDst0, vecInA, vecInB); + /* + * jump 2 x A rows + */ + vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) pSrcA.stride()*2); + /* + * load {a23 a23 a23 a23 a33 a33 a33 a33} + */ + vecInA = vldrhq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + /* + * {a20 b00 +...+ a23 b30, a20 b01 +...+ a23 b31, a20 b02 +...+ a23 b32, a20 b03 +...+ a23 b33 + * a30 b00 +...+ a33 b30, a30 b01 +...+ a33 b31, a30 
b02 +...+ a33 b32, a30 b03 +...+ a33 b33 } + */ + vecDst1 = vfmaq(vecDst1, vecInA, vecInB); + + /* + * Store the result in the destination buffer + */ + vecOffsD = vldrhq_u16((uint16_t const *) offsetD); + vstrhq_scatter_shifted_offset(pOut,vecOffsD,vecDst0); + pOut += 2*pDst.stride(); + vstrhq_scatter_shifted_offset(pOut,vecOffsD,vecDst1); + +} + +#endif + +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/Helium/matrix_multiply_f32.hpp b/dsppp/Include/dsppp/Helium/matrix_multiply_f32.hpp new file mode 100644 index 000000000..ecdfbc6c2 --- /dev/null +++ b/dsppp/Include/dsppp/Helium/matrix_multiply_f32.hpp @@ -0,0 +1,270 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#define ARM_MATH_MVE_FLOAT16 +#endif + +/** \addtogroup HELIUMALG + * @{ + */ + +template() && + SameElementType::value,bool>::type = true> +__STATIC_INLINE void _arm_mat_mult_2x2_mve( + const MA &pSrcA, + const MB &pSrcB, + RES &&pDst) +{ + using T = typename traits::Scalar; + //using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + + /* {a00, a00, a10, a10} */ + const uint32_t offsetA0[4] = { 0, 0, pSrcA.stride(), pSrcA.stride() }; + /* {b00, b01, b00, b01} */ + const uint32_t offsetB0[4] = { 0, 1, 0, 1 }; + /* {a01, a01, a11, a11} */ + const uint32_t offsetA1[4] = { 1, 1, pSrcA.stride() + 1, pSrcA.stride() + 1 }; + /* {b10, b11, b10, b11} */ + const uint32_t offsetB1[4] = { pSrcB.stride(), pSrcB.stride()+1, pSrcB.stride(), pSrcB.stride()+1 }; + + /* {d00, d01, d10, d11} */ + const uint32_t offsetD[4] = { 0, 1, pDst.stride(), pDst.stride()+1 }; + + uint32x4_t vecOffsA, vecOffsB,vecOffsC; + VEC vecInA, vecInB, vecDst; + + if constexpr (!HasStaticStride::value) + { + vecOffsA = vldrwq_u32((uint32_t const *) offsetA0); + } + vecOffsB = vldrwq_u32((uint32_t const *) offsetB0); + + if constexpr (!HasStaticStride::value) + { + vecInA = vldrwq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + } + else + { + constexpr int s = StaticStride::value; + vecInA = inner::vload1_gen_stride<0, 0, s, s>::run(pSrcA.const_ptr()); + } + + if constexpr (!HasStaticStride::value) + { + vecInB = vldrwq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + } + else + { + vecInB = inner::vload1_gen_stride<0, 1, 0, 1>::run(pSrcB.const_ptr()); + } + vecDst = inner::vmul(vecInA, vecInB); + + if constexpr (!HasStaticStride::value) + { + vecOffsA = vldrwq_u32((uint32_t const *) offsetA1); + } + + if constexpr (!HasStaticStride::value) + { + vecOffsB = vldrwq_u32((uint32_t const *) offsetB1); + } + + if constexpr (!HasStaticStride::value) + { + vecInA = vldrwq_gather_shifted_offset(pSrcA.const_ptr(), vecOffsA); + } + else + { + constexpr int s = StaticStride::value; + vecInA = inner::vload1_gen_stride<1, 1, s+1, s+1>::run(pSrcA.const_ptr()); + + } + + if constexpr (!HasStaticStride::value) + { + vecInB = vldrwq_gather_shifted_offset(pSrcB.const_ptr(), vecOffsB); + } + else + { + constexpr int s = StaticStride::value; + vecInB = inner::vload1_gen_stride::run(pSrcB.const_ptr()); + } + + if constexpr (!HasStaticStride::value) + { + vecOffsC = vldrwq_u32((uint32_t const *) offsetD); + } + + vecDst = inner::vmacc(vecDst, vecInA, vecInB); + + //inner::vstore1<1>(pDst.ptr(), vecDst); + if constexpr (!HasStaticStride::value) + { + vstrwq_scatter_shifted_offset(pDst.ptr(),vecOffsC,vecDst); + } + else + { + constexpr int s = StaticStride::value; + inner::vstore1_gen_stride<0, 1, s, s+1>::run(pDst.ptr(),vecDst); + } + +} + 
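/*
 * Illustrative sketch only (not part of the kernel above): the scalar computation
 * that _arm_mat_mult_2x2_mve performs with one vmul and one vmacc over the gathered
 * lanes {a00,a00,a10,a10}*{b00,b01,b00,b01} + {a01,a01,a11,a11}*{b10,b11,b10,b11},
 * scattered back with offsets {0, 1, strideD, strideD+1}.
 * The function name and the plain pointer/stride parameters are simplified
 * assumptions for the example, not part of the library API.
 */
template<typename T>
void mat_mult_2x2_scalar_reference(const T *a, int strideA,
                                   const T *b, int strideB,
                                   T *d, int strideD)
{
    /* destination row 0 */
    d[0]           = a[0] * b[0]       + a[1] * b[strideB];
    d[1]           = a[0] * b[1]       + a[1] * b[strideB + 1];
    /* destination row 1 */
    d[strideD]     = a[strideA] * b[0] + a[strideA + 1] * b[strideB];
    d[strideD + 1] = a[strideA] * b[1] + a[strideA + 1] * b[strideB + 1];
}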
+template() && + SameElementType::value,bool>::type = true> +__STATIC_INLINE void _arm_mat_mult_3x3_mve( + const MA &pSrcA, + const MB &pSrcB, + RES &&pDst) +{ + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + T *pInB = pSrcB.ptr(); /* input data matrix pointer B */ + T *pInA = pSrcA.ptr(); /* input data matrix pointer A */ + T *pOut = pDst.ptr(); /* output data matrix pointer */ + T *pInA0, *pInA1, *pInA2; + ACC vecMac0, vecMac1, vecMac2; + VEC vecInB; + T const *pSrBVec; + + pSrBVec = (float32_t const *) pInB; + + pInA0 = pInA; + pInA1 = pInA0 + pSrcA.stride(); + pInA2 = pInA1 + pSrcA.stride(); + /* enable predication to disable last (4th) vector element */ + mve_pred16_t p0 = inner::vctpq::mk(MATRIX_DIM3); + + /* + * load {b0,0, b0,1, b0,2, 0} + */ + vecInB = inner::vload1_z<1>(pSrBVec, MATRIX_DIM3,p0); + pSrBVec += pSrcB.stride(); + + vecMac0 = inner::vmul(vecInB, *pInA0++); + vecMac1 = inner::vmul(vecInB, *pInA1++); + vecMac2 = inner::vmul(vecInB, *pInA2++); + /* + * load {b1,0, b1,1, b1,2, 0} + */ + vecInB = inner::vload1_z<1>(pSrBVec, MATRIX_DIM3,p0); + pSrBVec += pSrcB.stride(); + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + vecMac1 = inner::vmacc(vecMac1, vecInB, *pInA1++); + vecMac2 = inner::vmacc(vecMac2, vecInB, *pInA2++); + /* + * load {b2,0, b2,1 , b2,2, 0} + */ + vecInB = inner::vload1_z<1>(pSrBVec, MATRIX_DIM3,p0); + pSrBVec += pSrcB.stride(); + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + vecMac1 = inner::vmacc(vecMac1, vecInB, *pInA1++); + vecMac2 = inner::vmacc(vecMac2, vecInB, *pInA2++); + + /* partial vector stores */ + inner::vstore1_z<1>(pOut, vecMac0, MATRIX_DIM3,p0); + pOut += pDst.stride(); + inner::vstore1_z<1>(pOut, vecMac1, MATRIX_DIM3,p0); + pOut += pDst.stride(); + inner::vstore1_z<1>(pOut, vecMac2, MATRIX_DIM3,p0); + /* + * Return to application + */ +} + +template() && + SameElementType::value,bool>::type = true> +__STATIC_INLINE void _arm_mat_mult_4x4_mve( + const MA &pSrcA, + const MB &pSrcB, + RES &&pDst) +{ + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + T const *pSrBVec; + T *pInB = pSrcB.ptr(); /* input data matrix pointer B */ + T *pInA = pSrcA.ptr(); /* input data matrix pointer A */ + T *pOut = pDst.ptr(); /* output data matrix pointer */ + T *pInA0, *pInA1, *pInA2, *pInA3; + ACC vecMac0, vecMac1, vecMac2, vecMac3; + VEC vecInB; + + pSrBVec = (float32_t const *) pInB; + + pInA0 = pInA; + pInA1 = pInA0 + pSrcA.stride(); + pInA2 = pInA1 + pSrcA.stride(); + pInA3 = pInA2 + pSrcA.stride(); + /* + * load {b0,0, b0,1, b0,2, b0,3} + */ + vecInB = inner::vload1<1>(pSrBVec); + pSrBVec += pSrcB.stride(); + + vecMac0 = inner::vmul(vecInB, *pInA0++); + vecMac1 = inner::vmul(vecInB, *pInA1++); + vecMac2 = inner::vmul(vecInB, *pInA2++); + vecMac3 = inner::vmul(vecInB, *pInA3++); + /* + * load {b1,0, b1,1, b1,2, b1,3} + */ + vecInB = inner::vload1<1>(pSrBVec); + pSrBVec += pSrcB.stride(); + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + vecMac1 = inner::vmacc(vecMac1, vecInB, *pInA1++); + vecMac2 = inner::vmacc(vecMac2, vecInB, *pInA2++); + vecMac3 = inner::vmacc(vecMac3, vecInB, *pInA3++); + /* + * load {b2,0, b2,1, b2,2, b2,3} + */ + vecInB = inner::vload1<1>(pSrBVec); + pSrBVec += pSrcB.stride(); + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + vecMac1 = inner::vmacc(vecMac1, vecInB, *pInA1++); + vecMac2 = inner::vmacc(vecMac2, vecInB, 
*pInA2++); + vecMac3 = inner::vmacc(vecMac3, vecInB, *pInA3++); + /* + * load {b3,0, b3,1, b3,2, b3,3} + */ + vecInB = inner::vload1<1>(pSrBVec); + pSrBVec += pSrcB.stride(); + + vecMac0 = inner::vmacc(vecMac0, vecInB, *pInA0++); + vecMac1 = inner::vmacc(vecMac1, vecInB, *pInA1++); + vecMac2 = inner::vmacc(vecMac2, vecInB, *pInA2++); + vecMac3 = inner::vmacc(vecMac3, vecInB, *pInA3++); + + inner::vstore1<1>(pOut, vecMac0); + pOut += pDst.stride(); + inner::vstore1<1>(pOut, vecMac1); + pOut += pDst.stride(); + inner::vstore1<1>(pOut, vecMac2); + pOut += pDst.stride(); + inner::vstore1<1>(pOut, vecMac3); + +} + +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/Helium/matrix_multiply_fixed.hpp b/dsppp/Include/dsppp/Helium/matrix_multiply_fixed.hpp new file mode 100644 index 000000000..8169fc0f9 --- /dev/null +++ b/dsppp/Include/dsppp/Helium/matrix_multiply_fixed.hpp @@ -0,0 +1,613 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#define ARM_MATH_MVE_FLOAT16 +#endif + +/** \addtogroup HELIUMALG + * @{ + */ + +#if defined(ARM_MATH_MVEI) + + + +#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff) + + + +template() && + number_traits::Scalar>::is_fixed,bool>::type = true> +inline void arm_mat_mult_2x2_mve( + const MA & pSrcA, + const MB & pSrcB, + RES && pDst) +{ + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + + const T *pInB = pSrcB.const_ptr(); /* input data matrix pointer B */ + const T *pInA = pSrcA.const_ptr(); /* input data matrix pointer A */ + T *pOut = pDst.ptr(); /* output data matrix pointer */ + const T *pInA0 = pInA; + const T *pInA1 = pInA0 + pSrcA.stride(); + ACC acc0, acc1; + VEC vecB, vecA0, vecA1; + mve_pred16_t p0 = inner::vctpq::mk(MATRIX_DIM2); + + + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM2,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM2,p0); + } + + + vecA0 = inner::vload1_z<1>(pInA0,MATRIX_DIM2,p0); + vecA1 = inner::vload1_z<1>(pInA1,MATRIX_DIM2,p0); + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + + pOut[0] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM2,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM2,p0); + } + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + + pOut[0] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + +} + + +template() && + number_traits::Scalar>::is_fixed,bool>::type = true> +inline void arm_mat_mult_3x3_mve( + const MA & pSrcA, + const MB & pSrcB, + RES && pDst) +{ + + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + + const T *pInB = pSrcB.const_ptr(); /* input data matrix pointer B */ + const T *pInA = pSrcA.const_ptr(); /* input data matrix pointer A */ + T *pOut = pDst.ptr(); /* output data matrix pointer */ + const T *pInA0 = pInA; + const T *pInA1 = pInA0 + pSrcA.stride(); + const T *pInA2 = pInA1 + pSrcA.stride(); + ACC acc0, acc1, acc2; + VEC vecB, vecA0, vecA1, vecA2; + 
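/* enable predication so that only the first 3 lanes (one row or column of the
   3x3 operands) take part in the predicated loads and MACs below */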
mve_pred16_t p0 = inner::vctpq::mk(MATRIX_DIM3); + + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM3,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM3,p0); + } + + vecA0 = inner::vload1_z<1>(pInA0,MATRIX_DIM3,p0); + vecA1 = inner::vload1_z<1>(pInA1,MATRIX_DIM3,p0); + vecA2 = inner::vload1_z<1>(pInA2,MATRIX_DIM3,p0); + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + acc2 = inner::vmacc(vecA2, vecB,p0); + + pOut[0 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[1 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + pOut[2 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc2)); + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM3,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM3,p0); + } + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + acc2 = inner::vmacc(vecA2, vecB,p0); + + + pOut[0 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[1 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + pOut[2 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc2)); + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM3,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM3,p0); + } + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + acc2 = inner::vmacc(vecA2, vecB,p0); + + + pOut[0 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[1 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + pOut[2 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc2)); + +} + + +template() && + number_traits::Scalar>::is_fixed,bool>::type = true> +inline void arm_mat_mult_4x4_mve( + const MA & pSrcA, + const MB & pSrcB, + RES && pDst) +{ + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + + const T *pInB = pSrcB.const_ptr(); /* input data matrix pointer B */ + const T *pInA = pSrcA.const_ptr(); /* input data matrix pointer A */ + T *pOut = pDst.ptr(); /* output data matrix pointer */ + const T *pInA0 = pInA; + const T *pInA1 = pInA0 + pSrcA.stride(); + const T *pInA2 = pInA1 + pSrcA.stride(); + const T *pInA3 = pInA2 + pSrcA.stride(); + ACC acc0, acc1, acc2, acc3; + VEC vecB, vecA0, vecA1, vecA2, vecA3; + mve_pred16_t p0 = inner::vctpq::mk(MATRIX_DIM4); + + + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM4,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM4,p0); + } + + vecA0 = inner::vload1_z<1>(pInA0,MATRIX_DIM4,p0); + vecA1 = inner::vload1_z<1>(pInA1,MATRIX_DIM4,p0); + vecA2 = inner::vload1_z<1>(pInA2,MATRIX_DIM4,p0); + vecA3 = inner::vload1_z<1>(pInA3,MATRIX_DIM4,p0); + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + acc2 = inner::vmacc(vecA2, vecB,p0); + acc3 = inner::vmacc(vecA3, vecB,p0); + + + pOut[0 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[1 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + pOut[2 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc2)); + pOut[3 * pDst.stride()] = 
inner::from_accumulator(inner::vreduce(acc3)); + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM4,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM4,p0); + } + + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + acc2 = inner::vmacc(vecA2, vecB,p0); + acc3 = inner::vmacc(vecA3, vecB,p0); + + pOut[0 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[1 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + pOut[2 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc2)); + pOut[3 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc3)); + + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM4,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM4,p0); + } + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + acc2 = inner::vmacc(vecA2, vecB,p0); + acc3 = inner::vmacc(vecA3, vecB,p0); + + pOut[0 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[1 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + pOut[2 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc2)); + pOut[3 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc3)); + + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + if constexpr (HasStaticStride::value) + { + vecB = inner::vload1_z::value>(pInB,MATRIX_DIM4,p0); + } + else + { + vecB = inner::vload1_z(pInB,pSrcB.stride(),MATRIX_DIM4,p0); + } + + acc0 = inner::vmacc(vecA0, vecB,p0); + acc1 = inner::vmacc(vecA1, vecB,p0); + acc2 = inner::vmacc(vecA2, vecB,p0); + acc3 = inner::vmacc(vecA3, vecB,p0); + + + pOut[0 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc0)); + pOut[1 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc1)); + pOut[2 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc2)); + pOut[3 * pDst.stride()] = inner::from_accumulator(inner::vreduce(acc3)); + +} + + + +template() && + number_traits::Scalar>::is_fixed,bool>::type = true> + __STATIC_INLINE void _dot_m_m(const MA&pSrcA,const MB&pSrcB, + RES &&pDst, + const TMP &BT, + const Helium* = nullptr) + { + using T = typename traits::Scalar; + using ACC = typename vector_traits::temp_accumulator; + using VEC = typename vector_traits::vector; + constexpr int nb_lanes = vector_traits::nb_lanes; + + const T *pInA = pSrcA.const_ptr(); /* input data matrix pointer A */ + const T *pInB = pSrcB.const_ptr(); + T *pOut = pDst.ptr(); /* input data matrix pointer B */ + T *px; /* Temporary output data matrix pointer */ + T *px2; /* Temporary output data matrix pointer */ + uint32_t numRowsA = pSrcA.rows(); /* number of rows of input matrix A */ + + uint32_t numColsB = pSrcB.columns(); /* number of columns of input matrix B */ + uint32_t numColsA = pSrcA.columns(); /* number of columns of input matrix A */ + + uint32_t strideA = pSrcA.stride(); /* number of columns of input matrix A */ + + uint32_t numRowsB = pSrcB.rows(); /* number of rows of input matrix A */ + uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */ + + const T *pInA2; + const T *pInB2; + uint32_t blkCnt; /* loop counters */ + + + + { + /* small squared matrix specialized routines */ + if (numRowsA == numColsB && numColsB == numColsA) { + + if (numRowsA == 1) { + pDst(0,0) = pSrcA(0,0) * pSrcB(0,0); + 
return; + } else if (numRowsA == 2) + return arm_mat_mult_2x2_mve(pSrcA, pSrcB, std::forward(pDst)); + else if (numRowsA == 3) + return arm_mat_mult_3x3_mve(pSrcA, pSrcB, std::forward(pDst)); + else if (numRowsA == 4) + return arm_mat_mult_4x4_mve(pSrcA, pSrcB, std::forward(pDst)); + } + + /* + * Matrix transpose + */ + + const T *pSrcBT = BT.const_ptr(); + + + /* + * Reset the variables for the usage in the following multiplication process + */ + i = 0; + row = numRowsA >> 1; + px = pOut; + px2 = px + pDst.stride(); + + /* + * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB + */ + + /* + * row loop + */ + while (row > 0u) { + /* + * For every row wise process, the column loop counter is to be initiated + */ + col = numColsB >> 1; + /* + * For every row wise process, the pIn2 pointer is set + * to the starting address of the transposed pSrcB data + */ + pInB = pSrcBT; + pInB2 = pInB + numRowsB; + j = 0; + + /* + * column loop + */ + while (col > 0u) { + T const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec; + VEC vecA, vecA2, vecB, vecB2; + ACC acc0, acc1, acc2, acc3; + + /* + * Initiate the pointer pIn1 to point to the starting address of the column being processed + */ + pInA = pSrcA.const_ptr() + i; + pInA2 = pInA + strideA; + pInB = pSrcBT + j; + pInB2 = pInB + numRowsB; + + + pSrcAVec = (T const *) pInA; + pSrcA2Vec = (T const *) pInA2; + pSrcBVec = (T const *) pInB; + pSrcB2Vec = (T const *) pInB2; + + acc0 = vector_traits::temp_acc_zero(); + acc1 = vector_traits::temp_acc_zero(); + acc2 = vector_traits::temp_acc_zero(); + acc3 = vector_traits::temp_acc_zero(); + + vecA = inner::vload1<1>(pSrcAVec); + pSrcAVec += nb_lanes; + + blkCnt = numColsA / nb_lanes; + while (blkCnt > 0U) { + vecB = inner::vload1<1>(pSrcBVec); + pSrcBVec += nb_lanes; + acc0 = inner::vmacc(acc0, vecA, vecB); + vecA2 = inner::vload1<1>(pSrcA2Vec); + pSrcA2Vec += nb_lanes; + acc1 = inner::vmacc(acc1, vecA2, vecB); + vecB2 = inner::vload1<1>(pSrcB2Vec); + pSrcB2Vec += nb_lanes; + acc2 = inner::vmacc(acc2, vecA, vecB2); + vecA = inner::vload1<1>(pSrcAVec); + pSrcAVec += nb_lanes; + acc3 = inner::vmacc(acc3, vecA2, vecB2); + + blkCnt--; + } + /* + * tail + */ + blkCnt = numColsA & (nb_lanes-1); + if (blkCnt > 0U) { + mve_pred16_t p0 = inner::vctpq::mk(blkCnt); + vecB = inner::vload1<1>(pSrcBVec); + acc0 = inner::vmacc(acc0, vecA, vecB, p0); + vecA2 = inner::vload1<1>(pSrcA2Vec); + acc1 = inner::vmacc(acc1, vecA2, vecB, p0); + vecB2 = inner::vload1<1>(pSrcB2Vec); + acc2 = inner::vmacc(acc2, vecA, vecB2, p0); + vecA = inner::vload1<1>(pSrcAVec); + acc3 = inner::vmacc(acc3, vecA2, vecB2, p0); + } + + *px++ = inner::from_accumulator(inner::vreduce(acc0)); + *px++ = inner::from_accumulator(inner::vreduce(acc2)); + *px2++ = inner::from_accumulator(inner::vreduce(acc1)); + *px2++ = inner::from_accumulator(inner::vreduce(acc3)); + + j += numRowsB * 2; + /* + * Decrement the column loop counter + */ + col--; + + } + + i = i + strideA * 2; + px = px2 + (numColsB & 1u); + px2 = px + pDst.stride(); + /* + * Decrement the row loop counter + */ + row--; + } + + /* + * Compute remaining row and/or column below + */ + + if (numColsB & 1u) { + row = numRowsA & (~0x1); //avoid redundant computation + px = pOut + + pDst.stride() - 1; + i = 0; + + /* + * row loop + */ + while (row > 0) { + + + T const *pSrcAVec, *pSrcBVec; + VEC vecA, vecB; + ACC acc0; + + /* + * point to last column in matrix B + */ + pInB = pSrcBT + numRowsB * (numColsB - 1); + pInA = pSrcA.const_ptr() + i; + + pSrcAVec = (T 
const *) pInA; + pSrcBVec = (T const *) pInB; + + acc0 = vector_traits::temp_acc_zero(); + blkCnt = (numColsA) / nb_lanes; + while (blkCnt > 0U) { + vecA = inner::vload1<1>(pSrcAVec); + pSrcAVec += nb_lanes; + vecB = inner::vload1<1>(pSrcBVec); + pSrcBVec += nb_lanes; + acc0 = inner::vmacc(acc0, vecA, vecB); + + blkCnt--; + } + /* + * tail + */ + blkCnt = (numColsA & (nb_lanes-1)); + if (blkCnt > 0U) { + mve_pred16_t p0 = inner::vctpq::mk(blkCnt); + vecA = inner::vload1<1>(pSrcAVec); + vecB = inner::vload1<1>(pSrcBVec); + acc0 = inner::vmacc(acc0, vecA, vecB, p0); + } + + *px = inner::from_accumulator(inner::vreduce(acc0)); + + px += pDst.stride(); + + i += strideA; + /* + * Decrement the row loop counter + */ + row--; + } + } + + if (numRowsA & 1u) { + col = numColsB; + i = 0u; + /* + * point to last row in output matrix + */ + px = pOut + pDst.stride() * (numRowsA - 1); + /* + * col loop + */ + while (col > 0) { + + T const *pSrcAVec, *pSrcBVec; + VEC vecA, vecB; + ACC acc0; + + /* + * point to last row in matrix A + */ + pInA = pSrcA.const_ptr() + (numRowsA - 1) * strideA; + pInB = pSrcBT + i; + + /* + * Set the variable sum, that acts as accumulator, to zero + */ + pSrcAVec = (T const *) pInA; + pSrcBVec = (T const *) pInB; + acc0 = vector_traits::temp_acc_zero(); + + blkCnt = ((numColsA) / nb_lanes); + while (blkCnt > 0U) { + vecA = inner::vload1<1>(pSrcAVec); + pSrcAVec += nb_lanes; + vecB = inner::vload1<1>(pSrcBVec); + pSrcBVec += nb_lanes; + acc0 = inner::vmacc(acc0, vecA, vecB); + + blkCnt--; + } + /* + * tail + */ + blkCnt = (numColsA & 7); + if (blkCnt > 0U) { + mve_pred16_t p0 = inner::vctpq::mk(blkCnt); + vecA = inner::vload1<1>(pSrcAVec); + vecB = inner::vload1<1>(pSrcBVec); + acc0 = inner::vmacc(acc0, vecA, vecB, p0); + } + + *px++ = inner::from_accumulator(inner::vreduce(acc0)); + + i += numColsA; + + /* + * Decrement the col loop counter + */ + col--; + } + } + + } + +} + +#endif + +/*! 
@} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/Helium/num_features.hpp b/dsppp/Include/dsppp/Helium/num_features.hpp new file mode 100644 index 000000000..1f3b34d55 --- /dev/null +++ b/dsppp/Include/dsppp/Helium/num_features.hpp @@ -0,0 +1,17 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/* + +vreduce is going from vector accumulator to scalar accumulator +from_accumulator is going from scalar accumulator to scalar datatype + + +*/ + +#include "float.hpp" +#include "half.hpp" +#include "q31.hpp" +#include "q15.hpp" +#include "q7.hpp" diff --git a/dsppp/Include/dsppp/Helium/q15.hpp b/dsppp/Include/dsppp/Helium/q15.hpp new file mode 100644 index 000000000..2da379ac2 --- /dev/null +++ b/dsppp/Include/dsppp/Helium/q15.hpp @@ -0,0 +1,461 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#define ARM_MATH_MVE_FLOAT16 +#endif + +/** \addtogroup HeliumNumber Helium specific number definitions + * \ingroup NUMBER + * @{ + * \addtogroup HeliumQ15Number Q15 + * \ingroup HeliumNumber + * @{ + */ + + +/****************** + * + * Helium + * + */ +#if defined(ARM_MATH_MVEI) + + +template +struct vector_traits::value>::type > +{ + typedef Q15 type; + typedef type::value_type storage_type; + typedef int16x8_t vector; + typedef Q<33,30> temp_accumulator; + typedef mve_pred16_t predicate_t; + + static constexpr bool has_vector = true; + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + static constexpr bool has_predicate = true; + + static constexpr int nb_lanes = 8; + + + static Q<33,30> temp_acc_zero() + { + return(Q<33,30>()); + } + + static constexpr int16_t zero_lane() {return 0;}; + + static constexpr int16_t lane_value(const Q15 x) {return x.v;}; + +}; + +namespace inner { + + + template<> + struct vctpq{ + static mve_pred16_t mk(uint32_t v) + { + return(vctp16q(v)); + }; + }; + + __STATIC_FORCEINLINE int16x8_t vconst(Q15 val) + { + return(vdupq_n_s16(val.v)); + } + + __STATIC_FORCEINLINE int16x8_t vconst_tail(Q15 val, + const mve_pred16_t p0) + { + return(vdupq_x_n_s16(val.v,p0)); + } + + + __STATIC_FORCEINLINE int16x8_t vneg(const int16x8_t a) + { + return(vqnegq(a)); + }; + + __STATIC_FORCEINLINE int16x8_t vneg(const int16x8_t a, + const mve_pred16_t p0) + { + return(vqnegq_m(vuninitializedq_s16(),a,p0)); + }; + + __STATIC_FORCEINLINE int16x8_t vadd(const int16x8_t a,const int16x8_t b) + { + return(vqaddq(a,b)); + }; + + __STATIC_FORCEINLINE int16x8_t vadd(const int16x8_t a,const Q15 b) + { + return(vqaddq_n_s16(a,b.v)); + }; + + __STATIC_FORCEINLINE int16x8_t vadd(const Q15 a,const int16x8_t b) + { + return(vqaddq_n_s16(b,a.v)); + }; + + + __STATIC_FORCEINLINE int16x8_t vadd(const int16x8_t a,const int16x8_t b, + const mve_pred16_t p0) + { + return(vqaddq_m(vuninitializedq_s16(),a,b,p0)); + }; + + __STATIC_FORCEINLINE int16x8_t vadd(const int16x8_t a,const Q15 b, + const mve_pred16_t p0) + { + return(vqaddq_m_n_s16(vuninitializedq_s16(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int16x8_t vadd(const Q15 a,const int16x8_t b, + const mve_pred16_t p0) + { + return(vqaddq_m_n_s16(vuninitializedq_s16(),b,a.v,p0)); + }; + + __STATIC_FORCEINLINE int16x8_t vsub(const int16x8_t a,const int16x8_t b) + { + return(vqsubq(a,b)); + }; + + __STATIC_FORCEINLINE int16x8_t vsub(const int16x8_t a,const Q15 b) + { + return(vqsubq_n_s16(a,b.v)); + }; + + __STATIC_FORCEINLINE int16x8_t vsub(const Q15 a,const int16x8_t b) + { + return(vqsubq_n_s16(b,a.v)); + }; + + __STATIC_FORCEINLINE int16x8_t 
vsub(const int16x8_t a,const int16x8_t b, + const mve_pred16_t p0) + { + return(vqsubq_m(vuninitializedq_s16(),a,b,p0)); + }; + + __STATIC_FORCEINLINE int16x8_t vsub(const int16x8_t a,const Q15 b, + const mve_pred16_t p0) + { + return(vqsubq_m_n_s16(vuninitializedq_s16(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int16x8_t vsub(const Q15 a,const int16x8_t b, + const mve_pred16_t p0) + { + return(vqsubq_m_n_s16(vuninitializedq_s16(),b,a.v,p0)); + }; + + __STATIC_FORCEINLINE int16x8_t vmul(const int16x8_t a,const int16x8_t b) + { + return(vqdmulhq(a,b)); + }; + + + __STATIC_FORCEINLINE int16x8_t vmul(const int16x8_t a,const Q15 b) + { + return(vqdmulhq_n_s16(a,b.v)); + }; + + __STATIC_FORCEINLINE int16x8_t vmul(const Q15 a,const int16x8_t b) + { + return(vqdmulhq_n_s16(b,a.v)); + }; + + __STATIC_FORCEINLINE int16x8_t vmul(const int16x8_t a,const int16x8_t b, + const mve_pred16_t p0) + { + return(vqdmulhq_m(vuninitializedq_s16(),a,b,p0)); + }; + + + __STATIC_FORCEINLINE int16x8_t vmul(const int16x8_t a,const Q15 b, + const mve_pred16_t p0) + { + return(vqdmulhq_m_n_s16(vuninitializedq_s16(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int16x8_t vmul(const Q15 a,const int16x8_t b, + const mve_pred16_t p0) + { + return(vqdmulhq_m_n_s16(vuninitializedq_s16(),b,a.v,p0)); + }; + + template::type = true> + inline int16x8_t vload1(const Q15 *p) + { + return(vld1q(reinterpret_cast(p))); + }; + + /* + + 7*S must be <= 65535 so + S <= 9362 + + */ + template1) && (S<=9362),bool>::type = true> + inline int16x8_t vload1(const Q15 *p) + { + constexpr uint16x8_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S}; + //uint16x8_t offset = vidupq_u16((uint16_t)0,1); + //offset = vmulq_n_u16(offset,S); + return(vldrhq_gather_shifted_offset_s16(reinterpret_cast(p),offset)); + }; + + template9362),bool>::type = true> + inline int16x8_t vload1(const Q15 *p) + { + int16x8_t res; + for(std::size_t i=0;i<8;i++) + { + res[i] = p->v; + p += S; + } + + return(res); + }; + + // Dynamic stride + inline int16x8_t vload1(const Q15 *p,index_t stride) + { + if (stride <= 9362) + { + uint16x8_t offset = vidupq_u16((uint32_t)0,1); + offset = vmulq_n_u16(offset,stride); + return(vldrhq_gather_shifted_offset_s16(reinterpret_cast(p),offset)); + } + else + { + int16x8_t res; + for(std::size_t i=0;i<8;i++) + { + res[i] = p->v; + p += stride; + } + return(res); + } + } + + template::type = true> + inline int16x8_t vload1_z(const Q15 *p,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + return(vld1q_z(reinterpret_cast(p),p0)); + + }; + + template1) && (S<=9362),bool>::type = true> + inline int16x8_t vload1_z(const Q15 *p,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + uint16x8_t offset = vidupq_u16((uint32_t)0,1); + offset = vmulq_n_u16(offset,S); + return(vldrhq_gather_shifted_offset_z_s16(reinterpret_cast(p),offset,p0)); + }; + + template9362),bool>::type = true> + inline int16x8_t vload1_z(const Q15 *p,std::size_t nb,mve_pred16_t p0) + { + (void)p0; + int16x8_t res; + std::size_t i=0; + for(;iv; + p += S; + } + + for(;i<8;i++) + { + res[i] = 0; + p += S; + } + + return(res); + + }; + + // Dynamic stride + inline int16x8_t vload1_z(const Q15 *p,index_t stride,std::size_t nb,mve_pred16_t p0) + { + + if (stride <= 9362) + { + uint16x8_t offset = vidupq_u16((uint32_t)0,1); + offset = vmulq_n_u16(offset,stride); + return(vldrhq_gather_shifted_offset_z_s16(reinterpret_cast(p),offset,p0)); + } + else + { + int16x8_t res; + std::size_t i=0; + for(;iv; + p += stride; + } + + for(;i<8;i++) + { + res[i] = 0; + p += stride; + } + return(res); + } + }; + + 
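/*
 * Usage sketch (hypothetical, not part of the header): how the Q15 vload1
 * overloads above are selected by the compile-time stride S.
 *   S == 1          -> contiguous vld1q load
 *   1 < S <= 9362   -> gather load with 16-bit shifted offsets (7*S must be <= 65535)
 *   S > 9362        -> lane-by-lane scalar fallback
 * The function name, buffer and strides below are made up for illustration.
 */
inline void q15_strided_load_sketch(const Q15 *buf)
{
    int16x8_t contiguous = vload1<1>(buf);      /* vld1q path                              */
    int16x8_t strided    = vload1<16>(buf);     /* gather with constexpr offsets {0,16,...,112} */
    int16x8_t large      = vload1<10000>(buf);  /* stride too large for 16-bit offsets: scalar loop */
    int16x8_t dyn        = vload1(buf, 16);     /* runtime stride, offsets built with vidupq/vmulq  */
    (void)contiguous; (void)strided; (void)large; (void)dyn;
}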
+ template::type = true> + inline void vstore1(Q15 *p,const int16x8_t val) + { + vst1q(reinterpret_cast(p),val); + }; + + template1) && (S<=9362),bool>::type = true> + inline void vstore1(Q15 *p,const int16x8_t val) + { + //uint16x8_t offset={0,1,2,3,4,5,6,7}; + //uint16x8_t offset = vidupq_u16((uint16_t)0,1); + //offset = vmulq_n_u16(offset,S); + constexpr uint16x8_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S}; + return(vstrhq_scatter_shifted_offset_s16(reinterpret_cast(p),offset,val)); + }; + + template9362),bool>::type = true> + inline void vstore1(Q15 *p,const int16x8_t val) + { + for(std::size_t i=0;i<8;i++) + { + *p = Q15(val[i]); + p += S; + } + + }; + + // dynamic stride + inline void vstore1(Q15 *p,const index_t stride,const int16x8_t val) + { + if (stride <=9362) + { + uint16x8_t offset = vidupq_u16((uint32_t)0,1); + offset = vmulq_n_u16(offset,stride); + return(vstrhq_scatter_shifted_offset_s16(reinterpret_cast(p),offset,val)); + } + else + { + for(std::size_t i=0;i<8;i++) + { + *p = Q15(val[i]); + p += stride; + } + } + } + + template::type = true> + inline void vstore1_z(Q15 *p,const int16x8_t val,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + return(vstrhq_p(reinterpret_cast(p),val,p0)); + }; + + template1) && (S<=9362),bool>::type = true> + inline void vstore1_z(Q15 *p,const int16x8_t val,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + //uint16x8_t offset={0,1,2,3,4,5,6,7}; + //uint16x8_t offset = vidupq_u16((uint16_t)0,1); + //offset = vmulq_n_u16(offset,S); + constexpr uint16x8_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S}; + return(vstrhq_scatter_shifted_offset_p_s16(reinterpret_cast(p),offset,val,p0)); + }; + + + template9362),bool>::type = true> + inline void vstore1_z(Q15 *p,const int16x8_t val,std::size_t nb,mve_pred16_t p0) + { + (void)p0; + for(std::size_t i=0;i(p),offset,val,p0)); + } + else + { + for(std::size_t i=0;i vmacc(const Q<33,30> sum, + const int16x8_t vala, + const int16x8_t valb) + { + return(Q<33,30>(vmlaldavaq(sum.v,vala,valb))); + }; + + __STATIC_FORCEINLINE Q<33,30> vmacc(const Q<33,30> sum, + const int16x8_t vala, + const int16x8_t valb, + const mve_pred16_t p0) + { + return(Q<33,30>(vmlaldavaq_p(sum.v,vala,valb,p0))); + }; + + __STATIC_FORCEINLINE Q<33,30> vmacc(const int16x8_t vala, + const int16x8_t valb) + { + return(Q<33,30>(vmlaldavq(vala,valb))); + }; + + __STATIC_FORCEINLINE Q<33,30> vmacc(const int16x8_t vala, + const int16x8_t valb, + const mve_pred16_t p0) + { + return(Q<33,30>(vmlaldavq_p(vala,valb,p0))); + }; + + __STATIC_FORCEINLINE Q<33,30> vreduce(const Q<33,30> sum) + { + return(sum); + }; + +}; +#endif + +/*! @} */ +/*! 
@} */ diff --git a/dsppp/Include/dsppp/Helium/q31.hpp b/dsppp/Include/dsppp/Helium/q31.hpp new file mode 100644 index 000000000..6f8eaae6e --- /dev/null +++ b/dsppp/Include/dsppp/Helium/q31.hpp @@ -0,0 +1,345 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#define ARM_MATH_MVE_FLOAT16 +#endif + +/** \addtogroup HeliumNumber Helium specific number definitions + * \ingroup NUMBER + * @{ + * \addtogroup HeliumQ31Number Q31 + * \ingroup HeliumNumber + * @{ + */ + + +/****************** + * + * Helium + * + */ +#if defined(ARM_MATH_MVEI) + + + +template +struct vector_traits::value>::type > +{ + typedef Q31 type; + typedef type::value_type storage_type; + typedef int32x4_t vector; + typedef Q<9,54> temp_accumulator; + typedef mve_pred16_t predicate_t; + + static constexpr bool has_vector = true; + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + static constexpr bool has_predicate = true; + + static constexpr int nb_lanes = 4; + + + static Q<9,54> temp_acc_zero() + { + return(Q<9,54>()); + } + + static constexpr int16_t zero_lane() {return 0;}; + + static constexpr int16_t lane_value(const Q31 x) {return x.v;}; + +}; + +namespace inner { + + template<> + struct vctpq{ + static mve_pred16_t mk(uint32_t v) + { + return(vctp32q(v)); + }; + }; + + __STATIC_FORCEINLINE int32x4_t vconst(Q31 val) + { + return(vdupq_n_s32(val.v)); + } + + __STATIC_FORCEINLINE int32x4_t vconst_tail(Q31 val, + const mve_pred16_t p0) + { + return(vdupq_x_n_s32(val.v,p0)); + } + + + __STATIC_FORCEINLINE int32x4_t vneg(const int32x4_t a) + { + return(vqnegq(a)); + }; + + __STATIC_FORCEINLINE int32x4_t vneg(const int32x4_t a, + const mve_pred16_t p0) + { + return(vqnegq_m(vuninitializedq_s32(),a,p0)); + }; + + __STATIC_FORCEINLINE int32x4_t vadd(const int32x4_t a,const int32x4_t b) + { + return(vqaddq(a,b)); + }; + + __STATIC_FORCEINLINE int32x4_t vadd(const int32x4_t a,const Q31 b) + { + return(vqaddq_n_s32(a,b.v)); + }; + + __STATIC_FORCEINLINE int32x4_t vadd(const Q31 a,const int32x4_t b) + { + return(vqaddq_n_s32(b,a.v)); + }; + + + __STATIC_FORCEINLINE int32x4_t vadd(const int32x4_t a,const int32x4_t b, + const mve_pred16_t p0) + { + return(vqaddq_m(vuninitializedq_s32(),a,b,p0)); + }; + + __STATIC_FORCEINLINE int32x4_t vadd(const int32x4_t a,const Q31 b, + const mve_pred16_t p0) + { + return(vqaddq_m_n_s32(vuninitializedq_s32(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int32x4_t vadd(const Q31 a,const int32x4_t b, + const mve_pred16_t p0) + { + return(vqaddq_m_n_s32(vuninitializedq_s32(),b,a.v,p0)); + }; + + __STATIC_FORCEINLINE int32x4_t vsub(const int32x4_t a,const int32x4_t b) + { + return(vqsubq(a,b)); + }; + + __STATIC_FORCEINLINE int32x4_t vsub(const int32x4_t a,const Q31 b) + { + return(vqsubq_n_s32(a,b.v)); + }; + + __STATIC_FORCEINLINE int32x4_t vsub(const Q31 a,const int32x4_t b) + { + return(vqsubq_n_s32(b,a.v)); + }; + + __STATIC_FORCEINLINE int32x4_t vsub(const int32x4_t a,const int32x4_t b, + const mve_pred16_t p0) + { + return(vqsubq_m(vuninitializedq_s32(),a,b,p0)); + }; + + __STATIC_FORCEINLINE int32x4_t vsub(const int32x4_t a,const Q31 b, + const mve_pred16_t p0) + { + return(vqsubq_m_n_s32(vuninitializedq_s32(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int32x4_t vsub(const Q31 a,const int32x4_t b, + const mve_pred16_t p0) + { + return(vqsubq_m_n_s32(vuninitializedq_s32(),b,a.v,p0)); + }; + + __STATIC_FORCEINLINE int32x4_t vmul(const int32x4_t a,const int32x4_t b) + { + return(vqdmulhq(a,b)); + }; + + + 
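/*
 * Illustrative sketch only (not part of the header): the per-lane arithmetic that
 * vqdmulhq performs for Q31 operands, and why a single doubling-high multiply keeps
 * the product in Q31 format. The helper name is made up for the example.
 */
__STATIC_FORCEINLINE int32_t q31_mul_lane_model(int32_t a, int32_t b)
{
    int64_t p = (int64_t)a * (int64_t)b;   /* Q2.62 product                                  */
    int64_t r = p >> 31;                   /* same as doubling then keeping the high 32 bits:
                                              back to Q1.31                                  */
    if (r > (int64_t)0x7FFFFFFF)           /* saturate the -1 * -1 corner case               */
    {
        r = 0x7FFFFFFF;
    }
    return (int32_t)r;
}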
__STATIC_FORCEINLINE int32x4_t vmul(const int32x4_t a,const Q31 b) + { + return(vqdmulhq_n_s32(a,b.v)); + }; + + __STATIC_FORCEINLINE int32x4_t vmul(const Q31 a,const int32x4_t b) + { + return(vqdmulhq_n_s32(b,a.v)); + }; + + __STATIC_FORCEINLINE int32x4_t vmul(const int32x4_t a,const int32x4_t b, + const mve_pred16_t p0) + { + return(vqdmulhq_m(vuninitializedq_s32(),a,b,p0)); + }; + + + __STATIC_FORCEINLINE int32x4_t vmul(const int32x4_t a,const Q31 b, + const mve_pred16_t p0) + { + return(vqdmulhq_m_n_s32(vuninitializedq_s32(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int32x4_t vmul(const Q31 a,const int32x4_t b, + const mve_pred16_t p0) + { + return(vqdmulhq_m_n_s32(vuninitializedq_s32(),b,a.v,p0)); + }; + + template::type = true> + inline int32x4_t vload1(const Q31 *p) + { + return(vld1q(reinterpret_cast(p))); + }; + + template1),bool>::type = true> + inline int32x4_t vload1(const Q31 *p) + { + constexpr uint32x4_t offset={0*S,1*S,2*S,3*S}; + return(vldrwq_gather_shifted_offset_s32(reinterpret_cast(p),offset)); + }; + + + // Dynamic stride + inline int32x4_t vload1(const Q31 *p,index_t stride) + { + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,stride); + return(vldrwq_gather_shifted_offset_s32(reinterpret_cast(p),offset)); + + } + + template::type = true> + inline int32x4_t vload1_z(const Q31 *p,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + return(vld1q_z(reinterpret_cast(p),p0)); + + }; + + template1),bool>::type = true> + inline int32x4_t vload1_z(const Q31 *p,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,S); + return(vldrwq_gather_shifted_offset_z_s32(reinterpret_cast(p),offset,p0)); + }; + + + + // Dynamic stride + inline int32x4_t vload1_z(const Q31 *p,index_t stride,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,stride); + return(vldrwq_gather_shifted_offset_z_s32(reinterpret_cast(p),offset,p0)); + + }; + + + template::type = true> + inline void vstore1(Q31 *p,const int32x4_t val) + { + vst1q(reinterpret_cast(p),val); + }; + + template1) ,bool>::type = true> + inline void vstore1(Q31 *p,const int32x4_t val) + { + + constexpr uint32x4_t offset={0*S,1*S,2*S,3*S}; + return(vstrwq_scatter_shifted_offset_s32(reinterpret_cast(p),offset,val)); + }; + + + + // dynamic stride + inline void vstore1(Q31 *p,const index_t stride,const int32x4_t val) + { + + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,stride); + return(vstrwq_scatter_shifted_offset_s32(reinterpret_cast(p),offset,val)); + } + + template::type = true> + inline void vstore1_z(Q31 *p,const int32x4_t val,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + return(vstrwq_p(reinterpret_cast(p),val,p0)); + }; + + template1),bool>::type = true> + inline void vstore1_z(Q31 *p,const int32x4_t val,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + + constexpr uint32x4_t offset={0*S,1*S,2*S,3*S}; + vstrwq_scatter_shifted_offset_p_s32(reinterpret_cast(p),offset,val,p0); + }; + + + + // dynamic stride + inline void vstore1_z(Q31 *p,const index_t stride,const int32x4_t val,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + uint32x4_t offset = vidupq_u32((uint32_t)0,1); + offset = vmulq_n_u32(offset,stride); + vstrwq_scatter_shifted_offset_p_s32(reinterpret_cast(p),offset,val,p0); + + }; + + __STATIC_FORCEINLINE Q<9,54> vmacc(const Q<9,54> sum, + const int32x4_t vala, + const int32x4_t valb) + { + 
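/* vrmlaldavhaq sums the Q31 x Q31 lane products into a 64-bit accumulator,
   effectively dropping the low 8 bits of the full-precision total, which is
   why the accumulator is typed Q<9,54> (62 - 8 fractional bits) and vreduce
   below only needs a further shift to reach Q<15,48>. */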
return(Q<9,54>(vrmlaldavhaq(sum.v,vala,valb))); + }; + + __STATIC_FORCEINLINE Q<9,54> vmacc(const Q<9,54> sum, + const int32x4_t vala, + const int32x4_t valb, + const mve_pred16_t p0) + { + return(Q<9,54>(vrmlaldavhaq_p(sum.v,vala,valb,p0))); + }; + + __STATIC_FORCEINLINE Q<9,54> vmacc(const int32x4_t vala, + const int32x4_t valb) + { + return(Q<9,54>(vrmlaldavhq(vala,valb))); + }; + + __STATIC_FORCEINLINE Q<9,54> vmacc(const int32x4_t vala, + const int32x4_t valb, + const mve_pred16_t p0) + { + return(Q<9,54>(vrmlaldavhq_p(vala,valb,p0))); + }; + + __STATIC_FORCEINLINE Q<15,48> vreduce(const Q<9,54> sum) + { + return(Q<15,48>(asrl(sum.v, 6))); + }; + +}; + +#endif + +/*! @} */ +/*! @} */ diff --git a/dsppp/Include/dsppp/Helium/q7.hpp b/dsppp/Include/dsppp/Helium/q7.hpp new file mode 100644 index 000000000..5d9f4cc25 --- /dev/null +++ b/dsppp/Include/dsppp/Helium/q7.hpp @@ -0,0 +1,463 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_MATH_MVEI +#define ARM_MATH_MVEF +#define ARM_MATH_MVE_FLOAT16 +#endif + +/** \addtogroup HeliumNumber Helium specific number definitions + * \ingroup NUMBER + * @{ + * \addtogroup HeliumQ7Number Q7 + * \ingroup HeliumNumber + * @{ + */ + + +/****************** + * + * Helium + * + */ +#if defined(ARM_MATH_MVEI) + + +template +struct vector_traits::value>::type > +{ + typedef Q7 type; + typedef type::value_type storage_type; + typedef int8x16_t vector; + typedef Q<17,14> temp_accumulator; + typedef mve_pred16_t predicate_t; + + static constexpr bool has_vector = true; + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + static constexpr bool has_predicate = true; + + static constexpr int nb_lanes = 16; + + + static Q<17,14> temp_acc_zero() + { + return(Q<17,14>()); + } + + static constexpr int8_t zero_lane() {return 0;}; + + static constexpr int8_t lane_value(const Q7 x) {return x.v;}; + +}; + +namespace inner { + + + template<> + struct vctpq{ + static mve_pred16_t mk(uint32_t v) + { + return(vctp8q(v)); + }; + }; + + __STATIC_FORCEINLINE int8x16_t vconst(Q7 val) + { + return(vdupq_n_s8(val.v)); + } + + __STATIC_FORCEINLINE int8x16_t vconst_tail(Q7 val, + const mve_pred16_t p0) + { + return(vdupq_x_n_s8(val.v,p0)); + } + + + __STATIC_FORCEINLINE int8x16_t vneg(const int8x16_t a) + { + return(vqnegq(a)); + }; + + __STATIC_FORCEINLINE int8x16_t vneg(const int8x16_t a, + const mve_pred16_t p0) + { + return(vqnegq_m(vuninitializedq_s8(),a,p0)); + }; + + __STATIC_FORCEINLINE int8x16_t vadd(const int8x16_t a,const int8x16_t b) + { + return(vqaddq(a,b)); + }; + + __STATIC_FORCEINLINE int8x16_t vadd(const int8x16_t a,const Q7 b) + { + return(vqaddq_n_s8(a,b.v)); + }; + + __STATIC_FORCEINLINE int8x16_t vadd(const Q7 a,const int8x16_t b) + { + return(vqaddq_n_s8(b,a.v)); + }; + + + __STATIC_FORCEINLINE int8x16_t vadd(const int8x16_t a,const int8x16_t b, + const mve_pred16_t p0) + { + return(vqaddq_m(vuninitializedq_s8(),a,b,p0)); + }; + + __STATIC_FORCEINLINE int8x16_t vadd(const int8x16_t a,const Q7 b, + const mve_pred16_t p0) + { + return(vqaddq_m_n_s8(vuninitializedq_s8(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int8x16_t vadd(const Q7 a,const int8x16_t b, + const mve_pred16_t p0) + { + return(vqaddq_m_n_s8(vuninitializedq_s8(),b,a.v,p0)); + }; + + __STATIC_FORCEINLINE int8x16_t vsub(const int8x16_t a,const int8x16_t b) + { + return(vqsubq(a,b)); + }; + + __STATIC_FORCEINLINE int8x16_t vsub(const int8x16_t a,const Q7 b) + { + return(vqsubq_n_s8(a,b.v)); + }; + + __STATIC_FORCEINLINE int8x16_t vsub(const Q7 
a,const int8x16_t b) + { + return(vqsubq_n_s8(b,a.v)); + }; + + __STATIC_FORCEINLINE int8x16_t vsub(const int8x16_t a,const int8x16_t b, + const mve_pred16_t p0) + { + return(vqsubq_m(vuninitializedq_s8(),a,b,p0)); + }; + + __STATIC_FORCEINLINE int8x16_t vsub(const int8x16_t a,const Q7 b, + const mve_pred16_t p0) + { + return(vqsubq_m_n_s8(vuninitializedq_s8(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int8x16_t vsub(const Q7 a,const int8x16_t b, + const mve_pred16_t p0) + { + return(vqsubq_m_n_s8(vuninitializedq_s8(),b,a.v,p0)); + }; + + __STATIC_FORCEINLINE int8x16_t vmul(const int8x16_t a,const int8x16_t b) + { + return(vqdmulhq(a,b)); + }; + + + __STATIC_FORCEINLINE int8x16_t vmul(const int8x16_t a,const Q7 b) + { + return(vqdmulhq_n_s8(a,b.v)); + }; + + __STATIC_FORCEINLINE int8x16_t vmul(const Q7 a,const int8x16_t b) + { + return(vqdmulhq_n_s8(b,a.v)); + }; + + __STATIC_FORCEINLINE int8x16_t vmul(const int8x16_t a,const int8x16_t b, + const mve_pred16_t p0) + { + return(vqdmulhq_m(vuninitializedq_s8(),a,b,p0)); + }; + + + __STATIC_FORCEINLINE int8x16_t vmul(const int8x16_t a,const Q7 b, + const mve_pred16_t p0) + { + return(vqdmulhq_m_n_s8(vuninitializedq_s8(),a,b.v,p0)); + }; + + __STATIC_FORCEINLINE int8x16_t vmul(const Q7 a,const int8x16_t b, + const mve_pred16_t p0) + { + return(vqdmulhq_m_n_s8(vuninitializedq_s8(),b,a.v,p0)); + }; + + template::type = true> + inline int8x16_t vload1(const Q7 *p) + { + return(vld1q(reinterpret_cast(p))); + }; + + /* + + 15*S <= 255 => S <= 17 + + */ + template1) && (S<=17),bool>::type = true> + inline int8x16_t vload1(const Q7 *p) + { + constexpr uint8x16_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S, + 8*S,9*S,10*S,11*S,12*S,13*S,14*S,15*S}; + //uint8x16_t offset = vidupq_u8((uint16_t)0,1); + //offset = vmulq_n_u8(offset,S); + return(vldrbq_gather_offset_s8(reinterpret_cast(p),offset)); + }; + + template17),bool>::type = true> + inline int8x16_t vload1(const Q7 *p) + { + int8x16_t res; + for(std::size_t i=0;i<16;i++) + { + res[i] = p->v; + p += S; + } + + return(res); + }; + + // Dynamic stride + inline int8x16_t vload1(const Q7 *p,index_t stride) + { + if (stride <= 17) + { + uint8x16_t offset = vidupq_u8((uint32_t)0,1); + offset = vmulq_n_u8(offset,stride); + return(vldrbq_gather_offset_s8(reinterpret_cast(p),offset)); + } + else + { + int8x16_t res; + for(std::size_t i=0;i<16;i++) + { + res[i] = p->v; + p += stride; + } + return(res); + } + } + + template::type = true> + inline int8x16_t vload1_z(const Q7 *p,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + return(vld1q_z(reinterpret_cast(p),p0)); + + }; + + template1) && (S<=17),bool>::type = true> + inline int8x16_t vload1_z(const Q7 *p,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + uint8x16_t offset = vidupq_u8((uint32_t)0,1); + offset = vmulq_n_u8(offset,S); + return(vldrbq_gather_offset_z_s8(reinterpret_cast(p),offset,p0)); + }; + + template17),bool>::type = true> + inline int8x16_t vload1_z(const Q7 *p,std::size_t nb,mve_pred16_t p0) + { + (void)p0; + int8x16_t res; + std::size_t i=0; + for(;iv; + p += S; + } + + for(;i<16;i++) + { + res[i] = 0; + p += S; + } + + return(res); + + }; + + // Dynamic stride + inline int8x16_t vload1_z(const Q7 *p,index_t stride,std::size_t nb,mve_pred16_t p0) + { + + if (stride <= 17) + { + uint8x16_t offset = vidupq_u8((uint32_t)0,1); + offset = vmulq_n_u8(offset,stride); + return(vldrbq_gather_offset_z_s8(reinterpret_cast(p),offset,p0)); + } + else + { + int8x16_t res; + std::size_t i=0; + for(;iv; + p += stride; + } + + for(;i<16;i++) + { + res[i] = 0; + 
p += stride; + } + return(res); + } + }; + + + template::type = true> + inline void vstore1(Q7 *p,const int8x16_t val) + { + vst1q(reinterpret_cast(p),val); + }; + + template1) && (S<=17),bool>::type = true> + inline void vstore1(Q7 *p,const int8x16_t val) + { + //uint8x16_t offset={0,1,2,3,4,5,6,7}; + //uint8x16_t offset = vidupq_u8((uint16_t)0,1); + //offset = vmulq_n_u8(offset,S); + constexpr uint8x16_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S, + 8*S,9*S,10*S,11*S,12*S,13*S,14*S,15*S}; + return(vstrbq_scatter_offset_s8(reinterpret_cast(p),offset,val)); + }; + + template17),bool>::type = true> + inline void vstore1(Q7 *p,const int8x16_t val) + { + for(std::size_t i=0;i<16;i++) + { + *p = Q7(val[i]); + p += S; + } + + }; + + // dynamic stride + inline void vstore1(Q7 *p,const index_t stride,const int8x16_t val) + { + if (stride <=17) + { + uint8x16_t offset = vidupq_u8((uint32_t)0,1); + offset = vmulq_n_u8(offset,stride); + return(vstrbq_scatter_offset_s8(reinterpret_cast(p),offset,val)); + } + else + { + for(std::size_t i=0;i<16;i++) + { + *p = Q7(val[i]); + p += stride; + } + } + } + + template::type = true> + inline void vstore1_z(Q7 *p,const int8x16_t val,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + return(vstrbq_p(reinterpret_cast(p),val,p0)); + }; + + template1) && (S<=17),bool>::type = true> + inline void vstore1_z(Q7 *p,const int8x16_t val,std::size_t nb,mve_pred16_t p0) + { + (void)nb; + //uint8x16_t offset={0,1,2,3,4,5,6,7}; + //uint8x16_t offset = vidupq_u8((uint16_t)0,1); + //offset = vmulq_n_u8(offset,S); + constexpr uint8x16_t offset={0*S,1*S,2*S,3*S,4*S,5*S,6*S,7*S, + 8*S,9*S,10*S,11*S,12*S,13*S,14*S,15*S}; + return(vstrbq_scatter_offset_p_s8(reinterpret_cast(p),offset,val,p0)); + }; + + + template17),bool>::type = true> + inline void vstore1_z(Q7 *p,const int8x16_t val,std::size_t nb,mve_pred16_t p0) + { + (void)p0; + for(std::size_t i=0;i(p),offset,val,p0)); + } + else + { + for(std::size_t i=0;i vmacc(const Q<17,14> sum, + const int8x16_t vala, + const int8x16_t valb) + { + return(Q<17,14>(vmladavaq(sum.v,vala,valb))); + }; + + __STATIC_FORCEINLINE Q<17,14> vmacc(const Q<17,14> sum, + const int8x16_t vala, + const int8x16_t valb, + const mve_pred16_t p0) + { + return(Q<17,14>(vmladavaq_p(sum.v,vala,valb,p0))); + }; + + __STATIC_FORCEINLINE Q<17,14> vmacc(const int8x16_t vala, + const int8x16_t valb) + { + return(Q<17,14>(vmladavq(vala,valb))); + }; + + __STATIC_FORCEINLINE Q<17,14> vmacc(const int8x16_t vala, + const int8x16_t valb, + const mve_pred16_t p0) + { + return(Q<17,14>(vmladavq_p(vala,valb,p0))); + }; + + __STATIC_FORCEINLINE Q<17,14> vreduce(const Q<17,14> sum) + { + return(sum); + }; + +}; +#endif + +/*! @} */ +/*! 
@} */ diff --git a/dsppp/Include/dsppp/Neon/basic.hpp b/dsppp/Include/dsppp/Neon/basic.hpp new file mode 100644 index 000000000..828c96489 --- /dev/null +++ b/dsppp/Include/dsppp/Neon/basic.hpp @@ -0,0 +1,133 @@ +// -*- C++ -*- + +#pragma once + +#include +#include +#include +#if 0 + +template +void _Add(const T* pSrcA, + const T* pSrcB, + T* pDst, + const std::size_t l, + const Neon* = nullptr, + typename std::enable_if::is_float && + vector_traits::has_vector,T>::type* = nullptr) +{ + using num = vector_traits; + using VecType = typename num::vector; + constexpr int nb_lanes = num::nb_lanes; + constexpr int lanes_shift = shiftFromValue(nb_lanes); + constexpr int lanes_mask = maskFromShift(lanes_shift); + + //std::cout << "Neon float\r\n" ; + + uint32_t blkCnt; /* Loop counter */ + + VecType vec1; + VecType vec2; + VecType res; + + /* Compute several lanes at a time */ + blkCnt = l >> lanes_shift; + + while (blkCnt > 0U) + { + /* C = A + B */ + + /* Add and then store the results in the destination buffer. */ + vec1 = vld1q(pSrcA); + vec2 = vld1q(pSrcB); + res = vaddq(vec1, vec2); + vst1q(pDst, res); + + /* Increment pointers */ + pSrcA += nb_lanes; + pSrcB += nb_lanes; + pDst += nb_lanes; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Tail */ + blkCnt = l & lanes_mask; + + while (blkCnt > 0U) + { + /* C = A + B */ + + /* Add and store result in destination buffer. */ + *pDst++ = (*pSrcA++) + (*pSrcB++); + + /* Decrement loop counter */ + blkCnt--; + } + + +}; + + + + +template +void _Add(const T* pSrcA_Q, + const T* pSrcB_Q, + T* pDst_Q, + const std::size_t l, + const Neon* = nullptr, + typename std::enable_if::is_fixed && + vector_traits::has_vector,T>::type* = nullptr) +{ + using num = vector_traits; + using VecType = typename num::vector; + using value_type = typename T::value_type; + constexpr int nb_lanes = num::nb_lanes; + constexpr int lanes_shift = shiftFromValue(nb_lanes); + constexpr int lanes_mask = maskFromShift(lanes_shift); + const value_type *pSrcA = reinterpret_cast(pSrcA_Q); + const value_type *pSrcB = reinterpret_cast(pSrcB_Q); + value_type *pDst = reinterpret_cast(pDst_Q); + + uint32_t blkCnt; /* loop counters */ + VecType vecA; + VecType vecB; + + /* Compute 8 outputs at a time */ + blkCnt = l >> lanes_shift; + while (blkCnt > 0U) + { + /* + * C = A + B + * Add and then store the results in the destination buffer. 
+ */ + vecA = vld1q(pSrcA); + vecB = vld1q(pSrcB); + vst1q(pDst, vqaddq(vecA, vecB)); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + /* + * advance vector source and destination pointers + */ + pSrcA += nb_lanes; + pSrcB += nb_lanes; + pDst += nb_lanes; + } + /* + * tail + */ + blkCnt = l & lanes_mask; + if (blkCnt > 0U) + { + mve_pred16_t p0 = num::vctpq(blkCnt); + vecA = vld1q(pSrcA); + vecB = vld1q(pSrcB); + vstrq_p(pDst, vqaddq(vecA, vecB), p0); + } +} + +#endif \ No newline at end of file diff --git a/dsppp/Include/dsppp/Neon/float.hpp b/dsppp/Include/dsppp/Neon/float.hpp new file mode 100644 index 000000000..0dc95759b --- /dev/null +++ b/dsppp/Include/dsppp/Neon/float.hpp @@ -0,0 +1,105 @@ +// -*- C++ -*- + +#pragma once + +/****************** + * + * Neon + * + */ +#if defined(ARM_MATH_NEON) + +template +struct vector_traits::value>::type> +{ + typedef float type; + typedef float storage_type; + typedef float32x4_t vector; + static constexpr bool has_vector = true; + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + + static constexpr int nb_lanes = 4; + + static constexpr float zero_lane() {return 0.0f;}; + + +}; + +namespace inner { + + + + __STATIC_FORCEINLINE float32x4_t vadd(const float32x4_t a,const float32x4_t b) + { + return(vaddq_f32(a,b)); + }; + + __STATIC_FORCEINLINE float32x4_t vmul(const float32x4_t a,const float32x4_t b) + { + return(vmulqq_f32(a,b)); + }; + + __STATIC_FORCEINLINE float32x4_t vconst(const float v) + { + const float32x4_t t = vdupq_n_f32(v) + return(t); + } + + template::type = true> + inline float32x4_t vload1(const float32_t *p) + { + return(vld1q(p)); + }; + + template1),bool>::type = true> + inline float32x4_t vload1(const float32_t *p) + { + float32x4_t res; + res[0] = *p; + p += S; + + res[1] = *p; + p += S; + + res[2] = *p; + p += S; + + res[3] = *p; + p += S; + + return(res); + }; + + template::type = true> + inline void vstore1(float32_t *p,const float32x4_t val) + { + return(vst1q(p,val)); + }; + + template1),bool>::type = true> + inline void vstore1(float32_t *p,const float32x4_t val) + { + *p = val[0]; + p += S; + + *p = val[1]; + p += S; + + *p = val[2]; + p += S; + + *p = val[3]; + p += S; + }; + + + +}; + +#endif diff --git a/dsppp/Include/dsppp/Neon/num_features.hpp b/dsppp/Include/dsppp/Neon/num_features.hpp new file mode 100644 index 000000000..142d4607f --- /dev/null +++ b/dsppp/Include/dsppp/Neon/num_features.hpp @@ -0,0 +1,5 @@ +// -*- C++ -*- + +#pragma once + +#include "float.hpp" diff --git a/dsppp/Include/dsppp/Scalar/basic.hpp b/dsppp/Include/dsppp/Scalar/basic.hpp new file mode 100644 index 000000000..cace44b47 --- /dev/null +++ b/dsppp/Include/dsppp/Scalar/basic.hpp @@ -0,0 +1,189 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup ARCHALG Architecture specific algorithm + * \ingroup DSPPP + * \addtogroup SCALARALG Scalar algorithm + * \ingroup ARCHALG + * @{ + */ + + +#define SCALAR_UNROLL 2 + +template::value && + SameElementType::value,bool>::type = true> +inline void _Fill(DST &v, + const T val, + vector_length_t l, + const Scalar* = nullptr) +{ + constexpr unsigned int U = SCALAR_UNROLL; + index_t i; + + UNROLL_LOOP + for(i=0 ; i <= l-(1<() && + SameElementType::value,bool>::type = true> +inline void _Fill2D(DST &v, + const T val, + const vector_length_t rows, + const vector_length_t cols, + const Scalar* = nullptr) +{ + constexpr unsigned int U = SCALAR_UNROLL; + index_t row=0; + + + for(; row <= rows-(1<(),bool>::type = true> +inline void eval(DA &v, + 
const DB& other, + const vector_length_t l, + const Scalar* = nullptr) +{ + constexpr unsigned int U = SCALAR_UNROLL; + index_t i=0; + + for(i=0 ; i <= l-(1<(),bool>::type = true> +inline void eval2D(DA &v, + const DB& other, + const vector_length_t rows, + const vector_length_t cols, + const Scalar* = nullptr) +{ + constexpr unsigned int U = SCALAR_UNROLL; + index_t row=0; + + + for(; row <= rows-(1<(),bool>::type = true> +inline DotResult _dot(const DA& a, + const DB& b, + const vector_length_t l, + const Scalar* = nullptr) +{ + using Acc = DotResult; + constexpr unsigned int U = SCALAR_UNROLL; + index_t i; + + Acc acc = Acc{}; + + for(i=0 ; i <= l-(1<(),bool>::type = true> +inline void _swap(DA&& a, + DB&& b, + const vector_length_t l, + const Scalar* = nullptr) +{ + for(index_t i=0;i +__STATIC_INLINE void _arm_mat_trans( + const MA &src, + MB &dst, + const Scalar* = nullptr) +{ + DISABLE_LOOP_UNROLL + for(index_t r=0;r < dst.rows() ; r++) + { + dst.row(r) = copy(src.col(r)); + } +} + +template +inline void _dot_m_v(RES &res, + const M&m,const V&v, + const Scalar* = nullptr) +{ + using T = typename traits::Scalar; + using Acc = typename number_traits::accumulator; + uint32_t numRows = m.rows(); + uint32_t numCols = m.columns(); + const T *pSrcA = m.ptr(); + const T *pInA1; /* input data matrix pointer A of Q31 type */ + const T *pInA2; /* input data matrix pointer A of Q31 type */ + const T *pInA3; /* input data matrix pointer A of Q31 type */ + const T *pInA4; /* input data matrix pointer A of Q31 type */ + T *px; /* Temporary output data matrix pointer */ + uint32_t i; + uint16_t row, colCnt; /* loop counters */ + T matData, matData2, vecData, vecData2; + + + /* Process 4 rows at a time */ + row = numRows >> 2; + i = 0u; + px = res.ptr(); + + /* The following loop performs the dot-product of each row in pSrcA with the vector */ + /* row loop */ + while (row > 0) { + /* Initialize accumulators */ + Acc sum1 = Acc{}; + Acc sum2 = Acc{}; + Acc sum3 = Acc{}; + Acc sum4 = Acc{}; + + + /* Loop unrolling: process 2 columns per iteration */ + //colCnt = numCols; + + /* Initialize pointers to the starting address of the column being processed */ + pInA1 = pSrcA + i; + pInA2 = pInA1 + m.stride(); + pInA3 = pInA2 + m.stride(); + pInA4 = pInA3 + m.stride(); + + + // Main loop: matrix-vector multiplication + for(colCnt = 0 ; colCnt < numCols; colCnt ++) + { + // Read 2 values from vector + vecData = v[colCnt]; + // Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate + matData = *(pInA1)++; + sum1 = inner::mac(sum1, matData, vecData); + matData = *(pInA2)++; + sum2 = inner::mac(sum2, matData, vecData); + matData = *(pInA3)++; + sum3 = inner::mac(sum3, matData, vecData); + matData = *(pInA4)++; + sum4 = inner::mac(sum4, matData, vecData); + } + + /* Saturate and store the result in the destination buffer */ + *px++ = inner::from_accumulator(sum1); + *px++ = inner::from_accumulator(sum2); + *px++ = inner::from_accumulator(sum3); + *px++ = inner::from_accumulator(sum4); + + i = i + m.stride() * 4; + + /* Decrement the row loop counter */ + row--; + } + + /* process any remaining rows */ + row = numRows & 3u; + while (row > 0) { + + Acc sum = Acc{}; + pInA1 = pSrcA + i; + + int32_t k=0; + for(k=0; k <= (int)numCols-2; k += 2) + { + vecData = v[k]; + vecData2 = v[k+1]; + matData = *(pInA1)++; + matData2 = *(pInA1)++; + sum = inner::mac(sum, matData, vecData); + sum = inner::mac(sum, matData2, vecData2); + } + // process remainder of row + + + for(; k < (int)numCols; 
k ++) + { + sum = inner::mac(sum ,*pInA1++, v[k]); + } + + *px++ = inner::from_accumulator(sum); + i = i + m.stride(); + row--; + } +} + +#include "matrix_multiply_fixed.hpp" +#include "matrix_multiply_float.hpp" + +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/Scalar/matrix_multiply_fixed.hpp b/dsppp/Include/dsppp/Scalar/matrix_multiply_fixed.hpp new file mode 100644 index 000000000..461e63a76 --- /dev/null +++ b/dsppp/Include/dsppp/Scalar/matrix_multiply_fixed.hpp @@ -0,0 +1,124 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup SCALARALG + * @{ + */ + +template::Scalar>::is_fixed,bool>::type = true> +__STATIC_INLINE void _dot_m_m(const MA&pSrcA,const MB&pSrcB, + RES &&pDst, + const TMP &BT, + const Scalar* = nullptr) +{ + using T = typename traits::Scalar; + using Acc = typename number_traits::accumulator; + + T *pIn1 = pSrcA.ptr(); /* Input data matrix pointer A */ + T *pIn2 = pSrcB.ptr(); /* Input data matrix pointer B */ + T *pInA = pSrcA.ptr(); /* Input data matrix pointer A */ + T *pInB = pSrcB.ptr(); /* Input data matrix pointer B */ + T *pOut = pDst.ptr(); /* Output data matrix pointer */ + T *px; /* Temporary output data matrix pointer */ + Acc sum; /* Accumulator */ + uint16_t numRowsA = pSrcA.rows(); /* Number of rows of input matrix A */ + uint16_t numColsB = pSrcB.columns(); /* Number of columns of input matrix B */ + uint16_t numColsA = pSrcA.columns(); /* Number of columns of input matrix A */ + uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */ + + (void)BT; + /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ + /* row loop */ + do + { + /* Output pointer is set to starting address of row being processed */ + px = pOut + i; + + /* For every row wise process, column loop counter is to be initiated */ + col = numColsB; + + /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */ + pIn2 = pSrcB.ptr(); + + /* column loop */ + do + { + /* Set the variable sum, that acts as accumulator, to zero */ + sum = Acc{}; + + /* Initialize pointer pIn1 to point to starting address of column being processed */ + pIn1 = pInA; + + + /* Loop unrolling: Compute 4 MACs at a time. */ + colCnt = numColsA >> 2U; + + /* matrix multiplication */ + while (colCnt > 0U) + { + /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */ + + /* Perform the multiply-accumulates */ + sum = inner::mac(sum, *pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + sum = inner::mac(sum,*pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + sum = inner::mac(sum, *pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + sum = inner::mac(sum, *pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + /* Decrement loop counter */ + colCnt--; + } + + /* Loop unrolling: Compute remaining MACs */ + colCnt = numColsA % 0x4U; + + + while (colCnt > 0U) + { + /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p) * b(p,n) */ + + /* Perform the multiply-accumulates */ + sum = inner::mac(sum ,*pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + /* Decrement loop counter */ + colCnt--; + } + + /* Convert result from 2.62 to 1.31 format and store in destination buffer */ + *px++ = inner::from_accumulator(sum); + + /* Decrement column loop counter */ + col--; + + /* Update pointer pIn2 to point to starting address of next column */ + pIn2 = pInB + (numColsB - col) ; + + } while (col > 0U); + + /* Update pointer pInA to point to starting address of next row */ + i = i + pDst.stride(); + pInA = pInA + pSrcA.stride(); + + /* Decrement row loop counter */ + row--; + + } while (row > 0U); + + +} + + +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/Scalar/matrix_multiply_float.hpp b/dsppp/Include/dsppp/Scalar/matrix_multiply_float.hpp new file mode 100644 index 000000000..96cf08cf3 --- /dev/null +++ b/dsppp/Include/dsppp/Scalar/matrix_multiply_float.hpp @@ -0,0 +1,119 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup SCALARALG + * @{ + */ + +template::Scalar>::is_float,bool>::type = true> +__STATIC_INLINE void _dot_m_m(const MA&pSrcA,const MB&pSrcB, + RES &&pDst, + const Scalar* = nullptr) +{ + using T = typename traits::Scalar; + using Acc = typename number_traits::accumulator; + //using Comp = typename number_traits::compute_type; + T *pIn1 = pSrcA.ptr(); /* Input data matrix pointer A */ + T *pIn2 = pSrcB.ptr(); /* Input data matrix pointer B */ + T *pInA = pSrcA.ptr(); /* Input data matrix pointer A */ + T *pInB = pSrcB.ptr(); /* Input data matrix pointer B */ + T *pOut = pDst.ptr(); /* Output data matrix pointer */ + T *px; /* Temporary output data matrix pointer */ + Acc sum; /* Accumulator */ + uint16_t numRowsA = pSrcA.rows(); /* Number of rows of input matrix A */ + uint16_t numColsB = pSrcB.columns(); /* Number of columns of input matrix B */ + uint16_t numColsA = pSrcA.columns(); /* Number of columns of input matrix A */ + uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */ + + + /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ + /* row loop */ + do + { + /* Output pointer is set to starting address of row being processed */ + px = pOut + i; + + /* For every row wise process, column loop counter is to be initiated */ + col = numColsB; + + /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */ + pIn2 = pSrcB.ptr(); + + /* column loop */ + do + { + /* Set the variable sum, that acts as accumulator, to zero */ + sum = Acc{}; + + /* Initialize pointer pIn1 to point to starting address of column being processed */ + pIn1 = pInA; + + + /* Loop unrolling: Compute 4 MACs at a time. */ + colCnt = numColsA >> 2U; + + /* matrix multiplication */ + while (colCnt > 0U) + { + /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */ + + /* Perform the multiply-accumulates */ + sum = inner::mac(sum, *pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + sum = inner::mac(sum, *pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + sum = inner::mac(sum, *pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + sum = inner::mac(sum, *pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + /* Decrement loop counter */ + colCnt--; + } + + /* Loop unrolling: Compute remaining MACs */ + colCnt = numColsA % 0x4U; + + while (colCnt > 0U) + { + /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... 
+ a(m,n) * b(n,p) */ + + /* Perform the multiply-accumulates */ + sum = inner::mac(sum, *pIn1++, *pIn2); + pIn2 += pSrcB.stride(); + + /* Decrement loop counter */ + colCnt--; + } + + /* Store result in destination buffer */ + *px++ = inner::from_accumulator(sum); + + /* Decrement column loop counter */ + col--; + + /* Update pointer pIn2 to point to starting address of next column */ + pIn2 = pInB + (numColsB - col); + + } while (col > 0U); + + /* Update pointer pInA to point to starting address of next row */ + i = i + pDst.stride(); + pInA = pInA + pSrcA.stride(); + + /* Decrement row loop counter */ + row--; + + } while (row > 0U); + + +} + +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/algorithms.hpp b/dsppp/Include/dsppp/algorithms.hpp new file mode 100644 index 000000000..6f2f205ab --- /dev/null +++ b/dsppp/Include/dsppp/algorithms.hpp @@ -0,0 +1,269 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup DSPPP C++ extension + * C++ template extension to CMSIS-DSP. It is not yet part of + * the pack but the headers can be found on the + * [CMSIS-DSP github](https://github.com/ARM-software/CMSIS-DSP/dsppp/Include) + * The principles are described in this @ref dsppp_main "page" + * @{ + * @} + */ + + +/** +In this file we have kernels that are written in an +architecture independant way (using operators of the library) + +*/ + +namespace arm_cmsis_dsp { + +/** \addtogroup ALGO Architecture independent algorithms + * \ingroup DSPPP + * @{ + */ + +/* + +Matrix transpose + +*/ + + +/** + * Transpose a matrix. + * + * @param dst Destination matrix. + * @param src Source matrix. + * + */ +template::value && + HasMatrixIndexing::value /*&& + SameElementType::value*/,bool>::type = true> +inline void transposeTo(MA &dst, + const MB& src) +{ + _arm_mat_trans(src,dst,CURRENT_ARCH); +} + + +/* + +Init a diagonal matrix (0 outside of diagonal) + +*/ +template typename A, + typename VB, +typename std::enable_if::value && + SameElementType::value,bool>::type = true> +inline void _diagonal(Matrix &v, + const VB& other, + const vector_length_t rows) +{ + UNROLL_LOOP + for(index_t r=0;r < rows ; r++) + { + v.row(r) = P{}; + v(r,r) = other[r]; + } +} + + +/* + + +Fill diagonal of an existing matrix + +*/ +template typename A, + typename VB, +typename std::enable_if::value && + SameElementType::value,bool>::type = true> +inline void _fill_diagonal(Matrix &v, + const VB& other, + const vector_length_t rows) +{ + for(index_t r=0;r < rows ; r++) + { + v(r,r) = other[r]; + } +} + +template typename A> +inline void _identity(Matrix &v, + const vector_length_t rows) +{ + UNROLL_LOOP + for(index_t r=0;r < rows ; r++) + { + v.row(r) = P{}; + v(r,r) = number_traits
<P>
::one(); + } +} + + + +template::value,bool>::type = true> +inline typename OutputVector::type dot(const M&m,const V&v) +{ + typename OutputVector::type res; + _dot_m_v(res,m,v,CURRENT_ARCH); + return(res); +} + +template::value,bool>::type = true> +inline typename OutputVector::type dot(const M&m,const V&v) +{ + typename OutputVector::type res(m.rows()); + _dot_m_v(res,m,v,CURRENT_ARCH); + return(res); +} + +template::value,bool>::type = true> +inline void dot(RES && res,const M&m,const V&v) +{ + //typename OutputVector::type res(m.rows()); + _dot_m_v(res,m,v,CURRENT_ARCH); +} + +template::value && + number_traits::Scalar>::is_fixed,bool>::type = true> +inline typename OutputMatrix::type dot(const MA&ma,const MB&mb) +{ + + typename OutputMatrix::type res; + auto BT = mb.transpose(); + + //using M = MatMult::type,MA,MB,typename OutputMatrix::type,decltype(BT)>; + _dot_m_m(ma,mb,res,BT,CURRENT_ARCH); + return(res); +} + +template::value && + number_traits::Scalar>::is_float,bool>::type = true> +inline typename OutputMatrix::type dot(const MA&ma,const MB&mb) +{ + + typename OutputMatrix::type res; + + //using M = MatMult::type,MA,MB,typename OutputMatrix::type,decltype(BT)>; + _dot_m_m(ma,mb,res,CURRENT_ARCH); + return(res); +} + +template::value && + number_traits::Scalar>::is_fixed,bool>::type = true> +inline typename OutputMatrix::type dot(const MA&ma,const MB&mb) +{ + typename OutputMatrix::type res(ma.rows(),mb.columns()); + auto BT = mb.transpose(); + + //using M = MatMult::type,MA,MB,typename OutputMatrix::type,decltype(BT)>; + _dot_m_m(ma,mb,res,BT,CURRENT_ARCH); + return(res); +} + +template::value && + number_traits::Scalar>::is_float,bool>::type = true> +inline typename OutputMatrix::type dot(const MA&ma,const MB&mb) +{ + typename OutputMatrix::type res(ma.rows(),mb.columns()); + + //using M = MatMult::type,MA,MB,typename OutputMatrix::type,decltype(BT)>; + _dot_m_m(ma,mb,res,CURRENT_ARCH); + return(res); +} + +/* + + +Get res matrix as argument to avoid memory allocation when +assigning the result to a different type of Matrix (like a Matrix view). + +*/ +template::value && + number_traits::Scalar>::is_float,bool>::type = true> +inline void dot(RES &&res,const MA&ma,const MB&mb) +{ + //typename OutputMatrix::type res(ma.rows(),mb.columns()); + + //using M = MatMult::type,MA,MB,typename OutputMatrix::type,decltype(BT)>; + _dot_m_m(ma,mb,std::forward(res),CURRENT_ARCH); +} + +template::value && + number_traits::Scalar>::is_float,bool>::type = true> +inline typename OutputMatrix::type dot(const MA&ma,const MB&mb) +{ + typename OutputMatrix::type res(ma.rows(),mb.columns()); + + //using M = MatMult::type,MA,MB,typename OutputMatrix::type,decltype(mbt)>; + _dot_m_m(ma,mb,res,CURRENT_ARCH); + return(res); +} + +template::value && + number_traits::Scalar>::is_fixed,bool>::type = true> +inline typename OutputMatrix::type dot(const MA&ma,const MB&mb,const TMP &mbt) +{ + typename OutputMatrix::type res(ma.rows(),mb.columns()); + + //using M = MatMult::type,MA,MB,typename OutputMatrix::type,decltype(mbt)>; + _dot_m_m(ma,mb,res,mbt,CURRENT_ARCH); + return(res); +} + + + +template +Matrix mk_identity(const vector_length_t l) +{ + Matrix res(l,l); + _identity(res,l); + return(res); +}; + + +template +Matrix mk_identity() +{ + Matrix res; + _identity(res,L); + return(res); +}; + +/*! 
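+   Example (illustrative sketch only; Matrix, float32_t and the TMP_ALLOC
+   allocator are provided or configured by other headers in this patch):
+
+       Matrix<float32_t,4,4> a;                 // static 4x4 matrix
+       Matrix<float32_t,4,4> b;
+       auto id = mk_identity<float32_t,4>();    // 4x4 identity matrix
+       auto c  = dot(a, b);                     // matrix * matrix product
+       Matrix<float32_t,4,4> at;
+       transposeTo(at, a);                      // transpose kernel selected through CURRENT_ARCH
+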
@} */ + +} diff --git a/dsppp/Include/dsppp/arch.hpp b/dsppp/Include/dsppp/arch.hpp new file mode 100644 index 000000000..7326ba182 --- /dev/null +++ b/dsppp/Include/dsppp/arch.hpp @@ -0,0 +1,64 @@ +// -*- C++ -*- +/** @file */ +#pragma once + + +namespace arm_cmsis_dsp { + +/** \addtogroup ARCH Architecture detection + * \ingroup DSPPP + * @{ + */ + +/** + * Scalar architecture + */ +class Scalar {}; + +/** + * Architecture supporting DSP extensions + */ +class DSP:public Scalar {}; + +/** + * v8.1M Architecture + */ +class Helium:public DSP {}; + +/** + * v8.2M Architecture + */ +class Helium82:public Helium {}; + +/** + * Architecture supporting Neon + */ +class Neon:public Scalar {}; + +/*! @} */ + +} + +#include "arch_detection.hpp" + + +#if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF) +#define ARCH Helium82 +#elif defined(ARM_MATH_DSP) +#define ARCH DSP +#elif defined(ARM_MATH_NEON) +#define ARCH Neon +#else +#define ARCH Scalar +#endif + +#define CURRENT_ARCH (ARCH*)nullptr + +#if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_DSP) || defined(ARM_MATH_NEON) +#define HAS_VECTOR +#endif + +#if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF) +#define HAS_PREDICATED_LOOP +#endif + diff --git a/dsppp/Include/dsppp/arch_detection.hpp b/dsppp/Include/dsppp/arch_detection.hpp new file mode 100644 index 000000000..d8194b407 --- /dev/null +++ b/dsppp/Include/dsppp/arch_detection.hpp @@ -0,0 +1,281 @@ +// -*- C++ -*- + +#pragma once + + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* Compiler specific diagnostic adjustment */ +#if defined ( __CC_ARM ) + +#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 ) + +#elif defined ( __APPLE_CC__ ) + #pragma GCC diagnostic ignored "-Wold-style-cast" + +#elif defined ( __GNUC__ ) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wsign-conversion" + #pragma GCC diagnostic ignored "-Wconversion" + #pragma GCC diagnostic ignored "-Wunused-parameter" + #define GCC_COMPILER + +#elif defined ( __ICCARM__ ) + +#elif defined ( __TI_ARM__ ) + +#elif defined ( __CSMC__ ) + +#elif defined ( __TASKING__ ) + +#elif defined ( _MSC_VER ) + +#else + #error Unknown compiler +#endif + + +/* Included for instrinsics definitions */ +#if defined (_MSC_VER ) +#include +#define __STATIC_FORCEINLINE static __forceinline +#define __STATIC_INLINE static __inline +#define __ALIGNED(x) __declspec(align(x)) +#define __WEAK +#elif defined ( __APPLE_CC__ ) +#include +#define __ALIGNED(x) __attribute__((aligned(x))) +#define __STATIC_FORCEINLINE static inline __attribute__((always_inline)) +#define __STATIC_INLINE static inline +#define __WEAK +#elif defined (__GNUC_PYTHON__) +#include +#define __ALIGNED(x) __attribute__((aligned(x))) +#define __STATIC_FORCEINLINE static inline __attribute__((always_inline)) +#define __STATIC_INLINE static inline +#define __WEAK +#else +#include "cmsis_compiler.h" +#endif + + + +#include +#include +#include +#include + +/* evaluate ARM DSP feature */ +#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) + #define ARM_MATH_DSP 1 +#endif + +#if defined(ARM_MATH_NEON) + #if defined(_MSC_VER) && defined(_M_ARM64EC) + #include + #else + #include + #endif + #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + #if !defined(ARM_MATH_NEON_FLOAT16) + #define ARM_MATH_NEON_FLOAT16 + #endif + #endif +#endif + +#if !defined(ARM_MATH_AUTOVECTORIZE) + + +#if defined(__ARM_FEATURE_MVE) +#if __ARM_FEATURE_MVE + #if !defined(ARM_MATH_MVEI) + #define ARM_MATH_MVEI + 
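+  /* Illustrative note (sketch only, not used by the build): the architecture
+     classes and the CURRENT_ARCH macro from arch.hpp drive overload-based
+     dispatch of the kernels in this patch. A kernel is typically declared
+     once per architecture with a trailing tag pointer, e.g.
+
+         template<typename T>
+         void my_kernel(T *p, vector_length_t l, const Scalar* = nullptr);  // generic path
+         template<typename T>
+         void my_kernel(T *p, vector_length_t l, const Helium* = nullptr);  // MVE path
+
+     and call sites pass CURRENT_ARCH so that the most derived architecture
+     class selects the most specialized overload at compile time (my_kernel
+     is a hypothetical name used only for this illustration). */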
#endif +#endif + +#if (__ARM_FEATURE_MVE & 2) + #if !defined(ARM_MATH_MVEF) + #define ARM_MATH_MVEF + #endif + #if !defined(ARM_MATH_MVE_FLOAT16) + #define ARM_MATH_MVE_FLOAT16 + #endif +#endif + +#endif /*defined(__ARM_FEATURE_MVE)*/ +#endif /*!defined(ARM_MATH_AUTOVECTORIZE)*/ + + +#if defined (ARM_MATH_HELIUM) + #if !defined(ARM_MATH_MVEF) + #define ARM_MATH_MVEF + #endif + + #if !defined(ARM_MATH_MVEI) + #define ARM_MATH_MVEI + #endif + + #if !defined(ARM_MATH_MVE_FLOAT16) + #define ARM_MATH_MVE_FLOAT16 + #endif +#endif + + + +#if defined ( __CC_ARM ) + /* Enter low optimization region - place directly above function definition */ + #if defined( __ARM_ARCH_7EM__ ) + #define LOW_OPTIMIZATION_ENTER \ + _Pragma ("push") \ + _Pragma ("O1") + #else + #define LOW_OPTIMIZATION_ENTER + #endif + + /* Exit low optimization region - place directly after end of function definition */ + #if defined ( __ARM_ARCH_7EM__ ) + #define LOW_OPTIMIZATION_EXIT \ + _Pragma ("pop") + #else + #define LOW_OPTIMIZATION_EXIT + #endif + + /* Enter low optimization region - place directly above function definition */ + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + + /* Exit low optimization region - place directly after end of function definition */ + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT + +#elif defined (__ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 ) + #define LOW_OPTIMIZATION_ENTER + #define LOW_OPTIMIZATION_EXIT + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT + +#elif defined ( __APPLE_CC__ ) + #define LOW_OPTIMIZATION_ENTER + #define LOW_OPTIMIZATION_EXIT + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT + +#elif defined ( __GNUC__ ) + #define LOW_OPTIMIZATION_ENTER \ + __attribute__(( optimize("-O1") )) + #define LOW_OPTIMIZATION_EXIT + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT + +#elif defined ( __ICCARM__ ) + /* Enter low optimization region - place directly above function definition */ + #if defined ( __ARM_ARCH_7EM__ ) + #define LOW_OPTIMIZATION_ENTER \ + _Pragma ("optimize=low") + #else + #define LOW_OPTIMIZATION_ENTER + #endif + + /* Exit low optimization region - place directly after end of function definition */ + #define LOW_OPTIMIZATION_EXIT + + /* Enter low optimization region - place directly above function definition */ + #if defined ( __ARM_ARCH_7EM__ ) + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER \ + _Pragma ("optimize=low") + #else + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + #endif + + /* Exit low optimization region - place directly after end of function definition */ + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT + +#elif defined ( __TI_ARM__ ) + #define LOW_OPTIMIZATION_ENTER + #define LOW_OPTIMIZATION_EXIT + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT + +#elif defined ( __CSMC__ ) + #define LOW_OPTIMIZATION_ENTER + #define LOW_OPTIMIZATION_EXIT + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT + +#elif defined ( __TASKING__ ) + #define LOW_OPTIMIZATION_ENTER + #define LOW_OPTIMIZATION_EXIT + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT + +#elif defined ( _MSC_VER ) || defined(__GNUC_PYTHON__) + #define LOW_OPTIMIZATION_ENTER + #define LOW_OPTIMIZATION_EXIT + #define IAR_ONLY_LOW_OPTIMIZATION_ENTER + #define IAR_ONLY_LOW_OPTIMIZATION_EXIT +#endif + + + +/* Compiler specific diagnostic adjustment */ +#if defined ( __CC_ARM ) + +#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 
6010050 ) + +#elif defined ( __APPLE_CC__ ) + +#elif defined ( __GNUC__ ) +#pragma GCC diagnostic pop + +#elif defined ( __ICCARM__ ) + +#elif defined ( __TI_ARM__ ) + +#elif defined ( __CSMC__ ) + +#elif defined ( __TASKING__ ) + +#elif defined ( _MSC_VER ) + +#else + #error Unknown compiler +#endif + +#ifdef __cplusplus +} +#endif + +#if defined(__ARM_FEATURE_MVE) && __ARM_FEATURE_MVE +#include +#endif + +#if !(__ARM_FEATURE_MVE & 2) + #if !defined(DISABLEFLOAT16) + #if defined(__ARM_FP16_FORMAT_IEEE) || defined(__ARM_FP16_FORMAT_ALTERNATIVE) + typedef __fp16 float16_t; + #define ARM_FLOAT16_SUPPORTED + #endif + #endif +#else + /* When Vector float16, this flag is always defined and can't be disabled */ + #define ARM_FLOAT16_SUPPORTED +#endif + +#if defined(ARM_FLOAT16_SUPPORTED) + +#if defined(__ICCARM__) + +#define F16INFINITY ((float16_t) INFINITY) + +#else + +#define F16INFINITY ((float16_t)__builtin_inf()) + +#endif + +#endif + + + + diff --git a/dsppp/Include/dsppp/common.hpp b/dsppp/Include/dsppp/common.hpp new file mode 100644 index 000000000..cd272c03e --- /dev/null +++ b/dsppp/Include/dsppp/common.hpp @@ -0,0 +1,73 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#include +#include + +// For compiler detection +#include "arch.hpp" + + +#define ARM_COMPUTE_DISABLE_UNROLL +// For loop (not for fusion unrolling functions) +#define MEMORY_POOL_ALIGNMENT 128 +//#define MEMORY_ALLOCATION_DEBUG + +// TMP_ALLOC must be defined to use the library +// It is generally defined in an external header not +// part of the library. +// By default it is using the malloc allocator + +#ifndef TMP_ALLOC +#define TMP_ALLOC malloc_allocator +#endif + +#if !defined(GCC_COMPILER) +// clang / AC6 +#if defined(ARM_COMPUTE_DISABLE_UNROLL) +#define UNROLL_LOOP _Pragma ("clang loop unroll(disable)") +#else +#define UNROLL_LOOP _Pragma("clang loop unroll_count(4)") +#endif + +#define DISABLE_LOOP_UNROLL _Pragma("clang loop unroll(disable)") + +#else +// GCC +#define UNROLL_LOOP +#define DISABLE_LOOP_UNROLL +#endif + +namespace arm_cmsis_dsp { + +/** \addtogroup COMMON Common types and constants + * \ingroup DSPPP + * @{ + */ + // Dynamic objects (dimensions only known at runtime) + constexpr int DYNAMIC = -1; + + // Dynamic objects (dimensions only known at runtime) but + // with some constraints (like stride == nb_cols) + constexpr int CONSTRAINED_DYNAMIC = -2; + + // It must be a signed datatype + typedef int32_t index_t; + typedef int32_t vector_length_t; + +/*! @} */ + +/** \addtogroup DEBUG Tools for debugging + * \ingroup DSPPP + * @{ + */ +template +void PrintType(void) +{ + //T t; + std::cout << __PRETTY_FUNCTION__ << "\r\n"; +}; + +/*! @} */ +} diff --git a/dsppp/Include/dsppp/fixed_point.hpp b/dsppp/Include/dsppp/fixed_point.hpp new file mode 100644 index 000000000..82bc2d437 --- /dev/null +++ b/dsppp/Include/dsppp/fixed_point.hpp @@ -0,0 +1,1036 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#include +#include "arch.hpp" +#include + +#include +#include +namespace arm_cmsis_dsp { + +/** \addtogroup FIXED Fixed point datatypes + * \ingroup DSPPP + * @{ + */ + +/* + +Normally those kind of definitions are in a compiler file +in Core or Core_A. + +But for MSVC compiler it is a bit special. The goal is very specific +to CMSIS-DSP and only to allow the use of this library from other +systems like Python or Matlab. + +MSVC is not going to be used to cross-compile to ARM. So, having a MSVC +compiler file in Core or Core_A would not make sense. 
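+
+For reference, the fallbacks defined below follow the usual ARM saturation
+semantics: for example __SSAT(40000, 16) returns 32767, __SSAT(-40000, 16)
+returns -32768, __USAT(300, 8) returns 255 and __USAT(-5, 8) returns 0.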
+ +*/ +#if defined ( _MSC_VER ) || defined(__GNUC_PYTHON__) || defined(__APPLE_CC__) +__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data) +{ + if (data == 0U) { return 32U; } + + uint32_t count = 0U; + uint32_t mask = 0x80000000U; + + while ((data & mask) == 0U) + { + count += 1U; + mask = mask >> 1U; + } + return count; +} + +__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat) +{ + if ((sat >= 1U) && (sat <= 32U)) + { + const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U); + const int32_t min = -1 - max ; + if (val > max) + { + return max; + } + else if (val < min) + { + return min; + } + } + return val; +} + +__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat) +{ + if (sat <= 31U) + { + const uint32_t max = ((1U << sat) - 1U); + if (val > (int32_t)max) + { + return max; + } + else if (val < 0) + { + return 0U; + } + } + return (uint32_t)val; +} +#endif + +#if !defined(ARM_MATH_DSP) +__STATIC_FORCEINLINE int32_t clip_int64_to_q31( + int64_t x) + { + return ((int32_t) (x >> 32) != ((int32_t) x >> 31)) ? + ((0x7FFFFFFF ^ ((int32_t) (x >> 63)))) : (int32_t) x; + } + +__STATIC_FORCEINLINE int32_t __QADD( + int32_t x, + int32_t y) + { + return ((int32_t)(clip_int64_to_q31((int64_t)x + (int32_t)y))); + } + + + /* + * @brief C custom defined QSUB + */ + __STATIC_FORCEINLINE int32_t __QSUB( + int32_t x, + int32_t y) + { + return ((int32_t)(clip_int64_to_q31((int64_t)x - (int32_t)y))); + } + + +#endif + + +constexpr bool test64(const int M,const int F,const int S){return((M+F+S)>32 && (M+F+S)<=64);} +constexpr bool test32(const int M,const int F,const int S){return((M+F+S)>16 && (M+F+S)<=32);} +constexpr bool test16(const int M,const int F,const int S){return((M+F+S)>8 && (M+F+S)<=16);} +constexpr bool test8 (const int M,const int F,const int S){return((M+F+S)<=8);} + +template +struct fixed_storage_type +{ +}; + +template +struct fixed_storage_type +{ + typedef int64_t value_type; + typedef int64_t wider_type; + typedef int32_t narrow_type; +}; + +template +struct fixed_storage_type +{ + typedef uint64_t value_type; + typedef uint64_t wider_type; + typedef uint32_t narrow_type; +}; + + +template +struct fixed_storage_type +{ + typedef int32_t value_type; + typedef int64_t wider_type; + typedef int16_t narrow_type; +}; + +template +struct fixed_storage_type +{ + typedef uint32_t value_type; + typedef uint64_t wider_type; + typedef uint16_t narrow_type; +}; + + + +template +struct fixed_storage_type +{ + typedef int16_t value_type; + typedef int32_t wider_type; + typedef int8_t narrow_type; +}; + +template +struct fixed_storage_type +{ + typedef uint16_t value_type; + typedef uint32_t wider_type; + typedef uint8_t narrow_type; +}; + + +template +struct fixed_storage_type +{ + typedef int8_t value_type; + typedef int16_t wider_type; + typedef int8_t narrow_type; +}; + +template +struct fixed_storage_type +{ + typedef uint8_t value_type; + typedef uint16_t wider_type; + typedef uint8_t narrow_type; +}; + + +template::value_type> +struct Q {}; + +template +struct Q { + constexpr static int fracBits = F; + constexpr static int mantissaBits = M; + constexpr static bool isSigned = true; + using value_type = typename fixed_storage_type::value_type; + using wider_type = typename fixed_storage_type::wider_type; + constexpr static value_type maxVal = 0x7FFFFFFFFFFFFFFFLL; + constexpr static value_type minVal = 0x8000000000000000LL; + + + constexpr static value_type convert(const float f) { + return(f >= 1.0f ? maxVal : (f <= -1.0f ? 
minVal : value_type(f * (float)((maxVal >> (63 - F)) )))); + }; + + value_type v; + constexpr Q():v(0){}; + constexpr explicit Q(const value_type x):v(x){}; + constexpr static Q f(const float x){return Q(convert(x));} + + constexpr static Q one() {return f(1.0f);}; + + Q(Q&& other)=default; + Q(const Q& other)=default; + Q& operator=(Q&& other)=default; + Q& operator=(const Q& other)=default; + + + constexpr explicit Q(const Q&other) + :v{value_type(other.v)} {}; + + bool operator==(const Q& b) const + { + return(v == b.v); + } + + bool operator!=(const Q& b) const + { + return(v != b.v); + } + + bool operator<(const Q& b) const + { + return(v < b.v); + } + + bool operator>(const Q& b) const + { + return(v > b.v); + } + + bool operator<=(const Q& b) const + { + return(v <= b.v); + } + + bool operator>=(const Q& b) const + { + return(v >= b.v); + } + + + Q & operator+=(const Q other) + { + v += other.v; + return(*this); + } + + Q & operator-=(const Q other) + { + v -= other.v; + return(*this); + } + + + friend std::ostream& operator<< (std::ostream& stream, const Q& other) { + stream << double(1.0*other.v / (maxVal >> (63 - F))) << "_Q(" << M << "," << F << ")";; + return(stream); + } + +}; + +template +struct Q { + constexpr static int fracBits = F; + constexpr static int mantissaBits = M; + constexpr static bool isSigned = false; + using value_type = typename fixed_storage_type::value_type; + using wider_type = typename fixed_storage_type::wider_type; + constexpr static value_type maxVal = 0xFFFFFFFFFFFFFFFFLL; + constexpr static value_type convert(const float f) { + return(f >= 1.0f ? maxVal : (f <= 0.0f ? 0LL : value_type(f * (float)((maxVal >> (64 - F)))))); + }; + + value_type v; + constexpr Q():v(0){}; + constexpr explicit Q(const value_type x):v(x){}; + constexpr static Q f(const float x){return Q(convert(x));} + + constexpr static Q one() {return f(1.0f);}; + + Q(Q&& other)=default; + Q(const Q& other)=default; + Q& operator=(Q&& other)=default; + Q& operator=(const Q& other)=default; + + friend std::ostream& operator<< (std::ostream& stream, const Q& other) { + stream << double(1.0*other.v / (maxVal >> (64 - F))) << "_UQ(" << M << "," << F << ")";; + return(stream); + } + + bool operator==(const Q& b) const + { + return(v == b.v); + } + + bool operator!=(const Q& b) const + { + return(v != b.v); + } + + bool operator<(const Q& b) const + { + return(v < b.v); + } + + bool operator>(const Q& b) const + { + return(v > b.v); + } + + bool operator<=(const Q& b) const + { + return(v <= b.v); + } + + bool operator>=(const Q& b) const + { + return(v >= b.v); + } + +}; + + +template +struct Q { + constexpr static int fracBits = F; + constexpr static int mantissaBits = M; + constexpr static bool isSigned = true; + using value_type = typename fixed_storage_type::value_type; + using wider_type = typename fixed_storage_type::wider_type; + constexpr static value_type maxVal = 0x7FFFFFFFL; + constexpr static value_type minVal = 0x80000000L; + constexpr static value_type sat(const wider_type i) { + return (i > (value_type)maxVal ? maxVal : (i<(value_type)minVal ? minVal : i)); + }; + + constexpr static value_type convert(const float f) { + return(f >= 1.0f ? maxVal : (f <= -1.0f ? 
minVal : value_type(f * (float)((wider_type)1<&other): + v{value_type(other.v)} {}; + + bool operator==(const Q& b) const + { + return(v == b.v); + } + + bool operator!=(const Q& b) const + { + return(v != b.v); + } + + bool operator<(const Q& b) const + { + return(v < b.v); + } + + bool operator>(const Q& b) const + { + return(v > b.v); + } + + bool operator<=(const Q& b) const + { + return(v <= b.v); + } + + bool operator>=(const Q& b) const + { + return(v >= b.v); + } + + Q & operator+=(const Q other) + { + v = __QADD(v,other.v); + return(*this); + } + + Q & operator-=(const Q other) + { + v = __QSUB(v,other.v); + return(*this); + } + + friend std::ostream& operator<< (std::ostream& stream, const Q& other) { + stream << double(1.0*other.v / ((wider_type)1< +struct Q { + constexpr static int fracBits = F; + constexpr static int mantissaBits = M; + constexpr static bool isSigned = false; + using value_type = typename fixed_storage_type::value_type; + using wider_type = typename fixed_storage_type::wider_type; + constexpr static value_type maxVal = 0xFFFFFFFFL; + constexpr static value_type sat(const wider_type i) { + return (i > (value_type)maxVal ? maxVal : i); + }; + constexpr static value_type convert(const float f) { + return(f >= 1.0f ? maxVal : (f <= 0.0f ? 0 : value_type(f * (float)((wider_type)1<(const Q& b) const + { + return(v > b.v); + } + + bool operator<=(const Q& b) const + { + return(v <= b.v); + } + + bool operator>=(const Q& b) const + { + return(v >= b.v); + } +}; + +template +struct Q { + constexpr static int fracBits = F; + constexpr static int mantissaBits = M; + constexpr static bool isSigned = true; + using value_type = typename fixed_storage_type::value_type; + using wider_type = typename fixed_storage_type::wider_type; + constexpr static value_type maxVal = 0x7FFF; + constexpr static value_type minVal = 0x8000; + constexpr static value_type sat(const wider_type i) { + return (i > (value_type)maxVal ? maxVal : (i<(value_type)minVal ? minVal : i)); + }; + constexpr static value_type convert(const float f) { + return(f >= 1.0f ? maxVal : (f <= -1.0f ? minVal : value_type(f * (float)((wider_type)1<&other):v{value_type(other.v)} {}; + + bool operator==(const Q& b) const + { + return(v == b.v); + } + + bool operator!=(const Q& b) const + { + return(v != b.v); + } + + bool operator<(const Q& b) const + { + return(v < b.v); + } + + bool operator>(const Q& b) const + { + return(v > b.v); + } + + bool operator<=(const Q& b) const + { + return(v <= b.v); + } + + bool operator>=(const Q& b) const + { + return(v >= b.v); + } + + Q & operator+=(const Q other) + { + #if !defined(ARM_MATH_DSP) + v = __SSAT((value_type)v + other.v,16); + #else + v = (value_type) __QADD16(v, other.v); + #endif + return(*this); + } + + Q & operator-=(const Q other) + { + #if !defined(ARM_MATH_DSP) + v = __SSAT((value_type)v - other.v,16); + #else + v = (value_type) __QSUB16(v, other.v); + #endif + return(*this); + } + + friend std::ostream& operator<< (std::ostream& stream, const Q& other) { + stream << double(1.0*other.v / (((wider_type)1)< +struct Q { + constexpr static int fracBits = F; + constexpr static int mantissaBits = M; + constexpr static bool isSigned = false; + using value_type = typename fixed_storage_type::value_type; + using wider_type = typename fixed_storage_type::wider_type; + constexpr static value_type maxVal = 0xFFFF; + constexpr static value_type sat(const wider_type i) { + return (i > (value_type)maxVal ? 
maxVal : i); + }; + constexpr static value_type convert(const float f) { + return(f >= 1.0f ? maxVal : (f <= 0.0f ? 0 : value_type(f * (float)((wider_type)1<(const Q& b) const + { + return(v > b.v); + } + + bool operator<=(const Q& b) const + { + return(v <= b.v); + } + + bool operator>=(const Q& b) const + { + return(v >= b.v); + } + + Q & operator+=(const Q other) + { + v = __USAT((value_type)v + other.v,16); + return(*this); + } + + + + friend std::ostream& operator<< (std::ostream& stream, const Q& other) { + stream << double(1.0*other.v / ((wider_type)1< +struct Q { + constexpr static int fracBits = F; + constexpr static int mantissaBits = M; + constexpr static bool isSigned = true; + using value_type = typename fixed_storage_type::value_type; + using wider_type = typename fixed_storage_type::wider_type; + constexpr static value_type maxVal = 0x7F; + constexpr static value_type minVal = 0x80; + constexpr static value_type sat(const wider_type i) { + return (i > (value_type)maxVal ? maxVal : (i<(value_type)minVal ? minVal : i)); + }; + constexpr static value_type convert(const float f) { + return(f >= 1.0f ? maxVal : (f <= -1.0f ? minVal : value_type(f * (float)((wider_type)1<&other):v{value_type(other.v)} {}; + + bool operator==(const Q& b) const + { + return(v == b.v); + } + + bool operator!=(const Q& b) const + { + return(v != b.v); + } + + bool operator<(const Q& b) const + { + return(v < b.v); + } + + bool operator>(const Q& b) const + { + return(v > b.v); + } + + bool operator<=(const Q& b) const + { + return(v <= b.v); + } + + bool operator>=(const Q& b) const + { + return(v >= b.v); + } + + Q & operator+=(const Q other) + { + #if !defined(ARM_MATH_DSP) + v = __SSAT((value_type)v + other.v,8); + #else + v = (value_type) __QADD8(v, other.v); + #endif + return(*this); + } + + Q & operator-=(const Q other) + { + #if !defined(ARM_MATH_DSP) + v = __SSAT((value_type)v + other.v,8); + #else + v = (value_type) __QSUB8(v, other.v); + #endif + return(*this); + } + + friend std::ostream& operator<< (std::ostream& stream, const Q& other) { + stream << double(1.0*other.v / ((wider_type)1< +struct Q { + constexpr static int fracBits = F; + constexpr static int mantissaBits = M; + constexpr static bool isSigned = false; + using value_type = typename fixed_storage_type::value_type; + using wider_type = typename fixed_storage_type::wider_type; + constexpr static value_type maxVal = 0xFF; + constexpr static value_type sat(const wider_type i) { + return (i > (value_type)maxVal ? maxVal : i); + }; + constexpr static value_type convert(const float f) { + return(f >= 1.0f ? maxVal : (f <= 0.0f ? 
0 : value_type(f * (float)((wider_type)1<(const Q& b) const + { + return(v > b.v); + } + + bool operator<=(const Q& b) const + { + return(v <= b.v); + } + + bool operator>=(const Q& b) const + { + return(v >= b.v); + } + + Q & operator+=(const Q other) + { + v = __USAT((value_type)v + other.v,8); + return(*this); + } + + friend std::ostream& operator<< (std::ostream& stream, const Q& other) { + stream << double(1.0*other.v / ((wider_type)1<; +using Q31 = Q<0,31>; +using Q15 = Q<0,15>; +using Q7 = Q<0,7>; + +constexpr Q63 operator ""_q63(long double x){return Q63(Q63::convert((float)x));} +constexpr Q31 operator ""_q31(long double x){return Q31(Q31::convert((float)x));} +constexpr Q15 operator ""_q15(long double x){return Q15(Q15::convert((float)x));} +constexpr Q7 operator ""_q7(long double x){return Q7(Q7::convert((float)x));} + + + +template +inline Q< MA+MB+1 , FA+FB,SA || SB> mult(const Q &a, + const Q &b) +{ + /* + + Why mantissa of result is MA + MB + 1. + If we take as example, Q7 * Q7 and we multiply + 0x80 * 0x80 (-128 * -128) we get 0x4000 and if we shift right by 7 + we get 0x080 (on 9 bits). If the additional mantissa bit was not + kept, we would get 0x80 (on 8 bits) which would mean a negative + number. + + Saturation of 0x080 (on 9 bits) will give 0x7F whereas + saturation of 0x80 (on 8 bits) would keep 0x80 and thus + the wrong sign. + + By using MA + MB + 1 we ensure that Q7 * Q7 is Q<1,14> + and not Q<0,14>. + + To convert Q<1,14> to Q<0,7> we need a toFract and a saturate. + + */ + using ResType = typename Q< MA+MB+1 , FA+FB,SA || SB>::value_type; + ResType res = ((ResType)a.v * (ResType)b.v); + return(Q(res)); +} + + +template +inline Q operator+(const Q &a,const Q &b) +{ + Q ret(a); + ret+=b; + return ret; +} + +template +inline Q operator-(const Q &a,const Q &b) +{ + Q ret(a); + ret-=b; + return ret; +} + +template +inline Q operator-(const Q &a) +{ + Q ret; + ret-=a; + return ret; +} + +// Unsaturating add +template +inline Q add(const Q &a,const Q &b) +{ + return Q(a.v + b.v); +} + +// Unsaturating sub +template +inline Q sub(const Q &a,const Q &b) +{ + return Q(a.v - b.v); +} + + +template +constexpr std::integral_constant i_{}; + +template +inline Q operator >>(const Q &a, std::integral_constant) noexcept { + return Q(a.v >> N); +} + +template +inline Q< M+N , F,S> operator <<(const Q &a, std::integral_constant) noexcept { + using ResType = typename Q::value_type; + return Q(ResType(a.v) << N); +} + +template +inline Q saturate(const Q &src, + typename std::enable_if<(MD < MS) && ((MD+F)<31)>::type* = nullptr) +{ + return(Q(__SSAT(src.v, MD+F+1))); +} + + +template +inline Q saturate(const Q &src,typename std::enable_if<(MD < MS) && ((MD+F)<31)>::type* = nullptr) +{ + return(Q(__USAT(src.v, MD+F+1))); +} + + +template +struct FixedCastShift {}; + +/* Positive shift */ +template +struct FixedCastShiftFS)> { + constexpr static Q shift(const Q &src) + { + using DstType = typename Q::value_type; + return(Q(DstType(src.v) << (FD-FS))); + } +}; + +template +struct FixedCastShift { + constexpr static Q shift(const Q &src) + { + using DstType = typename Q::value_type; + using SrcType = typename Q::value_type; + + return(Q(DstType(SrcType(src.v) >> (FS-FD)))); + } +}; + +template +inline Q toFrac(const Q &src) +{ + return(FixedCastShift::shift(src)); +} + + +template +struct Accumulate; + +template +struct Accumulate { + static Q acc (const Q &a,const Q &b) + { + using DstType = typename Q::value_type; + return(Q(DstType(a.v) + DstType(b.v))); + } +}; + +template +inline Q 
accumulate(const Q &a,const Q &b) +{ + return(AccumulateMS)>::acc(a,b)); +} + + +template +inline Q _abs(const Q a) +{ + using DestType = typename Q::value_type; + return(Q(DestType(abs(a.v)))); +} + +inline Q7 operator*(const Q7 &a,const Q7 &b) +{ + return(saturate(toFrac<7>(mult(a,b)))); +} + +inline Q15 operator*(const Q15 &a,const Q15 &b) +{ + return (saturate(toFrac<15>(mult(a,b)))); +} + +inline Q31 operator*(const Q31 &a,const Q31 &b) +{ + return (toFrac<31>(saturate(toFrac<30>(mult(a,b))))); +} + +template +inline bool operator>(const Q &a,const Q &b) +{ + return(a.v>b.v); +} + +template +inline bool operator<(const Q &a,const Q &b) +{ + return(a.v +inline bool operator>=(const Q &a,const Q &b) +{ + return(a.v>=b.v); +} + + +template +inline bool operator<=(const Q &a,const Q &b) +{ + return(a.v<=b.v); +} + +template +inline bool operator==(const Q a,const Q b) +{ + return(a.v==b.v); +} + +template +inline bool operator!=(const Q a,const Q b) +{ + return(a.v!=b.v); +} + + +template +inline Q operator/(const Q a,const int32_t b) +{ + return(Q(a.v / b)); +} + + +template +inline Q operator+(const Q &a) +{ + return(a); +} + +/*! @} */ + +} \ No newline at end of file diff --git a/dsppp/Include/dsppp/forward.hpp b/dsppp/Include/dsppp/forward.hpp new file mode 100644 index 000000000..012953dcc --- /dev/null +++ b/dsppp/Include/dsppp/forward.hpp @@ -0,0 +1,149 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +namespace arm_cmsis_dsp { + +template +struct Vector_Base; + +template +struct VectorView; + +template typename Allocator> +struct Vector; + +template typename Allocator> +struct Matrix; + +template +struct MatrixView; + +template +struct NbRows; + +template +struct NbCols; + +template +struct Complexity; + +template +struct OutputVectorDim; + +template +struct CompatibleStaticMatMatProduct; + +template +struct CompatibleStaticMatVecProduct; + +template +struct CompatibleDynamicMatVecProduct; + +template +struct CompatibleDynamicMatMatProductStaticStride; + +template +struct CompatibleDynamicMatMatProductDynamicStride; + +template +struct CompatibleDynamicMatMatProduct; + +template +struct OutputVector; + +template +struct OutputMatrix; + + + +/* + +Identifications + +*/ + +/* + +Is a contiguous array in memory with scalar indexing +(operator[]) +It can be an _Expr +Vector has a length + +Generally used whe scalar indexing is required or length + +*/ +template +struct IsVector; + +/* + +Has matrix indexing (operator()) +and matrix operations like transpose, identity. +So it cannot be an _Expr because _Expr has no transpose, identity +Has rows, columns +Matrix may be vectors Vectors (with above definition) + +Generally used when transpose or identity are required. + +*/ +template +struct IsMatrix; + +/* + +Has matrix indexing (operator()) +but no matrix operator like transpose. 
+It can be an Expr +Has rows, columns +It may not always be a Vector (MatrixView are not contiguous) + +Generally used only when matrix indexing is mandatory + +*/ +template +struct HasMatrixIndexing; + +/* + + +Type Matrix : IsVector, IsMatrix, HasMatrixIndexing +Type MatrixView : , IsMatrix, HasMatrixIndexing +Type _Expr with Matrix : IsVector, , HasMatrixIndexing +Type _Expr with some MatrixView : HasMatrixIndexing + +*/ + + +/* + +Dimensions only known at runtime + +*/ +template +struct IsDynamic; + +/* + +StaticLength if known at build time otherwise 0 +*/ +template +struct StaticLength; + +/* + +Type of elements in vector or matrix + +*/ +template +struct ElementType; + +template +struct HasStaticStride; + +template +struct StaticStride; + +} diff --git a/dsppp/Include/dsppp/fusion.hpp b/dsppp/Include/dsppp/fusion.hpp new file mode 100644 index 000000000..db8864c4a --- /dev/null +++ b/dsppp/Include/dsppp/fusion.hpp @@ -0,0 +1,760 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +namespace arm_cmsis_dsp { + +/** \addtogroup FUSION Abstract syntax tree for fusion + * \ingroup DSPPP + * @{ + */ + +template struct traits +{ + typedef T Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits::vector Vector; +#endif +}; + +template +struct Complexity +{ + constexpr static int value = 0; +}; + +/* + +An unregular datatype has different strides like MatrixView +and can only be assigned to a MatrixView using a 2D functions. +So all normal eval functions will reject unregular structures. + +*/ +template +struct HasMatrixIndexing +{ + constexpr static bool value = false; +}; + +template +struct HasStaticStride +{ + constexpr static bool value = false; +}; + + + +template +struct IsVector +{ + constexpr static bool value = false; +}; + +template +struct IsMatrix +{ + constexpr static bool value = false; +}; + + + +template +struct StaticLength +{ + constexpr static vector_length_t value = 0; +}; + +template +struct ElementType +{ + typedef T type; +}; + + +template +using SameElementType=std::is_same::type,typename ElementType::type>; + +template +constexpr bool has_vector_inst() {return (vector_traits::type>::has_vector);} + +template +constexpr bool has_predicate_inst() {return (vector_traits::type>::has_predicate);} + +template +constexpr bool is_scalar() {return (!IsVector::value && + !HasMatrixIndexing::value);} + +template +constexpr bool must_use_matrix_idx() {return (!IsVector::value && + HasMatrixIndexing::value);} +template +constexpr bool vector_idx_pair() {return (IsVector::value && + IsVector::value && + SameElementType::value);} + +// By default scalar has no vector size so can't be used +// to infer a size at build time. They are considered as dynamic +// Otherwise, by default vectors are considered static +// except is there is a specialization of this template +// (and that is the case for dynamic vectors) +template +struct IsDynamic +{ + constexpr static bool value = is_scalar(); +}; + +/* + +Vector only not including matrixes (which are also vectors) + +*/ +template +constexpr bool is_only_vector() {return (IsVector::value && + !HasMatrixIndexing::value);} + +template +constexpr bool must_use_matrix_idx_pair() {return ((must_use_matrix_idx() || must_use_matrix_idx()) && + SameElementType::value);} + + +/* + +Static length is 0 for scalar and Dynamic vectors +*/ +template +constexpr vector_length_t static_length() { + return ((StaticLength::value==0) ? 
StaticLength::value : StaticLength::value); +} + +/* + +False only when DA and DB are static vector and with differet size +Anyother case is ok. + +*/ + +template +constexpr bool same_static_length() +{ + return((StaticLength::value == 0) || /* Scalar or dynamic case */ + (StaticLength::value == 0) || /* Scalar or dynamic case */ + (StaticLength::value == StaticLength::value)); +} +/* + +Vector operators at instruction level + +*/ +#include "fusion_ops.hpp" + + +template +struct _Expr { + + using Scalar = typename traits::Scalar; +#if defined(HAS_VECTOR) + using Vector = typename traits::Vector; +#endif + + T& derived() {return(static_cast(*this));} + + T const& derived() const {return(static_cast(*this));} + + Scalar const operator[](const index_t i) const {return(this->derived()[i]);} + + Scalar const operator()(const index_t r,const index_t c) const {return(this->derived()(r,c));} + +#if defined(HAS_VECTOR) + Vector const vector_op(const index_t i) const {return(this->derived().vector_op(i));} + + Vector const vector_op_tail(const index_t i,const vector_length_t remaining) const {return(this->derived().vector_op_tail(i,remaining));} + + Vector const matrix_op(const index_t r,const index_t c) const {return(this->derived().matrix_op(r,c));} + + Vector const matrix_op_tail(const index_t r,const index_t c,const vector_length_t remaining) const {return(this->derived().matrix_op_tail(r,c,remaining));} +#endif + + vector_length_t length() const {return(this->derived().length());} + vector_length_t rows() const {return(this->derived().rows());} + vector_length_t columns() const {return(this->derived().columns());} + + virtual ~_Expr(){}; + +protected: + _Expr() = default; + _Expr(const _Expr&) = default; + _Expr(_Expr&&) = default; + _Expr& operator=(const _Expr& other) = delete; + _Expr& operator=(_Expr&& other) = delete; +}; + +/***************** + * + * BINARY AST + */ + +template +struct _Binary: _Expr<_Binary> +{ + using Scalar = typename traits::Scalar; +#if defined(HAS_VECTOR) + using Vector = typename traits::Vector; +#endif + _Binary(const LHS &lhs, + const RHS &rhs, + const _BinaryOperator &op): + lhs_(lhs),rhs_(rhs),op_(op){ + } + + + _Binary(const _Binary &other): + lhs_(other.lhs_),rhs_(other.rhs_),op_(other.op_){ + } + + _Binary& operator=(const _Binary& other) = delete; + _Binary& operator=(_Binary&& other) = delete; + + _Binary(_Binary &&other): + lhs_(std::move(other.lhs_)),rhs_(std::move(other.rhs_)),op_(std::move(other.op_)) + { + } + + template::value,bool>::type = true> + vector_length_t length() const { + return(lhs_.length()); + } + + template::value && IsVector::value,bool>::type = true> + vector_length_t length() const { + return(rhs_.length()); + } + + template::value,bool>::type = true> + vector_length_t rows() const { + return(lhs_.rows()); + } + + template::value && HasMatrixIndexing::value,bool>::type = true> + vector_length_t rows() const { + return(rhs_.rows()); + } + + template::value,bool>::type = true> + vector_length_t columns() const { + return(lhs_.columns()); + } + + template::value && HasMatrixIndexing::value,bool>::type = true> + vector_length_t columns() const { + return(rhs_.columns()); + } + + + + template::value && + IsVector::value,bool>::type = true> + Scalar const operator[](const index_t i) const { + return(op_(lhs_[i],rhs_[i])); + } + + template::value && + is_scalar(),bool>::type = true> + Scalar const operator[](const index_t i) const { + return(op_(lhs_[i],rhs_)); + } + + template() && + IsVector::value,bool>::type = true> + Scalar const 
operator[](const index_t i) const { + return(op_(lhs_,rhs_[i])); + } + + template::value && + HasMatrixIndexing::value,bool>::type = true> + Scalar const operator()(const index_t r,const index_t c) const + { + return(op_(lhs_(r,c),rhs_(r,c))); + } + + template() && + HasMatrixIndexing::value,bool>::type = true> + Scalar const operator()(const index_t r,const index_t c) const + { + return(op_(lhs_,rhs_(r,c))); + } + + template::value && + is_scalar(),bool>::type = true> + Scalar const operator()(const index_t r,const index_t c) const + { + return(op_(lhs_(r,c),rhs_)); + } + +#if defined(HAS_VECTOR) + /* V + V */ + template::value && + IsVector::value,bool>::type = true> + Vector const vector_op(const index_t i) const + { + return(op_(lhs_.vector_op(i),rhs_.vector_op(i))); + } + + template() && + IsVector::value && + IsVector::value,bool>::type = true> + Vector const vector_op_tail(const index_t i,const vector_length_t remaining) const + { + return(op_(lhs_.vector_op_tail(i,remaining),rhs_.vector_op_tail(i,remaining),inner::vctpq::mk(remaining))); + } + + /* V + S */ + template::value && + is_scalar(),bool>::type = true> + Vector const vector_op(const index_t i) const + { + return(op_(lhs_.vector_op(i),rhs_)); + } + + template() && + IsVector::value && + is_scalar(),bool>::type = true> + Vector const vector_op_tail(const index_t i,const vector_length_t remaining) const + { + return(op_(lhs_.vector_op_tail(i,remaining),rhs_,inner::vctpq::mk(remaining))); + } + + + + /* S + V */ + template() && + IsVector::value,bool>::type = true> + Vector const vector_op(const index_t i) const + { + return(op_(lhs_,rhs_.vector_op(i))); + } + + template() && + is_scalar() && + IsVector::value,bool>::type = true> + Vector const vector_op_tail(const index_t i,const vector_length_t remaining) const + { + return(op_(lhs_,rhs_.vector_op_tail(i,remaining),inner::vctpq::mk(remaining))); + } + + + /************* + * + * For matrix + * + */ + + /* V + V */ + template::value && + HasMatrixIndexing::value,bool>::type = true> + Vector const matrix_op(const index_t r,const index_t c) const + { + return(op_(lhs_.matrix_op(r,c),rhs_.matrix_op(r,c))); + } + + template() && + HasMatrixIndexing::value && + HasMatrixIndexing::value,bool>::type = true> + Vector const matrix_op_tail(const index_t r,const index_t c,const vector_length_t remaining) const + { + return(op_(lhs_.matrix_op_tail(r,c,remaining),rhs_.matrix_op_tail(r,c,remaining),inner::vctpq::mk(remaining))); + } + + /* V + S */ + template::value && + is_scalar(),bool>::type = true> + Vector const matrix_op(const index_t r,const index_t c) const + { + return(op_(lhs_.matrix_op(r,c),rhs_)); + } + + template() && + HasMatrixIndexing::value && + is_scalar(),bool>::type = true> + Vector const matrix_op_tail(const index_t r,const index_t c,const vector_length_t remaining) const + { + return(op_(lhs_.matrix_op_tail(r,c,remaining),rhs_,inner::vctpq::mk(remaining))); + } + + + + /* S + V */ + template() && + HasMatrixIndexing::value,bool>::type = true> + Vector const matrix_op(const index_t r,const index_t c) const + { + return(op_(lhs_,rhs_.matrix_op(r,c))); + } + + template() && + is_scalar() && + HasMatrixIndexing::value,bool>::type = true> + Vector const matrix_op_tail(const index_t r,const index_t c,const vector_length_t remaining) const + { + return(op_(lhs_,rhs_.matrix_op_tail(r,c,remaining),inner::vctpq::mk(remaining))); + } + + +#endif + const LHS lhs_; + const RHS rhs_; + const _BinaryOperator op_; +}; + +template +struct Complexity<_Expr> +{ + constexpr static int 
value = Complexity::value; +}; + +template +struct ElementType<_Expr> +{ + typedef typename ElementType::type type; +}; + +template +struct Complexity<_Binary> +{ + constexpr static int lhsv = Complexity::value; + constexpr static int rhsv = Complexity::value; + constexpr static int value = lhsv + rhsv + 1; +}; + +template +struct ElementType<_Binary> +{ + typedef typename ElementType::type type; +}; + + +template +struct IsVector<_Expr> +{ + constexpr static bool value = IsVector::value; +}; + +template +struct HasMatrixIndexing<_Expr> +{ + constexpr static bool value = HasMatrixIndexing::value; +}; + +template +struct IsVector<_Binary> +{ + constexpr static bool value = + (IsVector::value && IsVector::value) || + (IsVector::value && is_scalar()) || + (is_scalar() && IsVector::value); +}; + +template +struct HasMatrixIndexing<_Binary> +{ + constexpr static bool value = + (HasMatrixIndexing::value && HasMatrixIndexing::value) || + (HasMatrixIndexing::value && is_scalar()) || + (is_scalar() && HasMatrixIndexing::value); +}; + +template +struct IsDynamic<_Expr> +{ + constexpr static bool value = IsDynamic::value; +}; + +template +struct IsDynamic<_Binary> +{ + constexpr static bool value = IsDynamic::value && IsDynamic::value; +}; + +template +struct StaticLength<_Expr> +{ + constexpr static vector_length_t value = StaticLength::value; +}; + +template +struct StaticLength<_Binary> +{ + constexpr static vector_length_t value = static_length(); + +}; + +template +struct traits<_Expr> +{ + typedef typename traits::Scalar Scalar; +#if defined(HAS_VECTOR) + typedef typename traits::Vector Vector; +#endif +}; + +template +struct traits<_Binary> +{ + typedef typename traits::Scalar Scalar; +#if defined(HAS_VECTOR) + typedef typename traits::Vector Vector; +#endif +}; + + +/***************** + * + * UNARY AST + */ + +template +struct _Unary: _Expr<_Unary> +{ + using Scalar = typename traits::Scalar; +#if defined(HAS_VECTOR) + using Vector = typename traits::Vector; +#endif + _Unary(const LHS &lhs, + const _UnaryOperator &op): + lhs_(lhs),op_(op){ + } + + _Unary(const _Unary &other): + lhs_(other.lhs_),op_(other.op_){ + } + + _Unary(LHS &&other): + lhs_(std::move(other.lhs_)),op_(std::move(other.op_)){ + } + + _Unary& operator=(const _Unary& other) = delete; + _Unary& operator=(_Unary&& other) = delete; + + + vector_length_t length() const { + return(lhs_.length()); + } + + template::value,bool>::type = true> + vector_length_t rows() const { + return(lhs_.rows()); + } + + template::value,bool>::type = true> + vector_length_t columns() const { + return(lhs_.columns()); + } + + template::value ,bool>::type = true> + Scalar const operator[](const index_t i) const { + return(op_(lhs_[i])); + } + + template::value ,bool>::type = true> + Scalar const operator()(const index_t r,const index_t c) const + { + return(op_(lhs_(r,c))); + } + + +#if defined(HAS_VECTOR) + /* V */ + template::value ,bool>::type = true> + Vector const vector_op(const index_t i) const + { + return(op_(lhs_.vector_op(i))); + } + + template() && + IsVector::value ,bool>::type = true> + Vector const vector_op_tail(const index_t i,const vector_length_t remaining) const + { + return(op_(lhs_.vector_op_tail(i,remaining),inner::vctpq::mk(remaining))); + } + + /* + + For Matrix + + */ + + /* V */ + template::value ,bool>::type = true> + Vector const matrix_op(const index_t r,const index_t c) const + { + return(op_(lhs_.matrix_op(r,c))); + } + + template() && + HasMatrixIndexing::value ,bool>::type = true> + Vector const matrix_op_tail(const 
index_t r,const index_t c,const vector_length_t remaining) const + { + return(op_(lhs_.matrix_op_tail(r,c,remaining),inner::vctpq::mk(remaining))); + } + + +#endif + const LHS lhs_; + const _UnaryOperator op_; +}; + +template +struct Complexity<_Unary> +{ + constexpr static int value = 1 + Complexity::value; +}; + +template +struct ElementType<_Unary> +{ + typedef typename ElementType::type type; +}; + +template +struct IsVector<_Unary> +{ + constexpr static bool value = IsVector::value; +}; + +template +struct HasMatrixIndexing<_Unary> +{ + constexpr static bool value = HasMatrixIndexing::value; +}; + +template +struct IsDynamic<_Unary> +{ + constexpr static bool value = IsDynamic::value; +}; + +template +struct StaticLength<_Unary> +{ + constexpr static vector_length_t value = StaticLength::value; +}; + + +template +struct traits<_Unary> +{ + typedef typename traits::Scalar Scalar; +#if defined(HAS_VECTOR) + typedef typename traits::Vector Vector; +#endif +}; + + + + +/* + +Dot product + +*/ + +template +using DotResult = typename number_traits::Scalar>::accumulator; + + + + +template() && + is_only_vector() && + is_only_vector() && + (!IsDynamic::value || !IsDynamic::value),bool>::type = true> +inline DotResult dot(const VA& a, + const VB& b) +{ + constexpr vector_length_t l = static_length(); + return(_dot(a,b,l,CURRENT_ARCH)); +} + +template() && + is_only_vector() && + is_only_vector() && + (IsDynamic::value && IsDynamic::value),bool>::type = true> +inline DotResult dot(const VA& a, + const VB& b) +{ + const vector_length_t l = a.length(); + + return(_dot(a,b,l,CURRENT_ARCH)); +} + + + + + +template() && + (!IsDynamic::value || !IsDynamic::value),bool>::type = true> +inline void swap(VA&& a, + VB&& b) +{ + constexpr vector_length_t l = static_length(); + + _swap(std::forward(a),std::forward(b),l,CURRENT_ARCH); +} + +template() && + (IsDynamic::value && IsDynamic::value),bool>::type = true> +inline void swap(VA&& a, + VB&& b) +{ + const vector_length_t l = a.length(); + + _swap(std::forward(a),std::forward(b),l,CURRENT_ARCH); +} + +/*! 
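+   Example (illustrative sketch only; Vector, float32_t and the element-wise
+   operator overloads are provided by other headers in this patch):
+
+       Vector<float32_t,8> a,b,c;
+       c = a + b * 0.5f;      // builds a _Binary expression tree; the whole
+                              // right-hand side is evaluated in one fused loop
+                              // when it is assigned to c
+       auto s = dot(a, b);    // reduction using the DotResult accumulator type
+       swap(a, b);            // element-wise swap of the two vectors
+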
@} */ + +} + diff --git a/dsppp/Include/dsppp/fusion_ops.hpp b/dsppp/Include/dsppp/fusion_ops.hpp new file mode 100644 index 000000000..a1a83d932 --- /dev/null +++ b/dsppp/Include/dsppp/fusion_ops.hpp @@ -0,0 +1,358 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup FUSION + * @{ + */ + + +template +struct _UnaryOperator{ + Derived& derived() {return(static_cast(*this));} + + Derived const& derived() const {return(static_cast(*this));} + + Scalar const operator()(const Scalar lhs) const + { + return(this->derived()(lhs)); + } + + #if defined(HAS_VECTOR) + using Vector= typename vector_traits::vector ; + using pred_t = typename vector_traits::predicate_t; + + Vector const operator()(const Vector lhs) const + { + return(this->derived()(lhs)); + } + + /* + + Predicated operation when exists (Helium) + + */ + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs,const pred_t p0) const + { + return(this->derived()(lhs,p0)); + } + + /* + Vector const to_vector(const Scalar lhs) const + { + return(this->derived().to_vector(lhs)); + } + */ +#endif +}; + +template +struct _BinaryOperator{ + Derived& derived() {return(static_cast(*this));} + + Derived const& derived() const {return(static_cast(*this));} + + Scalar const operator()(const Scalar lhs, + const Scalar rhs) const + { + return(this->derived()(lhs,rhs)); + } + + #if defined(HAS_VECTOR) + using Vector= typename vector_traits::vector ; + using pred_t = typename vector_traits::predicate_t; + + + Vector const operator()(const Vector lhs, + const Vector rhs) const + { + return(this->derived()(lhs,rhs)); + } + + Vector const operator()(const Vector lhs, + const Scalar rhs) const + { + return(this->derived()(lhs,rhs)); + } + + Vector const operator()(const Scalar lhs, + const Vector rhs) const + { + return(this->derived()(lhs,rhs)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const Vector rhs, + const pred_t p0) const + { + return(this->derived()(lhs,rhs,p0)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const Scalar rhs, + const pred_t p0) const + { + return(this->derived()(lhs,rhs,p0)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Scalar lhs, + const Vector rhs, + const pred_t p0) const + { + return(this->derived()(lhs,rhs,p0)); + } +#endif +}; + +/***************** + * + * BINARY + * + */ +template +struct _AddOp:_BinaryOperator> +{ + Scalar const operator()(const Scalar lhs, + const Scalar rhs) const { + return(lhs + rhs); + } + +#if defined(HAS_VECTOR) + using Vector=typename vector_traits::vector ; + using pred_t = typename vector_traits::predicate_t; + + Vector const operator()(const Vector lhs, + const Vector rhs) const + { + return(inner::vadd(lhs,rhs)); + } + + Vector const operator()(const Vector lhs, + const Scalar rhs) const + { + return(inner::vadd(lhs,rhs)); + } + + Vector const operator()(const Scalar lhs, + const Vector rhs) const + { + return(inner::vadd(lhs,rhs)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const Vector rhs, + const pred_t p0) const + { + return(inner::vadd(lhs,rhs,p0)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const Scalar rhs, + const pred_t p0) const + { + return(inner::vadd(lhs,rhs,p0)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Scalar lhs, + const Vector rhs, + const 
pred_t p0) const + { + return(inner::vadd(lhs,rhs,p0)); + } +#endif +}; + +template +struct _SubOp:_BinaryOperator> +{ + Scalar const operator()(const Scalar lhs, + const Scalar rhs) const { + return(lhs - rhs); + } + +#if defined(HAS_VECTOR) + using Vector=typename vector_traits::vector ; + using pred_t = typename vector_traits::predicate_t; + + Vector const operator()(const Vector lhs, + const Vector rhs) const + { + return(inner::vsub(lhs,rhs)); + } + + Vector const operator()(const Vector lhs, + const Scalar rhs) const + { + return(inner::vsub(lhs,rhs)); + } + + Vector const operator()(const Scalar lhs, + const Vector rhs) const + { + return(inner::vsub(lhs,rhs)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const Vector rhs, + const pred_t p0) const + { + return(inner::vsub(lhs,rhs,p0)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const Scalar rhs, + const pred_t p0) const + { + return(inner::vsub(lhs,rhs,p0)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Scalar lhs, + const Vector rhs, + const pred_t p0) const + { + return(inner::vsub(lhs,rhs,p0)); + } +#endif +}; + +template +struct _MulOp:_BinaryOperator> +{ + Scalar const operator()(const Scalar lhs, + const Scalar rhs) const { + return(lhs * rhs); + } + +#if defined(HAS_VECTOR) + using Vector= typename vector_traits::vector ; + using pred_t = typename vector_traits::predicate_t; + + Vector const operator()(const Vector lhs, + const Vector rhs) const + { + return(inner::vmul(lhs,rhs)); + } + + Vector const operator()(const Vector lhs, + const Scalar rhs) const + { + return(inner::vmul(lhs,rhs)); + } + + Vector const operator()(const Scalar lhs, + const Vector rhs) const + { + return(inner::vmul(lhs,rhs)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const Vector rhs, + const pred_t p0) const + { + return(inner::vmul(lhs,rhs,p0)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const Scalar rhs, + const pred_t p0) const + { + return(inner::vmul(lhs,rhs,p0)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Scalar lhs, + const Vector rhs, + const pred_t p0) const + { + return(inner::vmul(lhs,rhs,p0)); + } +#endif +}; + +/***************** + * + * UNARY + * + */ +template +struct _NegOp:_UnaryOperator> +{ + Scalar const operator()(const Scalar lhs) const { + return(-lhs); + } + +#if defined(HAS_VECTOR) + using Vector= typename vector_traits::vector ; + using pred_t = typename vector_traits::predicate_t; + + Vector const operator()(const Vector lhs) const + { + return(inner::vneg(lhs)); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const pred_t p0) const + { + return(inner::vneg(lhs,p0)); + } + + +#endif +}; + +template +struct _NoOp:_UnaryOperator> +{ + Scalar const operator()(const Scalar lhs) const { + return(lhs); + } + +#if defined(HAS_VECTOR) + using Vector= typename vector_traits::vector ; + using pred_t = typename vector_traits::predicate_t; + + Vector const operator()(const Vector lhs) const + { + return(lhs); + } + + template::has_predicate,bool>::type = true> + Vector const operator()(const Vector lhs, + const pred_t p0) const + { + (void)p0; + return(lhs); + } + +#endif +}; + +/*! 
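+
+   These functors are what the _Unary and _Binary expression nodes of fusion.hpp
+   apply element by element (and lane by lane through inner::vadd, inner::vsub,
+   inner::vmul and inner::vneg when HAS_VECTOR is defined). A scalar-only sketch of
+   a new operator following the same CRTP pattern as _NegOp (the name _AbsOp and
+   this exact base-class signature are assumptions):
+
+     template<typename Scalar>
+     struct _AbsOp : _UnaryOperator<Scalar,_AbsOp<Scalar>>
+     {
+        Scalar const operator()(const Scalar lhs) const {
+           return(lhs < Scalar{} ? -lhs : lhs);
+        }
+        // vector and predicated overloads (built on an inner:: helper) would be
+        // needed to benefit from HAS_VECTOR builds
+     };
+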
@} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/matrix.hpp b/dsppp/Include/dsppp/matrix.hpp new file mode 100644 index 000000000..7836d6bec --- /dev/null +++ b/dsppp/Include/dsppp/matrix.hpp @@ -0,0 +1,647 @@ +// -*- C++ -*- +/** @file */ +#pragma once + + +#include +#include +#include +#include +#include "common.hpp" +#include "arch.hpp" +#include +#include "number.hpp" +#include "forward.hpp" +#include "fusion.hpp" +#include "unroll.hpp" +#include "algorithms.hpp" +#include "vec.hpp" +#include "matrix_impl.hpp" +#include "matrix_view.hpp" + +namespace arm_cmsis_dsp { + +/** \addtogroup Matrix Matrixes + * \ingroup DSPPP + * @{ + */ + +template typename A> +struct traits> +{ + typedef P Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits
<P>
::vector Vector; +#endif +}; + +template<typename P,int S> +struct traits<MatrixView<P,S>> +{ + typedef P Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits
<P>
::vector Vector; +#endif +}; + +template<typename P,int R,int C,template<int> typename A> +struct traits<Matrix<P,R,C,A>&> +{ + typedef P Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits
<P>
::vector Vector; +#endif +}; + +template<typename P,int S> +struct traits<MatrixView<P,S>&> +{ + typedef P Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits
<P>
::vector Vector; +#endif +}; + + +template typename Allocator> +struct IsVector> +{ + constexpr static bool value = true; +}; + + + +template typename Allocator> +struct HasStaticStride> +{ + constexpr static bool value = (C>0); +}; + +template typename Allocator> +struct StaticStride> +{ + constexpr static index_t value = C; +}; + +template typename Allocator> +struct IsMatrix> +{ + constexpr static bool value = true; +}; + +template typename Allocator> +struct HasMatrixIndexing> +{ + constexpr static bool value = true; +}; + +template +struct IsMatrix> +{ + constexpr static bool value = true; +}; + +template +struct HasStaticStride> +{ + constexpr static bool value = (S>0); +}; + +template +struct StaticStride> +{ + constexpr static index_t value = S; +}; + +template +struct HasMatrixIndexing> +{ + constexpr static bool value = true; +}; + +template typename Allocator> +struct IsVector&> +{ + constexpr static bool value = true; +}; + + +template +struct IsVector> +{ + constexpr static bool value = true; +}; + +template +struct IsVector&> +{ + constexpr static bool value = true; +}; + +template typename Allocator> +struct HasStaticStride&> +{ + constexpr static bool value = (C>0); +}; + +template typename Allocator> +struct StaticStride&> +{ + constexpr static index_t value = C; +}; + +template typename Allocator> +struct HasMatrixIndexing&> +{ + constexpr static bool value = true; +}; + + +template +struct IsMatrix&> +{ + constexpr static bool value = true; +}; + +template +struct HasMatrixIndexing&> +{ + constexpr static bool value = true; +}; + +template +struct HasStaticStride&> +{ + constexpr static bool value = (S>0); +}; + +template +struct StaticStride&> +{ + constexpr static index_t value = S; +}; + +template typename Allocator> +struct ElementType> +{ + typedef P type; +}; + + +template +struct ElementType> +{ + typedef P type; +}; + +template typename Allocator> +struct ElementType&> +{ + typedef P type; +}; + +template +struct ElementType&> +{ + typedef P type; +}; + +template typename Allocator> +struct StaticLength> +{ + constexpr static vector_length_t value = (R*C<0) ? 0 : R*C; +}; + +template +struct StaticLength> +{ + constexpr static vector_length_t value = 0; +}; + +template typename Allocator> +struct StaticLength&> +{ + constexpr static vector_length_t value = (R*C<0) ? 
0 : R*C; +}; + +template +struct StaticLength&> +{ + constexpr static vector_length_t value = 0 ; +}; + +template typename Allocator> +struct IsDynamic> +{ + constexpr static bool value = (R<0) || (C<0); +}; + +template +struct IsDynamic> +{ + constexpr static bool value = true; +}; + +template typename Allocator> +struct IsDynamic&> +{ + constexpr static bool value = (R<0) || (C<0); +}; + +template +struct IsDynamic&> +{ + constexpr static bool value = true; +}; + +/* + + +Compatibility of vector and matrix dimensions at build time + +*/ + +template +struct NbRows +{ + constexpr static vector_length_t value = DYNAMIC; +}; + +template typename Allocator> +struct NbRows> +{ + constexpr static vector_length_t value = R; +}; + +template typename Allocator> +struct NbRows&> +{ + constexpr static vector_length_t value = R; +}; + +template +struct NbCols +{ + constexpr static vector_length_t value = DYNAMIC; +}; + +template typename Allocator> +struct NbCols> +{ + constexpr static vector_length_t value = C; +}; + +template typename Allocator> +struct NbCols&> +{ + constexpr static vector_length_t value = C; +}; + + +template +struct CompatibleStaticMatVecProduct +{ + constexpr static bool value = + is_only_vector() && + HasMatrixIndexing::value && + (NbCols::value == StaticLength::value) && + !IsDynamic::value + && SameElementType::value; + +}; + +/* MB IsMatrix because we need transpose operator */ +template +struct CompatibleStaticMatMatProduct +{ + constexpr static bool value = + HasMatrixIndexing::value && + IsMatrix::value && + (NbCols::value == NbRows::value) && + !IsDynamic::value && + SameElementType::value; + +}; + +template +struct CompatibleDynamicMatVecProduct +{ + constexpr static bool value = + HasMatrixIndexing::value && + IsDynamic::value && + is_only_vector() && + SameElementType::value; + +}; + +/* MB IsMatrix because we need transpose operator */ +template +struct CompatibleDynamicMatMatProductStaticStride +{ + constexpr static bool value = + HasMatrixIndexing::value && + IsMatrix::value && + IsDynamic::value && + HasStaticStride::value && + SameElementType::value; +}; + +template +struct CompatibleDynamicMatMatProductDynamicStride +{ + constexpr static bool value = + HasMatrixIndexing::value && + IsMatrix::value && + IsDynamic::value && + !HasStaticStride::value && + SameElementType::value; +}; + +template +struct CompatibleDynamicMatMatProduct +{ + constexpr static bool value = + HasMatrixIndexing::value && + IsMatrix::value && + IsDynamic::value && + SameElementType::value; +}; + +template +struct OutputVector { + typedef Vector::Scalar, + OutputVectorDim::value,TMP_ALLOC> type; +}; + +template +struct OutputMatrix { + constexpr static bool dynamic = (NbRows::value < 0) || (NbCols::value < 0); + constexpr static vector_length_t nbrows = dynamic ? DYNAMIC : NbRows::value; + constexpr static vector_length_t nbcols = dynamic ? 
DYNAMIC : NbCols::value; + + typedef Matrix::Scalar,nbrows,nbcols,TMP_ALLOC> type; +}; + + + +template +struct OutputVectorDim +{ + constexpr static vector_length_t value = DYNAMIC; +}; + +template typename Allocator> +struct OutputVectorDim> +{ + constexpr static vector_length_t value = R; +}; + +template typename Allocator> +struct OutputVectorDim&> +{ + constexpr static vector_length_t value = R; +}; + + +template +struct VecRef> +{ + typedef MatrixView type; + static type ref(const MatrixView&a){ + return(a); + }; +}; + +template typename A> +struct VecRef,((R>0) && (C>0))> +{ + typedef const Matrix& type; + static type ref(const Matrix&a,typename std::enable_if<(R>0) && (C>0)>::type* = nullptr){ + return(a); + }; +}; + +template typename A> +struct VecRef,((R<0) || (C<0))> +{ + typedef MatrixView type; + static type ref(const Matrix&a,typename std::enable_if<(R<0) || (C<0)>::type* = nullptr){ + return(type(a,a.rows(),a.columns())); + }; +}; + + +/***************** + * + * + * Fused matrix operators + * + ****************/ + +template +struct _Outer: _Expr<_Outer> +{ + using Scalar = typename traits::Scalar; +#if defined(HAS_VECTOR) + using Vector = typename traits::Vector; +#endif + _Outer(const LHS &lhs, + const RHS &rhs, + const _BinaryOperator &op): + lhs_(lhs),rhs_(rhs),op_(op){ + } + + + _Outer(const _Outer &other): + lhs_(other.lhs_),rhs_(other.rhs_),op_(other.op_){ + } + + _Outer& operator=(const _Outer& other) = delete; + _Outer& operator=(_Outer&& other) = delete; + + _Outer(_Outer &&other): + lhs_(std::move(other.lhs_)),rhs_(std::move(other.rhs_)),op_(std::move(other.op_)) + { + } + + + + template::value && IsVector::value,bool>::type = true> + vector_length_t length() const { + return(lhs_.length() * rhs_.length()); + } + + template::value,bool>::type = true> + vector_length_t rows() const { + return(lhs_.length()); + } + + + + template::value,bool>::type = true> + vector_length_t columns() const { + return(rhs_.length()); + } + + + + template::value && + IsVector::value,bool>::type = true> + Scalar const operator()(const index_t r,const index_t c) const + { + return(op_(lhs_[r],rhs_[c])); + } + + +#if defined(HAS_VECTOR) + /************* + * + * For matrix + * + */ + + /* V + V */ + template::value && + IsVector::value,bool>::type = true> + Vector const matrix_op(const index_t r,const index_t c) const + { + return(op_(lhs_[r],rhs_.vector_op(c))); + } + + template::value && + IsVector::value,bool>::type = true> + Vector const matrix_op_tail(const index_t r,const index_t c,const vector_length_t remaining) const + { + return(op_(lhs_[r],rhs_.vector_op_tail(c,remaining),inner::vctpq::mk(remaining))); + } + + +#endif + const LHS lhs_; + const RHS rhs_; + const _BinaryOperator op_; +}; + +template +struct IsVector<_Outer> +{ + constexpr static bool value = false; +}; + +template +struct HasMatrixIndexing<_Outer> +{ + constexpr static bool value = true; +}; + +template +struct StaticLength<_Outer> +{ + constexpr static vector_length_t value = StaticLength::value * StaticLength::value; +}; + +template +struct IsDynamic<_Outer> +{ + constexpr static vector_length_t value = IsDynamic::value || IsDynamic::value; +}; + +template +struct Complexity<_Outer> +{ + constexpr static int lhsv = Complexity::value; + constexpr static int rhsv = Complexity::value; + constexpr static int value = lhsv + rhsv + 1; +}; + +template +struct ElementType<_Outer> +{ + typedef typename ElementType::type type; +}; + +template +struct traits<_Outer> +{ + typedef typename traits::Scalar Scalar; +#if 
defined(HAS_VECTOR) + typedef typename traits::Vector Vector; +#endif +}; + +template +struct VecRef<_Outer> +{ + typedef _Outer type; + static type ref(const _Outer&a){ + return(a); + }; +}; + +template +struct NbRows<_Outer> +{ + constexpr static vector_length_t value = NbRows::value; +}; + + +template +struct NbCols<_Outer> +{ + constexpr static vector_length_t value = NbCols::value; +}; + + +template(),bool>::type = true> +inline auto outer(const VA&a,const VB&b) +{ + //constexpr int NBROWS = StaticLength::value; + //constexpr int NBCOLS = StaticLength::value; + + //using T = typename traits::Scalar; + + //Matrix res; + //_outer(res,a,b); + using Scalar = typename traits::Scalar; + using VecLHS = VecRef; + using VecRHS = VecRef; + + return(_Outer>(VecLHS::ref(a),VecRHS::ref(b),_MulOp())); + + +} + +/*! @} */ +} diff --git a/dsppp/Include/dsppp/matrix_impl.hpp b/dsppp/Include/dsppp/matrix_impl.hpp new file mode 100644 index 000000000..759e11cbc --- /dev/null +++ b/dsppp/Include/dsppp/matrix_impl.hpp @@ -0,0 +1,612 @@ +// -*- C++ -*- +/** @file */ +#pragma once + + +#include +#include +#include +#include +#include "common.hpp" +#include "arch.hpp" +#include +#include "number.hpp" +#include "forward.hpp" +#include "fusion.hpp" +#include "unroll.hpp" +#include "algorithms.hpp" +#include "vec.hpp" + +namespace arm_cmsis_dsp { + +/** \addtogroup Matrix + * @{ + */ + +/******************** + * + * MATRIX + * + ********************/ + +struct Slice +{ + Slice(const index_t s,const index_t e):start(s),stop(e){}; + + const index_t start; + const index_t stop; +}; + +template typename Allocator = TMP_ALLOC> +struct Matrix:Vector +{ + constexpr vector_length_t rows() const {return(R);} + constexpr vector_length_t columns() const {return(C);} + constexpr uint32_t stride() const {return(C);} + + + Matrix():Vector(){}; + explicit Matrix(P init_val):Vector(init_val){}; + + Matrix(const Matrix& other) = default; + Matrix(Matrix&& other) = default; + + template typename OtherAllocator> + explicit Matrix(const Matrix& other):Vector() + { + eval(*this,+other,(vector_length_t)(R*C),CURRENT_ARCH); + }; + + /* Applies only when the AST does not contain any dynamic MatrixView */ + template::value,bool>::type = true> + Matrix(const _Expr& other):Vector(other) + { + }; + + /* Applies only when AST is containing any dynamic MatrixView */ + template(),bool>::type = true> + Matrix(const _Expr& other):Vector() + { + eval2D(*this,other.derived(),rows(),columns(),CURRENT_ARCH); + }; + + template::value,bool>::type = true> + Matrix& operator=(const _Expr& other) + { + eval(*this,other.derived(),(vector_length_t)R*C,CURRENT_ARCH); + return(*this); + } + + /* Applies only when AST is containing any dynamic MatrixView */ + template(),bool>::type = true> + Matrix& operator=(const _Expr& other) + { + eval2D(*this,other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + } + + MatrixView sub(const index_t rs,const index_t cs) + { + const vector_length_t nb_rows = rows() - rs; + const vector_length_t nb_cols = columns() - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs),nb_rows,nb_cols)); + } + + const MatrixView sub(const index_t rs,const index_t cs) const + { + const vector_length_t nb_rows = rows() - rs; + const vector_length_t nb_cols = columns() - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs),nb_rows,nb_cols)); + } + + MatrixView sub(const Slice &rs,const index_t cs) + { + const vector_length_t nb_rows = rs.stop - rs.start; + const vector_length_t nb_cols = columns() - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs.start*stride()+cs),nb_rows,nb_cols)); + } + + const MatrixView sub(const Slice &rs,const index_t cs) const + { + const vector_length_t nb_rows = rs.stop - rs.start; + const vector_length_t nb_cols = columns() - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs.start*stride()+cs),nb_rows,nb_cols)); + } + + MatrixView sub(const index_t rs,const Slice &cs) + { + const vector_length_t nb_rows = rows() - rs; + const vector_length_t nb_cols = cs.stop - cs.start; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs.start),nb_rows,nb_cols)); + } + + const MatrixView sub(const index_t rs,const Slice &cs) const + { + const vector_length_t nb_rows = rows() - rs; + const vector_length_t nb_cols = cs.stop - cs.start; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs.start),nb_rows,nb_cols)); + } + + MatrixView sub(const Slice& rs,const Slice& cs) + { + const vector_length_t nb_rows = rs.stop - rs.start; + const vector_length_t nb_cols = cs.stop - cs.start; + + return(MatrixView(Vector_Base
<P>
::ptr(rs.start*stride()+cs.start),nb_rows,nb_cols)); + } + + const MatrixView sub(const Slice& rs,const Slice& cs) const + { + const vector_length_t nb_rows = rs.stop - rs.start; + const vector_length_t nb_cols = cs.stop - cs.start; + + return(MatrixView(Vector_Base
<P>
::ptr(rs.start*stride()+cs.start),nb_rows,nb_cols)); + } + + MatrixView sub(const index_t rs, + const index_t re, + const index_t cs, + const index_t ce) + { + const vector_length_t nb_rows = re - rs; + const vector_length_t nb_cols = ce - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs),nb_rows,nb_cols)); + } + + const MatrixView sub(const index_t rs, + const index_t re, + const index_t cs, + const index_t ce) const + { + const vector_length_t nb_rows = re - rs; + const vector_length_t nb_cols = ce - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs),nb_rows,nb_cols)); + } + + + Matrix& operator=(const Matrix& other) = default; + + Matrix& operator=(Matrix&& other) = default; + + P& operator()(const index_t r,const index_t c) + { + return(Vector_Base
<P>
::ptr()[r*C+c]); + } + + P& operator()(const index_t r,const index_t c) const + { + return(Vector_Base
<P>
::ptr()[r*C+c]); + } + + + friend std::ostream& operator<< (std::ostream& stream, const Matrix& other) { + int c=0; + for(index_t k=0;k + VectorView row(const index_t i,const index_t start=0,const index_t stop=C) + { + return(VectorView(*this,i*stride()+start,i*stride()+stop)); + } + + template + const VectorView row(const index_t i,const index_t start=0,const index_t stop=C) const + { + return(VectorView(*this,i*stride()+start,i*stride()+stop)); + } + + + template + VectorView col(const index_t i,const index_t start=0,const index_t stop=R) + { + return(VectorView(*this,i+stride()*start,i+stride()*stop)); + } + + template + const VectorView col(const index_t i,const index_t start=0,const index_t stop=R) const + { + return(VectorView(*this,i+stride()*start,i+stride()*stop)); + } + + template::value && + (RA == CA) && (RA>0) && + SameElementType::value,bool>::type = true> + static Matrix diagonal(const VA& a) + { + Matrix res; + _diagonal(res,a,RA); + return(res); + } + + template::value && + (RA == CA) && (RA>0) && + SameElementType::value,bool>::type = true> + void fill_diagonal(const VA& a) + { + _fill_diagonal(*this,a,RA); + } + + template0),bool>::type = true> + static Matrix identity() + { + Matrix res; + _identity(res,RA); + return(res); + } + + Matrix create() const + { + Matrix res; + return(res); + } + + Matrix transpose() const + { + Matrix res; + transposeTo(res,*this); + return(res); + } + +#if defined(HAS_VECTOR) + using VectorType = typename vector_traits
<P>
::vector; + void matrix_store(const index_t row, + const index_t col, + const VectorType val) const + { + Vector_Base
<P>
::vector_store(row*C + col,val); + } + +#if defined(HAS_PREDICATED_LOOP) + void matrix_store_tail(const index_t row, + const index_t col, + const vector_length_t remaining, + const VectorType val) const + { + Vector_Base
<P>
::vector_store_tail(row*C + col,remaining,val); + } + + VectorType const matrix_op_tail(const index_t row, + const index_t col, + const vector_length_t remaining) const + { + return(Vector_Base
<P>
::vector_op_tail(row*C + col,remaining)); + } +#endif + + VectorType const matrix_op(const index_t row, + const index_t col) const + { + return(Vector_Base
<P>
::vector_op(row*C + col)); + } +#endif + +}; + +template typename Allocator> +struct Matrix:Vector +{ + vector_length_t rows() const {return(rows_);} + vector_length_t columns() const {return(columns_);} + uint32_t stride() const {return(columns_);} + + + explicit Matrix(vector_length_t r,vector_length_t c): + Vector(r*c),rows_(r),columns_(c){}; + explicit Matrix(vector_length_t r,vector_length_t c,P init_val): + Vector(r*c,init_val),rows_(r),columns_(c){}; + + Matrix(const Matrix& other) = default; + Matrix(Matrix&& other) = default; + + P& operator()(const index_t r,const index_t c) + { + return(Vector_Base
<P>
::ptr()[r*columns()+c]); + } + + P& operator()(const index_t r,const index_t c) const + { + return(Vector_Base
<P>
::ptr()[r*columns()+c]); + } + + template typename OtherAllocator> + explicit Matrix(const Matrix& other): + Vector(other.rows()*other.columns()), + rows_(other.rows()),columns_(other.columns()) + { + if ((other.rows() == rows()) && (other.columns() == columns())) + { + eval(*this,+other,(vector_length_t)(other.rows()*other.columns()),CURRENT_ARCH); + } + }; + + template::value,bool>::type = true> + Matrix(const _Expr& other):Vector(other), + rows_(other.rows()),columns_(other.columns()) + { + }; + + template(),bool>::type = true> + Matrix(const _Expr& other): + Vector(other.rows()*other.columns()), + rows_(other.rows()),columns_(other.columns()) + { + eval2D(*this,other.derived(),rows(),columns(),CURRENT_ARCH); + }; + + template::value,bool>::type = true> + Matrix& operator=(const _Expr& other) + { + eval(*this,other.derived(),rows()*columns(),CURRENT_ARCH); + return(*this); + }; + + + template(),bool>::type = true> + Matrix& operator=(const _Expr& other) + { + eval2D(*this,other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + Matrix& operator=(const Matrix& other) = default; + + Matrix& operator=(Matrix&& other) = default; + + friend std::ostream& operator<< (std::ostream& stream, const Matrix& other) { + int c=0; + for(index_t k=0;k::value && + SameElementType::value,bool>::type = true> + static Matrix diagonal(const VA& a) + { + Matrix res(a.length(),a.length()); + _diagonal(res,a,a.length()); + return(res); + } + + template::value && + SameElementType::value,bool>::type = true> + void fill_diagonal(const VA& a) + { + _fill_diagonal(*this,a,this->length()); + } + + static Matrix identity(const vector_length_t l) + { + Matrix res(l,l); + _identity(res,l); + return(res); + } + + Matrix create() const + { + Matrix res(rows(),columns()); + return(res); + } + + Matrix transpose() const + { + Matrix res(columns(),rows()); + transposeTo(res,*this); + return(res); + } + + VectorView row(const index_t i,const index_t start=0) + { + return(VectorView(*this,i*this->stride()+start,i*this->stride()+this->columns())); + } + + VectorView row(const index_t i,const index_t start,const index_t stop) + { + return(VectorView(*this,i*this->stride()+start,i*this->stride()+stop)); + } + + const VectorView row(const index_t i,const index_t start=0) const + { + return(VectorView(*this,i*this->stride()+start,i*this->stride()+this->columns())); + } + + const VectorView row(const index_t i,const index_t start,const index_t stop) const + { + return(VectorView(*this,i*this->stride()+start,i*this->stride()+stop)); + } + + template + VectorView col(const index_t i,const index_t start=0) + { + return(VectorView(*this,i+this->stride()*start,i+this->stride()*this->rows(),this->stride()*CS)); + } + + template + VectorView col(const index_t i,const index_t start,const index_t stop) + { + return(VectorView(*this,i+this->stride()*start,i+this->stride()*stop,this->stride()*CS)); + } + + template + const VectorView col(const index_t i,const index_t start=0) const + { + return(VectorView(*this,i+this->stride()*start,i+this->stride()*this->rows(),this->stride()*CS)); + } + + template + const VectorView col(const index_t i,const index_t start,const index_t stop) const + { + return(VectorView(*this,i+this->stride()*start,i+this->stride()*stop,this->stride()*CS)); + } + +#if defined(HAS_VECTOR) + using VectorType = typename vector_traits
<P>
::vector; + void matrix_store(const index_t row, + const index_t col, + const VectorType val) const + { + Vector_Base
<P>
::vector_store(row*stride() + col,val); + } + +#if defined(HAS_PREDICATED_LOOP) + void matrix_store_tail(const index_t row, + const index_t col, + const vector_length_t remaining, + const VectorType val) const + { + Vector_Base
<P>
::vector_store_tail(row*stride() + col,remaining,val); + } + + VectorType const matrix_op_tail(const index_t row, + const index_t col, + const vector_length_t remaining) const + { + return(Vector_Base
<P>
::vector_op_tail(row*stride() + col,remaining)); + } +#endif + + VectorType const matrix_op(const index_t row, + const index_t col) const + { + return(Vector_Base
<P>
::vector_op(row*stride() + col)); + } +#endif + + MatrixView sub(const index_t rs,const index_t cs) + { + const vector_length_t nb_rows = rows() - rs; + const vector_length_t nb_cols = columns() - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs),nb_rows,nb_cols,stride())); + } + + const MatrixView sub(const index_t rs,const index_t cs) const + { + const vector_length_t nb_rows = rows() - rs; + const vector_length_t nb_cols = columns() - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs),nb_rows,nb_cols,stride())); + } + + MatrixView sub(const Slice &rs,const index_t cs) + { + const vector_length_t nb_rows = rs.stop - rs.start; + const vector_length_t nb_cols = columns() - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs.start*stride()+cs),nb_rows,nb_cols,stride())); + } + + const MatrixView sub(const Slice &rs,const index_t cs) const + { + const vector_length_t nb_rows = rs.stop - rs.start; + const vector_length_t nb_cols = columns() - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs.start*stride()+cs),nb_rows,nb_cols,stride())); + } + + MatrixView sub(const index_t rs,const Slice &cs) + { + const vector_length_t nb_rows = rows() - rs; + const vector_length_t nb_cols = cs.stop - cs.start; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs.start),nb_rows,nb_cols,stride())); + } + + const MatrixView sub(const index_t rs,const Slice &cs) const + { + const vector_length_t nb_rows = rows() - rs; + const vector_length_t nb_cols = cs.stop - cs.start; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs.start),nb_rows,nb_cols,stride())); + } + + MatrixView sub(const Slice& rs,const Slice& cs) + { + const vector_length_t nb_rows = rs.stop - rs.start; + const vector_length_t nb_cols = cs.stop - cs.start; + + return(MatrixView(Vector_Base
<P>
::ptr(rs.start*stride()+cs.start),nb_rows,nb_cols,stride())); + } + + const MatrixView sub(const Slice& rs,const Slice& cs) const + { + const vector_length_t nb_rows = rs.stop - rs.start; + const vector_length_t nb_cols = cs.stop - cs.start; + + return(MatrixView(Vector_Base
<P>
::ptr(rs.start*stride()+cs.start),nb_rows,nb_cols,stride())); + } + + MatrixView sub(const index_t rs, + const index_t re, + const index_t cs, + const index_t ce) + { + const vector_length_t nb_rows = re - rs; + const vector_length_t nb_cols = ce - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs),nb_rows,nb_cols,stride())); + } + + const MatrixView sub(const index_t rs, + const index_t re, + const index_t cs, + const index_t ce) const + { + const vector_length_t nb_rows = re - rs; + const vector_length_t nb_cols = ce - cs; + + return(MatrixView(Vector_Base
<P>
::ptr(rs*stride()+cs),nb_rows,nb_cols,stride())); + } + +protected: + vector_length_t rows_,columns_; +}; + + +/*! @} */ +} \ No newline at end of file diff --git a/dsppp/Include/dsppp/matrix_view.hpp b/dsppp/Include/dsppp/matrix_view.hpp new file mode 100644 index 000000000..d37bf9dce --- /dev/null +++ b/dsppp/Include/dsppp/matrix_view.hpp @@ -0,0 +1,751 @@ +// -*- C++ -*- +/** @file */ +#pragma once + + +#include +#include +#include +#include +#include "common.hpp" +#include "arch.hpp" +#include +#include "number.hpp" +#include "forward.hpp" +#include "fusion.hpp" +#include "unroll.hpp" +#include "algorithms.hpp" +#include "vec.hpp" +#include "matrix_impl.hpp" + +namespace arm_cmsis_dsp { + +/** \addtogroup Matrix + * @{ + */ + +template +struct MatrixView +{ + vector_length_t rows() const {return(nb_rows_);} + vector_length_t columns() const {return(nb_cols_);} + constexpr uint32_t stride() const {return(S);} + + explicit MatrixView(T* v, + const vector_length_t rows, + const vector_length_t cols): + v_(v),nb_rows_(rows),nb_cols_(cols){}; + + explicit MatrixView(const Vector_Base &v, + const vector_length_t rows, + const vector_length_t cols): + v_(v.ptr()),nb_rows_(rows),nb_cols_(cols){}; + + virtual ~MatrixView() {}; + + MatrixView(const MatrixView& other): + v_(other.v_), + nb_rows_(other.nb_rows_),nb_cols_(other.nb_cols_){}; + + MatrixView(MatrixView&& other) : + v_(other.v_), + nb_rows_(other.nb_rows_),nb_cols_(other.nb_cols_){}; + + + MatrixView& operator=(const MatrixView& other) = delete; + MatrixView& operator=(MatrixView&& other) = delete; + + T& operator()(const index_t r,const index_t c) + { + return(v_[r*stride()+c]); + } + + T const operator()(const index_t r,const index_t c) const + { + return(v_[r*stride()+c]); + } + + + template + MatrixView& operator=(const _Expr&other) + { + eval2D(*this,other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + } + + MatrixView& operator=(const T val) + { + _Fill2D(*this,val,rows(),columns(),CURRENT_ARCH); + + return(*this); + } + + + template + MatrixView& operator +=(const _Expr& other) + { + eval2D(*this,*this + other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator +=(const MatrixView& other) + { + eval2D(*this,*this + other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator +=(const T other) + { + eval2D(*this,*this + other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + template + MatrixView& operator -=(const _Expr& other) + { + eval2D(*this,*this - other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + + MatrixView& operator -=(const MatrixView& other) + { + eval2D(*this,*this - other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator -=(const T other) + { + eval2D(*this,*this - other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + template + MatrixView& operator *=(const _Expr& other) + { + eval2D(*this,*this * other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator *=(const MatrixView& other) + { + eval2D(*this,*this * other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator *=(const T other) + { + eval2D(*this,*this * other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + friend std::ostream& operator<< (std::ostream& stream, const MatrixView& other) { + for(index_t row=0;row row(const index_t i,const index_t start=0) + { + return(VectorView(v_,i*stride()+start,i*stride()+columns())); + } + + VectorView 
row(const index_t i,const index_t start,const index_t stop) + { + return(VectorView(v_,i*stride()+start,i*stride()+stop)); + } + + const VectorView row(const index_t i,const index_t start=0) const + { + return(VectorView(v_,i*stride()+start,i*stride()+columns())); + } + + const VectorView row(const index_t i,const index_t start,const index_t stop) const + { + return(VectorView(v_,i*stride()+start,i*stride()+stop)); + } + + template + VectorView col(const index_t i,const index_t start=0) + { + return(VectorView(v_,i+stride()*start,i+stride()*rows())); + } + + template + VectorView col(const index_t i,const index_t start,const index_t stop) + { + return(VectorView(v_,i+stride()*start,i+stride()*stop)); + } + + template + const VectorView col(const index_t i,const index_t start=0) const + { + return(VectorView(v_,i+stride()*start,i+stride()*rows())); + } + + template + const VectorView col(const index_t i,const index_t start,const index_t stop) const + { + return(VectorView(v_,i+stride()*start,i+stride()*stop)); + } + + #if defined(HAS_VECTOR) + using VectorType = typename vector_traits::vector; + void matrix_store(const index_t row, + const index_t col, + const VectorType val) const + { + inner::vstore1<1>((typename std::remove_cv::type*)(&v_[row*stride() + col]),val); + } + +#if defined(HAS_PREDICATED_LOOP) + void matrix_store_tail(const index_t row, + const index_t col, + const vector_length_t remaining, + const VectorType val) const + { + inner::vstore1_z<1>((typename std::remove_cv::type*)(&v_[row*stride() + col]),val,remaining,inner::vctpq::mk(remaining)); + } + + VectorType const matrix_op_tail(const index_t row, + const index_t col, + const vector_length_t remaining) const + { + return(inner::vload1_z<1>((typename std::remove_cv::type*)(&v_[row*stride() + col]),remaining,inner::vctpq::mk(remaining))); + } +#endif + + VectorType const matrix_op(const index_t row, + const index_t col) const + { + return(inner::vload1<1>((typename std::remove_cv::type*)(&v_[row*stride() + col]))); + } +#endif + + template::value && + SameElementType::value,bool>::type = true> + void fill_diagonal(const VA& a) + { + _fill_diagonal(*this,a,this->length()); + } + + Matrix transpose() const + { + Matrix res(columns(),rows()); + transposeTo(res,*this); + return(res); + } + + Matrix create() const + { + Matrix res(rows(),columns()); + return(res); + } + + T* ptr() const {return(v_);} + const T* const_ptr() const {return(v_);} + +protected: + T* const v_; + const vector_length_t nb_rows_; + const vector_length_t nb_cols_; +}; + +/* + +When the stride if not known at build time AND different +from the nb_cols_ + +*/ +template +struct MatrixView +{ + vector_length_t rows() const {return(nb_rows_);} + vector_length_t columns() const {return(nb_cols_);} + uint32_t stride() const {return(stride_);} + + explicit MatrixView(T* v, + const vector_length_t rows, + const vector_length_t cols, + const uint32_t stride): + v_(v),nb_rows_(rows),nb_cols_(cols),stride_(stride){}; + + explicit MatrixView(const Vector_Base &v, + const vector_length_t rows, + const vector_length_t cols, + const uint32_t stride): + v_(v.ptr()),nb_rows_(rows),nb_cols_(cols),stride_(stride){}; + + virtual ~MatrixView() {}; + + MatrixView(const MatrixView& other): + v_(other.v_), + nb_rows_(other.nb_rows_),nb_cols_(other.nb_cols_),stride_(other.stride_){}; + + MatrixView(MatrixView&& other) : + v_(other.v_), + nb_rows_(other.nb_rows_),nb_cols_(other.nb_cols_),stride_(other.stride_){}; + + + MatrixView& operator=(const MatrixView& other) = delete; + 
MatrixView& operator=(MatrixView&& other) = delete; + + T& operator()(const index_t r,const index_t c) + { + return(v_[r*stride()+c]); + } + + T const operator()(const index_t r,const index_t c) const + { + return(v_[r*stride()+c]); + } + + + template + MatrixView& operator=(const _Expr&other) + { + eval2D(*this,other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + } + + MatrixView& operator=(const T val) + { + _Fill2D(*this,val,rows(),columns(),CURRENT_ARCH); + + return(*this); + } + + + template + MatrixView& operator +=(const _Expr& other) + { + eval2D(*this,*this + other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator +=(const MatrixView& other) + { + eval2D(*this,*this + other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator +=(const T other) + { + eval2D(*this,*this + other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + template + MatrixView& operator -=(const _Expr& other) + { + eval2D(*this,*this - other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + + MatrixView& operator -=(const MatrixView& other) + { + eval2D(*this,*this - other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator -=(const T other) + { + eval2D(*this,*this - other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + template + MatrixView& operator *=(const _Expr& other) + { + eval2D(*this,*this * other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator *=(const MatrixView& other) + { + eval2D(*this,*this * other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator *=(const T other) + { + eval2D(*this,*this * other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + friend std::ostream& operator<< (std::ostream& stream, const MatrixView& other) { + for(index_t row=0;row row(const index_t i,const index_t start=0) + { + return(VectorView(v_,i*stride()+start,i*stride()+columns())); + } + + VectorView row(const index_t i,const index_t start,const index_t stop) + { + return(VectorView(v_,i*stride()+start,i*stride()+stop)); + } + + const VectorView row(const index_t i,const index_t start=0) const + { + return(VectorView(v_,i*stride()+start,i*stride()+columns())); + } + + const VectorView row(const index_t i,const index_t start,const index_t stop) const + { + return(VectorView(v_,i*stride()+start,i*stride()+stop)); + } + + template + VectorView col(const index_t i,const index_t start=0) + { + return(VectorView(v_,i+stride()*start,i+stride()*rows(),stride()*CS)); + } + + template + VectorView col(const index_t i,const index_t start,const index_t stop) + { + return(VectorView(v_,i+stride()*start,i+stride()*stop,stride()*CS)); + } + + template + const VectorView col(const index_t i,const index_t start=0) const + { + return(VectorView(v_,i+stride()*start,i+stride()*rows(),stride()*CS)); + } + + template + const VectorView col(const index_t i,const index_t start,const index_t stop) const + { + return(VectorView(v_,i+stride()*start,i+stride()*stop,stride()*CS)); + } + + #if defined(HAS_VECTOR) + using VectorType = typename vector_traits::vector; + void matrix_store(const index_t row, + const index_t col, + const VectorType val) const + { + inner::vstore1<1>((typename std::remove_cv::type*)(&v_[row*stride() + col]),val); + } + +#if defined(HAS_PREDICATED_LOOP) + void matrix_store_tail(const index_t row, + const index_t col, + const vector_length_t remaining, + const VectorType val) const + { + 
inner::vstore1_z<1>((typename std::remove_cv::type*)(&v_[row*stride() + col]),val,remaining,inner::vctpq::mk(remaining)); + } + + VectorType const matrix_op_tail(const index_t row, + const index_t col, + const vector_length_t remaining) const + { + return(inner::vload1_z<1>((typename std::remove_cv::type*)(&v_[row*stride() + col]),remaining,inner::vctpq::mk(remaining))); + } +#endif + + VectorType const matrix_op(const index_t row, + const index_t col) const + { + return(inner::vload1<1>((typename std::remove_cv::type*)(&v_[row*stride() + col]))); + } +#endif + + template::value && + SameElementType::value,bool>::type = true> + void fill_diagonal(const VA& a) + { + _fill_diagonal(*this,a,this->length()); + } + + Matrix transpose() const + { + Matrix res(columns(),rows()); + transposeTo(res,*this); + return(res); + } + + Matrix create() const + { + Matrix res(rows(),columns()); + return(res); + } + + T* ptr() const {return(v_);} + + const T* const_ptr() const {return(v_);} + + +protected: + T* const v_; + const vector_length_t nb_rows_; + const vector_length_t nb_cols_; + const uint32_t stride_; +}; + +/* + + +Dynamic but with stride == nb_cols_ + +*/ + +template +struct MatrixView:VectorView +{ + vector_length_t rows() const {return(nb_rows_);} + vector_length_t columns() const {return(nb_cols_);} + uint32_t stride() const {return(nb_cols_);} + + explicit MatrixView(T* v, + const vector_length_t rows, + const vector_length_t cols): + VectorView(v,0,rows*cols), + nb_rows_(rows),nb_cols_(cols){}; + + explicit MatrixView(const Vector_Base &v, + const vector_length_t rows, + const vector_length_t cols): + VectorView(v.ptr(),0,rows*cols), + nb_rows_(rows),nb_cols_(cols){}; + + virtual ~MatrixView() {}; + + MatrixView(const MatrixView& other): + VectorView(other), + nb_rows_(other.nb_rows_),nb_cols_(other.nb_cols_){}; + + MatrixView(MatrixView&& other) : + VectorView(std::forward(other)), + nb_rows_(other.nb_rows_),nb_cols_(other.nb_cols_){}; + + + MatrixView& operator=(const MatrixView& other) = delete; + MatrixView& operator=(MatrixView&& other) = delete; + + T& operator()(const index_t r,const index_t c) + { + return(&(*this)[r*stride()+c]); + } + + T const operator()(const index_t r,const index_t c) const + { + return((*this)[r*stride()+c]); + } + + + template + MatrixView& operator=(const _Expr&other) + { + eval2D(*this,other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + } + + MatrixView& operator=(const T val) + { + _Fill2D(*this,val,rows(),columns(),CURRENT_ARCH); + + return(*this); + } + + + template + MatrixView& operator +=(const _Expr& other) + { + eval2D(*this,*this + other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator +=(const MatrixView& other) + { + eval2D(*this,*this + other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator +=(const T other) + { + eval2D(*this,*this + other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + template + MatrixView& operator -=(const _Expr& other) + { + eval2D(*this,*this - other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + + MatrixView& operator -=(const MatrixView& other) + { + eval2D(*this,*this - other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator -=(const T other) + { + eval2D(*this,*this - other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + template + MatrixView& operator *=(const _Expr& other) + { + eval2D(*this,*this * other.derived(),rows(),columns(),CURRENT_ARCH); + return(*this); 
+ }; + + MatrixView& operator *=(const MatrixView& other) + { + eval2D(*this,*this * other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + MatrixView& operator *=(const T other) + { + eval2D(*this,*this * other,rows(),columns(),CURRENT_ARCH); + return(*this); + }; + + friend std::ostream& operator<< (std::ostream& stream, const MatrixView& other) { + for(index_t row=0;row row(const index_t i,const index_t start=0) + { + return(VectorView(this->ptr(),i*stride()+start,i*stride()+columns())); + } + + VectorView row(const index_t i,const index_t start,const index_t stop) + { + return(VectorView(this->ptr(),i*stride()+start,i*stride()+stop)); + } + + const VectorView row(const index_t i,const index_t start=0) const + { + return(VectorView(this->ptr(),i*stride()+start,i*stride()+columns())); + } + + const VectorView row(const index_t i,const index_t start,const index_t stop) const + { + return(VectorView(this->ptr(),i*stride()+start,i*stride()+stop)); + } + + template + VectorView col(const index_t i,const index_t start=0) + { + return(VectorView(this->ptr(),i+stride()*start,i+stride()*rows(),stride()*CS)); + } + + template + VectorView col(const index_t i,const index_t start,const index_t stop) + { + return(VectorView(this->ptr(),i+stride()*start,i+stride()*stop,stride()*CS)); + } + + template + const VectorView col(const index_t i,const index_t start=0) const + { + return(VectorView(this->ptr(),i+stride()*start,i+stride()*rows(),stride()*CS)); + } + + template + const VectorView col(const index_t i,const index_t start,const index_t stop) const + { + return(VectorView(this->ptr(),i+stride()*start,i+stride()*stop,stride()*CS)); + } + + #if defined(HAS_VECTOR) + using VectorType = typename vector_traits::vector; + void matrix_store(const index_t row, + const index_t col, + const VectorType val) const + { + inner::vstore1<1>((typename std::remove_cv::type*)(ptr(row*stride() + col)),val); + } + +#if defined(HAS_PREDICATED_LOOP) + void matrix_store_tail(const index_t row, + const index_t col, + const vector_length_t remaining, + const VectorType val) const + { + inner::vstore1_z<1>((typename std::remove_cv::type*)(ptr(row*stride() + col)),val,remaining,inner::vctpq::mk(remaining)); + } + + VectorType const matrix_op_tail(const index_t row, + const index_t col, + const vector_length_t remaining) const + { + return(inner::vload1_z<1>((typename std::remove_cv::type*)(VectorView::ptr(row*stride() + col)),remaining,inner::vctpq::mk(remaining))); + } +#endif + + VectorType const matrix_op(const index_t row, + const index_t col) const + { + return(inner::vload1<1>((typename std::remove_cv::type*)(VectorView::ptr(row*stride() + col)))); + } +#endif + + template::value && + SameElementType::value,bool>::type = true> + void fill_diagonal(const VA& a) + { + _fill_diagonal(*this,a,this->length()); + } + + Matrix transpose() const + { + Matrix res(columns(),rows()); + transposeTo(res,*this); + return(res); + } + + Matrix create() const + { + Matrix res(rows(),columns()); + return(res); + } + + +protected: + const vector_length_t nb_rows_; + const vector_length_t nb_cols_; +}; + +/*! 
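+
+   Usage sketch (illustrative only; element type, sizes and the default allocator
+   of Matrix are assumptions):
+
+     Matrix<float32_t,4,4> m(1.0f);           // static dimensions, filled with 1.0f
+     auto block = m.sub(1,1);                 // 3x3 MatrixView sharing m's storage
+     block = 0.0f;                            // _Fill2D over the view only
+     Matrix<float32_t,4,4> t = m.transpose();
+
+     Matrix<float32_t,DYNAMIC,DYNAMIC> d(5,5,0.0f);
+     auto r0 = d.row(0);                      // VectorView of the first row
+     Matrix<float32_t,DYNAMIC,DYNAMIC> o = outer(r0,r0);  // _Outer expression,
+                                              // evaluated in one eval2D pass
+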
@} */ + +} \ No newline at end of file diff --git a/dsppp/Include/dsppp/memory_pool.hpp b/dsppp/Include/dsppp/memory_pool.hpp new file mode 100644 index 000000000..7b60b4f83 --- /dev/null +++ b/dsppp/Include/dsppp/memory_pool.hpp @@ -0,0 +1,259 @@ +// -*- C++ -*- +/** @file */ +#pragma once + + +#include +#include +#include +#include "common.hpp" + +namespace arm_cmsis_dsp { + +/** \addtogroup MEMORY Memory allocator + * \ingroup DSPPP + * @{ + */ + +/* + +Buffer allocator + +Can be used to build memory allocators foe vector +and matrix. + +For instance, it is usedin the Memory pool allocator + +*/ + +struct default_user_allocator_malloc_free +{ + static char * malloc(const std::size_t bytes) + { + #if !defined(MEMORY_ALLOCATION_DEBUG) + return reinterpret_cast(std::malloc(bytes)); + #else + char *ret=reinterpret_cast(std::malloc(bytes)); + if (ret==nullptr) + { + std::cout << "out of memory for " << bytes << " bytes\r\n"; + } + return(ret); + #endif + } + static void free(char * const block) + { + #if defined(MEMORY_ALLOCATION_DEBUG) + if (block==nullptr) + { + std::cout << "free null ptr \r\n"; + } + #endif + std::free(block); + } +}; + +inline void* aligned_malloc(std::size_t alignment, std::size_t size) +{ + void *ptr=std::malloc(size+alignment+sizeof(void*)); + void *aligned = + reinterpret_cast( + (reinterpret_cast(ptr)+sizeof(void*)+alignment) & ~(alignment-1) + ); + + *(static_cast(aligned) - 1) = ptr; + return(aligned); +} + +inline void +aligned_free(void* ptr) +{ + if (ptr) { + std::free(*(static_cast(ptr) - 1)); + } +}; + + +struct user_allocator_aligned_malloc +{ + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + static char * malloc(const size_type bytes) + { + #if !defined(MEMORY_ALLOCATION_DEBUG) + return reinterpret_cast(aligned_malloc(MEMORY_POOL_ALIGNMENT, bytes)); + #else + char *ret = reinterpret_cast(aligned_malloc(MEMORY_POOL_ALIGNMENT, bytes)); + if (ret==nullptr) + { + std::cout << "out of memory for " << bytes << " bytes\r\n"; + } + return(ret); + #endif + } + static void free(char * const block) + { + #if defined(MEMORY_ALLOCATION_DEBUG) + if (block==nullptr) + { + std::cout << "free null ptr \r\n"; + } + #endif + aligned_free(block); + } +}; + +/* + +Memory allocator for vector and matrix. + +*/ + +// Default allocator +// Other allocator must be provided by user of the library +template +struct malloc_allocator { + /* Dynamic size allocations */ + static char* allocate ( vector_length_t sz) noexcept{ + char *res; + res=reinterpret_cast(std::malloc(sz)); + #if defined(MEMORY_ALLOCATION_DEBUG) + if (res==nullptr) + { + std::cout << "out of memory for " << sz << " bytes\r\n"; + } + #endif + return(res); + } + + /* Size know at build time */ + static char* allocate ( ) noexcept{ + char *res; + res=reinterpret_cast(std::malloc(L)); + #if defined(MEMORY_ALLOCATION_DEBUG) + if (res==nullptr) + { + std::cout << "out of memory for " << L << " bytes\r\n"; + } + #endif + return(res); + } + + static void destroy ( char* ptr ) noexcept { + #if defined(MEMORY_ALLOCATION_DEBUG) + if (ptr==nullptr) + { + std::cout << "free null ptr \r\n"; + } + #endif + std::free(ptr); + } + +}; + + +/* + +Memory pool + +Memory pool is using a buffer +allocator (aligned or normal malloc) + +A memory pool can be used to by a memory allocator for +vectors and matrixes. 
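+
+  Usage sketch (illustrative only; the template-argument order shown here, buffer
+  size first and then the user allocator, is an assumption):
+
+    MemoryPool<128,user_allocator_aligned_malloc> pool(10); // ten 128-byte aligned buffers
+
+    char *buf = pool.get_new_buffer();   // O(1): pops the head of the free list
+    ...                                  // use the buffer
+    pool.recycle_buffer(buf);            // O(1): pushes it back onto the free list
+    pool.reset();                        // rebuilds the free list: all buffers available again
+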
+ + +*/ + +struct ListElem; + +struct ListElem { + ListElem *next; +}; + +template +class MemoryPool { +public: + explicit MemoryPool(const uint16_t nbBufs) + { + buffer_list.reserve(nbBufs); + buffer_list.assign(nbBufs,nullptr); + for(auto p=buffer_list.begin();p != buffer_list.end(); ++p) + { + *p = UserAllocator::malloc(BUF_SIZE < sizeof(ListElem) ? sizeof(ListElem) : BUF_SIZE); + } + reset(); + }; + + ~MemoryPool() + { + for(auto p=buffer_list.begin();p != buffer_list.end(); ++p) + { + UserAllocator::free(*p); + } + } + + MemoryPool(const MemoryPool& other) = delete; + + MemoryPool(MemoryPool&& other) = delete; + + + MemoryPool& operator=(const MemoryPool& other) = delete; + + MemoryPool& operator=(MemoryPool&& other) = delete; + + char* get_new_buffer() noexcept + { + /* No error handling. + The sizing of the pool must have been done, for + instance, with a statistic allocator. + Allocation is thus assumed to succeed */ + char* res = reinterpret_cast(free); + free = free->next; + #if defined(MEMORY_ALLOCATION_DEBUG) + if (res == nullptr) + { + std::cout << "memory pool alloc error " << BUF_SIZE << " bytes\r\n"; + } + #endif + return(res); + } + + void recycle_buffer(char* buf) noexcept + { + ListElem *l = reinterpret_cast(buf); + #if defined(MEMORY_ALLOCATION_DEBUG) + if (l == nullptr) + { + std::cout << "memory pool free error " << BUF_SIZE << " bytes\r\n"; + } + #endif + l->next = free; + free = l; + } + + void reset() noexcept + { + const int nbBufs = buffer_list.size(); + for(int i=0;i(buffer_list[i]); + l->next = reinterpret_cast(buffer_list[i+1]); + } + ListElem *l=reinterpret_cast(buffer_list[nbBufs-1]); + l->next = nullptr; + free = reinterpret_cast(buffer_list[0]); + } + + + +protected: + ListElem *free; + std::vector buffer_list; +}; + + +/*! @} */ + +} \ No newline at end of file diff --git a/dsppp/Include/dsppp/num_features/double.hpp b/dsppp/Include/dsppp/num_features/double.hpp new file mode 100644 index 000000000..1e3c78ae7 --- /dev/null +++ b/dsppp/Include/dsppp/num_features/double.hpp @@ -0,0 +1,63 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup GenericNumber Scalar number definitions + * \ingroup NUMBER + * @{ + * \addtogroup GenericDoubleNumber Double + * \ingroup GenericNumber + * @{ + */ + + +template<> +struct number_traits +{ + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + typedef double accumulator; + static constexpr double one() {return 1.0;}; + typedef double compute_type; +}; + +template +struct vector_traits { + typedef double type; + typedef double storage_type; + + // No vector type but must still be defined + typedef bool vector; + typedef bool temp_accumulator; + typedef uint32_t predicate_t; + + static constexpr bool has_vector = false; + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + static constexpr bool has_predicate = false; +}; + +namespace inner { + __STATIC_FORCEINLINE double from_accumulator(const double a) + { + return(a); + }; + + __STATIC_FORCEINLINE double mac(const double acc,const double a,const double b) + { + return(acc+a*b); + }; + + __STATIC_FORCEINLINE void accumulate(double &a,const double &b) +{ + a += b; +} + +__STATIC_FORCEINLINE double mult(double &a,const double &b) +{ + return(a*b); +} +} + +/*! @} */ +/*! 
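+
+  Since has_vector is false for double, expression evaluation falls back to the
+  scalar loops, and an accumulation step reduces to code of this shape
+  (illustrative sketch; a, b and n stand for any indexable operands and length):
+
+    double acc = 0.0;
+    for(index_t i=0; i<n ; i++) {
+       acc = inner::mac(acc,a[i],b[i]);   // acc + a[i]*b[i]
+    }
+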
@} */ diff --git a/dsppp/Include/dsppp/num_features/float.hpp b/dsppp/Include/dsppp/num_features/float.hpp new file mode 100644 index 000000000..bf7838302 --- /dev/null +++ b/dsppp/Include/dsppp/num_features/float.hpp @@ -0,0 +1,77 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup GenericNumber + * \ingroup NUMBER + * @{ + * \addtogroup GenericFloatNumber Float + * \ingroup GenericNumber + * @{ + */ + +template<> +struct number_traits +{ + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + typedef float accumulator; + static constexpr float one() {return 1.0f;}; + typedef float compute_type; +}; + + +/* + +If arch is not deriving from Neon or Helium, then there are +no vectors for float + +*/ +template +struct vector_traits::value && + !std::is_base_of::value>::type> { + typedef float type; + typedef float storage_type; + + // No vector type but must still be defined + typedef bool vector; + typedef bool temp_accumulator; + typedef uint32_t predicate_t; + + + static constexpr bool has_vector = false; + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + static constexpr bool has_predicate = false; + +}; + +namespace inner { + __STATIC_FORCEINLINE float from_accumulator(const float a) + { + return(a); + }; + + __STATIC_FORCEINLINE float mac(const float acc,const float a,const float b) + { + return(acc+a*b); + }; + +__STATIC_FORCEINLINE void accumulate(float &a,const float &b) +{ + a += b; +} + +__STATIC_FORCEINLINE float mult(float &a,const float &b) +{ + return(a*b); +} + +} + + +/*! @} */ +/*! @} */ + + diff --git a/dsppp/Include/dsppp/num_features/group.hpp b/dsppp/Include/dsppp/num_features/group.hpp new file mode 100644 index 000000000..f55d98770 --- /dev/null +++ b/dsppp/Include/dsppp/num_features/group.hpp @@ -0,0 +1,171 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup GenericNumber + * \ingroup NUMBER + * @{ + * \addtogroup GenericTUPLENumber Tuple + * Tuples of numbers or expressions used for unrolling + * \ingroup GenericNumber + * @{ + */ + +template +struct number_traits> +{ + static constexpr bool is_float = false; + static constexpr bool is_fixed = false; + typedef std::tuple::accumulator...> accumulator; + typedef std::tuple::compute_type...> compute_type; + + static std::tuple::accumulator...> one() + { + return(std::make_tuple(vector_traits::one()...)); + } + +}; + +/* + +Assume that all E are using the same scalar type or coherent types +like f32 and q13 that have same number of lanes. + +Any other mix will not work and won't be catched at build time. + +*/ +template +struct vector_traits,arch> { + using RefScalar = typename std::tuple_element<0,std::tuple>::type; + + typedef std::tuple::temp_accumulator...> temp_accumulator; + typedef std::tuple::vector...> vector; + typedef std::tuple::predicate_t...> predicate_t; + + static constexpr int nb_lanes = vector_traits::nb_lanes; + + static constexpr bool has_vector = vector_traits::has_vector; + static constexpr bool is_float = vector_traits::is_float; + static constexpr bool is_fixed = vector_traits::is_fixed; + static constexpr bool has_predicate = vector_traits::has_predicate; + + static temp_accumulator temp_acc_zero() + { + return(std::make_tuple(vector_traits::temp_acc_zero()...)); + } + +}; + +namespace inner { + + + + /* + + Assume that the vctpq is the same for all tuple elements. + If it is not the case, we can't get a predicated loop and + the code contains additional VPSTTTT and it is not + efficient. 
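+
+     For instance, both elements of a std::tuple<Q31,Q31> expression share the same
+     4-lane vctp32q predicate, so a single predicated tail covers the whole tuple;
+     mixing lane counts (for example float32_t with Q15 on Helium) breaks this
+     assumption, as explained above (illustrative example).
+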
+ + */ +#if defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) + template + struct vctpq> + { + static auto mk(const uint32_t v/*, + typename std::enable_if<(vector_traits::nb_lanes == ...),bool>::type* = nullptr*/) + { + return(vctpq>>::mk(v)); + }; + }; +#endif + /* + + Typical configuration is vmacc between tuple and tuple + but also very common is vmacc between tuple and vector + + */ + + template + __STATIC_FORCEINLINE A vmacc_impl(const A &acc,const V &a,const V &b, std::index_sequence) + { + return(std::make_tuple(vmacc(std::get(acc),std::get(a),std::get(b))...)); + }; + + template + __STATIC_FORCEINLINE A + vmacc(const A &acc,const std::tuple &a,const std::tuple &b) + { + return(vmacc_impl(acc,a,b,std::make_index_sequence())); + }; + + template + __STATIC_FORCEINLINE A vmacc_impl(const A &acc,const V &a,const V &b, const B p0,std::index_sequence) + { + return(std::make_tuple(vmacc(std::get(acc),std::get(a),std::get(b),p0)...)); + }; + + template + __STATIC_FORCEINLINE A + vmacc(const A &acc,const std::tuple &a,const std::tuple &b,const B p0) + { + return(vmacc_impl(acc,a,b,p0,std::make_index_sequence())); + }; + + + + template + __STATIC_FORCEINLINE auto vreduce_impl(const A &acc, std::index_sequence) + { + return(std::make_tuple(vreduce(std::get(acc))...)); + }; + + template + __STATIC_FORCEINLINE auto vreduce(const std::tuple &acc) + { + return(vreduce_impl(acc,std::make_index_sequence())); + }; + + template + __STATIC_FORCEINLINE auto from_accumulator_impl(const A &acc, std::index_sequence) + { + return(std::make_tuple(from_accumulator(std::get(acc))...)); + }; + + template + __STATIC_FORCEINLINE auto from_accumulator(const std::tuple &acc) + { + return(from_accumulator_impl(acc,std::make_index_sequence())); + }; + + template + __STATIC_FORCEINLINE A mac_impl(const A &acc,const V &a,const V &b, std::index_sequence) + { + return(std::make_tuple(mac(std::get(acc),std::get(a),std::get(b))...)); + }; + + template + __STATIC_FORCEINLINE A + mac(const A &acc,const std::tuple &a,const std::tuple &b) + { + return(mac_impl(acc,a,b,std::make_index_sequence())); + }; + + template + __STATIC_FORCEINLINE A mac_impl(const A &acc,const V &a,const V &b, const B p0,std::index_sequence) + { + return(std::make_tuple(mac(std::get(acc),std::get(a),std::get(b),p0)...)); + }; + + template + __STATIC_FORCEINLINE A + mac(const A &acc,const std::tuple &a,const std::tuple &b,const B p0) + { + return(mac_impl(acc,a,b,p0,std::make_index_sequence())); + }; + +}; + + +/*! @} */ +/*! 
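+
+  Expansion sketch (illustrative): for a tuple of three accumulators,
+  as produced by an unrolled dot product, vmacc(acc,a,b) above unfolds
+  through the index_sequence into
+     std::make_tuple(vmacc(std::get<0>(acc),std::get<0>(a),std::get<0>(b)),
+                     vmacc(std::get<1>(acc),std::get<1>(a),std::get<1>(b)),
+                     vmacc(std::get<2>(acc),std::get<2>(a),std::get<2>(b)))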
@} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/num_features/half.hpp b/dsppp/Include/dsppp/num_features/half.hpp new file mode 100644 index 000000000..dd24fc785 --- /dev/null +++ b/dsppp/Include/dsppp/num_features/half.hpp @@ -0,0 +1,76 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#ifdef DOXYGEN +#define ARM_FLOAT16_SUPPORTED +#endif + +/** \addtogroup GenericNumber + * \ingroup NUMBER + * @{ + * \addtogroup GenericHalfNumber Half + * \ingroup GenericNumber + * @{ + */ + +#if defined(ARM_FLOAT16_SUPPORTED) +template<> +struct number_traits +{ + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + typedef float16_t accumulator; + static constexpr float16_t one() {return ((float16_t)1.0f);}; + + typedef _Float16 compute_type; +}; + + +#if !defined(ARM_MATH_MVE_FLOAT16) +template<> +struct vector_traits { + typedef float16_t type; + typedef float16_t storage_type; + + // No vector type but must still be defined + typedef bool vector; + typedef bool temp_accumulator; + typedef uint32_t predicate_t; + + + static constexpr bool has_vector = false; + static constexpr bool is_float = true; + static constexpr bool is_fixed = false; + static constexpr bool has_predicate = false; +}; +#endif + +namespace inner { + __STATIC_FORCEINLINE float16_t from_accumulator(const float16_t a) + { + return(a); + }; + + __STATIC_FORCEINLINE float16_t mac(const float16_t acc,const float16_t a,const float16_t b) + { + return((_Float16)acc+(_Float16)a*(_Float16)b); + }; + + +__STATIC_FORCEINLINE void accumulate(float16_t &a,const float16_t &b) +{ + a += (_Float16)b; +} + +__STATIC_FORCEINLINE float16_t mult(float16_t &a,const float16_t &b) +{ + return((_Float16)a*(_Float16)b); +} + +} + +#endif + +/*! @} */ +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/num_features/q15.hpp b/dsppp/Include/dsppp/num_features/q15.hpp new file mode 100644 index 000000000..5bd5d9fc3 --- /dev/null +++ b/dsppp/Include/dsppp/num_features/q15.hpp @@ -0,0 +1,66 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup GenericNumber + * \ingroup NUMBER + * @{ + * \addtogroup GenericQ15Number Q15 + * \ingroup GenericNumber + * @{ + */ + +template<> +struct number_traits +{ + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + typedef Q<33,30> accumulator; + static constexpr Q15 one() {return Q15::one();}; + typedef Q15 compute_type; +}; + +template +struct vector_traits::value && + !std::is_base_of::value && + !std::is_base_of::value>::type> { + typedef Q15 type; + typedef type::value_type storage_type; + + // No vector type but must still be defined + typedef bool vector; + typedef bool temp_accumulator; + typedef uint32_t predicate_t; + + + + static constexpr bool has_vector = false; + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + static constexpr bool has_predicate = false; + +}; + +namespace inner { +#if defined(ARM_MATH_MVEI) + __STATIC_FORCEINLINE Q15 from_accumulator(const Q<33,30> a) + { + //return(saturate(toFrac<15>(a))); + return(Q15((sqrshrl_sat48(a.v, -(32-15)) >> 32) & 0xffffffff)); + }; +#else + __STATIC_FORCEINLINE Q15 from_accumulator(const Q<33,30> a) + { + return(saturate(toFrac<15>(a))); + }; +#endif + + __STATIC_FORCEINLINE Q<33,30> mac(const Q<33,30> acc,const Q15 a,const Q15 b) + { + return(accumulate(acc , mult(a,b))); + }; +} + +/*! @} */ +/*! 
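+
+  Worked example (illustrative): Q15 products are accumulated without
+  intermediate saturation in the Q<33,30> type above; from_accumulator
+  then drops 15 fractional bits (30 -> 15) and saturates, so an
+  accumulated value of 0.5 comes back as 0x4000 in Q15.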
@} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/num_features/q31.hpp b/dsppp/Include/dsppp/num_features/q31.hpp new file mode 100644 index 000000000..5af4f5647 --- /dev/null +++ b/dsppp/Include/dsppp/num_features/q31.hpp @@ -0,0 +1,65 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup GenericNumber + * \ingroup NUMBER + * @{ + * \addtogroup GenericQ31Number Q31 + * \ingroup GenericNumber + * @{ + */ + +template<> +struct number_traits +{ + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + typedef Q<15,48> accumulator; + static constexpr Q31 one() {return Q31::one();}; + typedef Q31 compute_type; +}; + +template +struct vector_traits::value && + !std::is_base_of::value>::type> { + typedef Q31 type; + typedef type::value_type storage_type; + + // No vector type but must still be defined + typedef bool vector; + typedef bool temp_accumulator; + typedef uint32_t predicate_t; + + + + static constexpr bool has_vector = false; + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + static constexpr bool has_predicate = false; +}; + +namespace inner { +#if defined(ARM_MATH_MVEI) + __STATIC_FORCEINLINE Q31 from_accumulator(const Q<15,48> a) + { + return(Q31(asrl(a.v, 17))); + }; +#else + __STATIC_FORCEINLINE Q31 from_accumulator(const Q<15,48> a) + { + return(Q31(a.v >> 17)); + }; +#endif + + +__STATIC_FORCEINLINE Q<15,48> mac(const Q<15,48> acc,const Q31 a,const Q31 b) +{ + return(accumulate(acc , toFrac<48>(mult(a,b)))); +}; + + } + +/*! @} */ +/*! @} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/num_features/q7.hpp b/dsppp/Include/dsppp/num_features/q7.hpp new file mode 100644 index 000000000..e408801bd --- /dev/null +++ b/dsppp/Include/dsppp/num_features/q7.hpp @@ -0,0 +1,57 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +/** \addtogroup GenericNumber + * \ingroup NUMBER + * @{ + * \addtogroup GenericQ7Number Q7 + * \ingroup GenericNumber + * @{ + */ + +template<> +struct number_traits +{ + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + typedef Q<17,14> accumulator; + static constexpr Q7 one() {return Q7::one();}; + typedef Q7 compute_type; +}; + +template +struct vector_traits::value && + !std::is_base_of::value && + !std::is_base_of::value>::type> { + typedef Q7 type; + typedef type::value_type storage_type; + + // No vector type but must still be defined + typedef bool vector; + typedef bool temp_accumulator; + typedef uint32_t predicate_t; + + + + static constexpr bool has_vector = false; + static constexpr bool is_float = false; + static constexpr bool is_fixed = true; + static constexpr bool has_predicate = false; +}; + +namespace inner { + __STATIC_FORCEINLINE Q7 from_accumulator(const Q<17,14> a) + { + return(Q7(__SSAT(a.v >> 7, 8))); + }; + + __STATIC_FORCEINLINE Q<17,14> mac(const Q<17,14> acc,const Q7 a,const Q7 b) + { + return(accumulate(acc , mult(a,b))); + }; +} + +/*! @} */ +/*! 
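+
+  Worked example (illustrative): the Q<17,14> accumulator above keeps
+  the exact Q14 products of Q7 samples; from_accumulator shifts right
+  by 7 (14 -> 7 fractional bits) and saturates to 8 bits with __SSAT,
+  so an accumulated 0.5 becomes 0x40 in Q7.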
@} */ \ No newline at end of file diff --git a/dsppp/Include/dsppp/number.hpp b/dsppp/Include/dsppp/number.hpp new file mode 100644 index 000000000..033a65e49 --- /dev/null +++ b/dsppp/Include/dsppp/number.hpp @@ -0,0 +1,190 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#include "fixed_point.hpp" +#include + +#include "arm_math_types.h" + +#if defined(ARM_FLOAT16_SUPPORTED) +#include "arm_math_types_f16.h" +#endif + +#if defined(ARM_MATH_DSP) +#include "DSP/memory.hpp" +#endif + +namespace arm_cmsis_dsp { + +/** \addtogroup NUMBER Number datatypes + * \ingroup DSPPP + * Number datatypes expressing different properties of the numbers + * according to the architecture. + * + * Those definitions are used to write architecture independents + * algorithms. + * @{ + */ + +constexpr uint32_t maskFromShift(const uint32_t shift) +{ + return ((1<>1)); +} + +/** Properties of a scalar datatype + * + * Needs to contain two static bool : is_float and is_fixed + * + * Needs to contain a static function `one` returning the value + * + * 1 for this datatype (used to write some datatype generic + * algorithms) + */ +template +struct number_traits; + + +/* + +When vector is true we have a vector datatype +A temporary accumulator datatype and an accumulator datatype. +For most types the temporary and accumulator are the same. +For float, vector instruction mac is doing a mac per lane. +So temporay is a vector and the final accumulator is a float. + +*/ + +/** @brief Properties of a vector datatype linked to a scalar datatype + * @tparam T Type of the scalar + * @tparam arch Architecture. It is defined by the + * architecture selection code and should never be + * set by the user. + */ +template +struct vector_traits { + typedef T type; //!< Scalar datatype + typedef T storage_type; //!< Storage type (for instance for Q15 scalar the storage is int16_t) + static constexpr bool has_vector = false; //!< True if scalar type has a related vector type + static constexpr bool is_float = false; //!< True if scalar type is a float (half, float or double) + static constexpr bool is_fixed = false; //!< True if scalar type is fixed point +}; + +/** @brief Scalar properties of fixed point datatype + * @tparam M Mantissa bits (not including sign bit) + * @tparam F Fractional bits + * @tparam S Signed or unsigned + * @tparam T Storage datatype + */ +template +struct number_traits> +{ + static constexpr bool is_float = false; //!< False because scalar is not a float datatype (half, float, double) + static constexpr bool is_fixed = true; //!< True because datatype is a fixed point arithmetic one + + /** @brief Return 1 for this datatype + * + * Used for writing datatype generic algorithms + */ + static constexpr Q one() {return Q::one();}; +}; + + +namespace inner { + +/** @brief Predicate (only defined for vector architectures) + * @tparam T scalar data type + * @param v Number of loops + * @return Predicate for the given architecture + */ +template +struct vctpq { +static typename vector_traits::predicate_t mk(uint32_t v); +}; + +}; + + +/* + +vconst +vconst_tail +vadd +vsub +vmul +vacc + + +vload1 +vstore1 + +// When predicate +vctpq +vload1_z +vstore1_z + +// When predicated loop +vadd_x +vsub_x +vmul_x +vmacc_p + + +*/ + + + +// Common to all architectures +#include "num_features/double.hpp" +#include "num_features/float.hpp" +#include "num_features/half.hpp" +#include "num_features/q31.hpp" +#include "num_features/q15.hpp" +#include "num_features/q7.hpp" + +// Specific for some architecture +//#include +#include 
"DSP/num_features.hpp" +#include "Helium/num_features.hpp" +//#include + + +#include "num_features/group.hpp" + +/* + +If there is the need to tune the intrinsics depending on the +Helium variant of the architecture, somehting like that could be used. +In practice, selection is done at level of of algorithms more than +instructions where it may be simple to just use a #if to use the +right intrinsics when it is available. + +*/ +#if 0 +template +__STATIC_FORCEINLINE mve_pred16_t _vctpq(uint32_t v,Helium * = nullptr); + +template<> +__STATIC_FORCEINLINE mve_pred16_t _vctpq(uint32_t v,Helium *) +{ + return(vctp32q(v)); +}; + +template +__STATIC_FORCEINLINE mve_pred16_t vctpq(uint32_t v) +{ + return(_vctpq(v,CURRENT_ARCH)); +} + +#endif + +/*! @} */ + +} // cmsis-dsp namespace \ No newline at end of file diff --git a/dsppp/Include/dsppp/unroll.hpp b/dsppp/Include/dsppp/unroll.hpp new file mode 100644 index 000000000..b6e6693f5 --- /dev/null +++ b/dsppp/Include/dsppp/unroll.hpp @@ -0,0 +1,247 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#include +#include + +#include "common.hpp" +#include "arch.hpp" +#include +#include "number.hpp" +#include "forward.hpp" +#include "fusion.hpp" +#include "matrix.hpp" + +namespace arm_cmsis_dsp { + +/** \addtogroup UNROLLING Unrolling + * \ingroup DSPPP + * @{ + */ + +template +struct Merged +{ + + using ScalarResult = std::tuple>::Scalar...>; + using TypeOfElement = typename std::tuple_element<0,ScalarResult>::type; + + constexpr explicit Merged(const E& ... values) : vals { values ...} { } + + constexpr Merged(Merged&& other) = default; + constexpr Merged(const Merged& other) = default; + constexpr Merged& operator=(const Merged& other) = delete; + constexpr Merged& operator=(Merged&& other) = delete; + ~Merged() = default; + + constexpr vector_length_t length() const noexcept {return std::get<0>(vals).length();}; + + template + constexpr ScalarResult val_impl(const int i, const std::index_sequence) const noexcept + { + return std::tuple(std::get(vals)[i]...); + } + + constexpr ScalarResult operator[](const int i) noexcept{ + return val_impl(i,std::make_index_sequence()); + } + + constexpr ScalarResult const operator[](const int i) const noexcept{ + return val_impl(i,std::make_index_sequence()); + } + +#if defined(HAS_VECTOR) + + using Vector = std::tuple>::Scalar>::vector...>; + + template + void vector_store_impl(const index_t i,const Vector &val, const std::index_sequence) const noexcept + { + (inner::vstore1<1>((std::get(vals).ptr(i)),std::get(val)),...); + } + + void vector_store(const index_t i,const Vector &val) const noexcept + { + vector_store_impl(i,val,std::make_index_sequence()); + } + +#if defined(HAS_PREDICATED_LOOP) + template + void vector_store_tail_impl(const index_t i,const vector_length_t remaining,const Vector &val, const std::index_sequence) const noexcept + { + (inner::vstore1_z<1>((std::get(vals).ptr(i)),std::get(val),remaining,inner::vctpq::mk(remaining)),...); + } + + + void vector_store_tail(const index_t i,const vector_length_t remaining,const Vector &val) const noexcept + { + vector_store_tail_impl(i,remaining,val,std::make_index_sequence()); + } +#endif + + + template + Vector vector_op_impl(const int i, const std::index_sequence) const noexcept + { + return std::make_tuple(std::get(vals).vector_op(i)...); + } + + Vector vector_op(const index_t i) const noexcept + { + return(vector_op_impl(i,std::make_index_sequence())); + } + +#if defined(HAS_PREDICATED_LOOP) + template + Vector vector_op_tail_impl(const index_t i,const 
vector_length_t remaining, const std::index_sequence) const noexcept + { + return std::make_tuple(std::get(vals).vector_op_tail(i,remaining)...); + } + + Vector vector_op_tail(const index_t i,const vector_length_t remaining) const noexcept + { + return(vector_op_tail_impl(i,remaining,std::make_index_sequence())); + } +#endif +#endif + + template + Merged& operator=(const Merged& other) noexcept + { + eval(*this,other,std::get<0>(vals).length(),CURRENT_ARCH); + return(*this); + } + + const std::tuple vals; +}; + +template +static inline Merged<_Tp&...> +results(_Tp&... __t) noexcept {return Merged<_Tp&...>(__t...);} + + +template +struct traits> +{ + typedef std::tuple>::Scalar...> Scalar; + +#if defined(HAS_VECTOR) + typedef std::tuple>::Scalar>::vector...> Vector; +#endif +}; + +template +struct IsVector> +{ + constexpr static bool value = true; +}; + +template +struct IsDynamic> +{ + constexpr static bool value = (... && IsDynamic>::value); +}; + +template +struct ElementType> +{ + typedef std::tuple>::type...> type; +}; + +constexpr vector_length_t max_length(const vector_length_t a,const vector_length_t b) noexcept +{ + return((a>b) ? a : b); +}; + + +template +constexpr vector_length_t max_vec_length(F a,N ...b) noexcept +{ + if constexpr (sizeof...(b) == 0) + { + return(a); + } + else + { + return max_length(a,max_vec_length(b...)); + } +}; + + +template +struct StaticLength> +{ + constexpr static vector_length_t value = max_vec_length(StaticLength>::value...); +}; + + + template + auto unroll_impl(const F& func,std::index_sequence) noexcept + { + return Merged{func(Ns)...}; + }; + + template + auto unroll(const F& func) noexcept + { + return unroll_impl(func,std::make_index_sequence()); + }; + + template + constexpr static const E& constres(const E& r,const std::size_t) noexcept + { + return(r); + } + + template + auto replicate_impl(const E& expr,std::index_sequence) noexcept + { + return Merged{constres(expr,Ns)...}; + }; + + template + auto replicate(const E& expr) noexcept + { + return replicate_impl(expr,std::make_index_sequence()); + }; + + /* + + We don't want to replicate the Vector but only a reference + to the vector. So it is packed into an expr + + */ + template typename A> + auto replicate(const Vector& e) noexcept + { + //return replicate_impl(expr(e),std::make_index_sequence()); + return replicate_impl(VectorView(e),std::make_index_sequence()); + }; + + template + auto results_impl(std::array &a,std::index_sequence) noexcept + { + return std::tie(a[Ns]...); + }; + + template + auto results(std::array &a) noexcept + { + return results_impl(a,std::make_index_sequence()); + }; + + template + auto result_impl_func(const F& func,std::index_sequence) noexcept + { + return std::tie(*func(Ns)...); + }; + + template + auto results(const F& func) noexcept + { + return result_impl_func(func,std::make_index_sequence()); + }; + +/*! 
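+
+  Usage sketch (illustrative; expr_for is a placeholder for any function
+  returning an expression built from shared inputs):
+     results(ra,rb,rc) = unroll<3>([&](std::size_t k){return expr_for(k);});
+  Both sides are Merged tuples, so the three expressions are evaluated
+  in a single fused pass over the data.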
@} */ +} diff --git a/dsppp/Include/dsppp/vec.hpp b/dsppp/Include/dsppp/vec.hpp new file mode 100644 index 000000000..c388bae7d --- /dev/null +++ b/dsppp/Include/dsppp/vec.hpp @@ -0,0 +1,442 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#include +#include +#include +#include +#include "common.hpp" +#include "arch.hpp" +#include +#include "number.hpp" +#include "forward.hpp" +#include "fusion.hpp" +#include "unroll.hpp" +#include "algorithms.hpp" +#include "vector_impl.hpp" +#include "vector_view.hpp" + +namespace arm_cmsis_dsp { + +/** \addtogroup VECTOR Vectors + * \ingroup DSPPP + * @{ + */ + +template +struct VecRef; + +template +struct VecRef> +{ + typedef VectorView type; + static type ref(const Vector_Base&a){ + return(type(a)); + }; +}; + +template +struct VecRef> +{ + typedef VectorView type; + static type ref(const VectorView&a){ + return(a); + }; +}; + + +template typename A> +struct VecRef,(L<0)> +{ + + typedef VectorView type; + static VectorView ref(const Vector&a,typename std::enable_if<(L<0)>::type* = nullptr){ + return(VectorView(a)); + }; + +}; + +template typename A> +struct VecRef,(L>0)> +{ + typedef const Vector& type; + static const Vector& ref(const Vector&a,typename std::enable_if<(L>0)>::type* = nullptr){ + return(a); + }; +}; + + + +template +struct VecRef<_Binary> +{ + typedef _Binary type; + static type ref(const _Binary&a){ + return(a); + }; +}; + +template +struct VecRef<_Unary> +{ + typedef _Unary type; + static type ref(const _Unary&a){ + return(a); + }; +}; + +template +struct VecRef<_Expr> +{ + typedef Derived type; + static type ref(const _Expr&a){ + return(a.derived()); + }; +}; + +template<> +struct VecRef +{ + typedef double type; + static type ref(const double a){ + return(a); + }; +}; + +template<> +struct VecRef +{ + typedef float type; + static type ref(const float a){ + return(a); + }; +}; + +#if defined(ARM_FLOAT16_SUPPORTED) +template<> +struct VecRef +{ + typedef float16_t type; + static type ref(const float16_t a){ + return(a); + }; +}; +#endif + +template<> +struct VecRef +{ + typedef Q7 type; + static type ref(const Q7 a){ + return(a); + }; +}; + +template<> +struct VecRef +{ + typedef Q15 type; + static type ref(const Q15 a){ + return(a); + }; +}; + +template<> +struct VecRef +{ + typedef Q31 type; + static type ref(const Q31 a){ + return(a); + }; +}; + + +template +struct traits> +{ + typedef T Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits::vector Vector; +#endif +}; + + +template typename Allocator> +struct traits> +{ + typedef P Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits
<P>
::vector Vector; +#endif +}; + + +template typename Allocator> +struct traits&> +{ + typedef P Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits
<P>
::vector Vector; +#endif +}; + + + +template +struct StaticStride +{ + constexpr static std::size_t value = 1; +}; + +template +struct StaticStride> +{ + constexpr static std::size_t value = S; +}; + + +template typename Allocator> +struct IsVector> +{ + constexpr static bool value = true; +}; + +template typename Allocator> +struct IsVector&> +{ + constexpr static bool value = true; +}; + +template +struct IsVector&> +{ + constexpr static bool value = true; +}; + +template +struct IsVector> +{ + constexpr static bool value = true; +}; + +template typename Allocator> +struct ElementType> +{ + typedef P type; +}; + +template typename Allocator> +struct ElementType&> +{ + typedef P type; +}; + + +template +struct ElementType> +{ + typedef P type; +}; + +template +struct ElementType&> +{ + typedef P type; +}; + +template +struct IsVector> +{ + constexpr static bool value = true; +}; + + +template typename Allocator> +struct StaticLength> +{ + constexpr static vector_length_t value = (L<0) ? 0 : L; +}; + +template typename Allocator> +struct StaticLength&> +{ + constexpr static vector_length_t value = (L<0) ? 0 : L; +}; + + +template +struct ElementType> +{ + typedef T type; +}; + +template +struct ElementType&> +{ + typedef T type; +}; + + +template typename Allocator> +struct IsDynamic> +{ + constexpr static bool value = (L<0); +}; + +template typename Allocator> +struct IsDynamic&> +{ + constexpr static bool value = (L<0); +}; + +template +struct IsDynamic> +{ + constexpr static bool value = true; +}; + + + + +// Assume one at least is static +template +using StaticType=typename std::conditional::value,VB,VA>::type; + + + +template() || + !is_scalar()) && + SameElementType::value && + same_static_length(),bool>::type = true> +inline auto operator+(const LHS &a,const RHS &b) +{ + using Scalar = typename traits::Scalar; + using VecLHS = VecRef; + using VecRHS = VecRef; + + return(_Binary>(VecLHS::ref(a),VecRHS::ref(b),_AddOp())); +}; + +template(),bool>::type = true> +inline auto operator+(const LHS &a) +{ + using Scalar = typename traits::Scalar; + using VecLHS = VecRef; + + return(_Unary>(VecLHS::ref(a),_NoOp())); +}; + + +/* + +VectorView = VectorView must be a cheap copy of reference only. +So when we want to copy a VectorView onto another we need to +write +VectorView = expr(VectorView) or copy + +we cannot rely on the copy or move constructors. 
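+
+   Sketch (illustrative, assuming two views of the same length):
+
+      auto va = a.sub();   // view on vector a
+      auto vb = b.sub();   // view on vector b
+      va = copy(vb);       // copies the samples referenced by vb
+      // va = vb;          // would not compile: view copy-assignment is deleted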
+ +*/ +template(),bool>::type = true> +inline auto expr(const LHS &a) +{ + using Scalar = typename traits::Scalar; + using VecLHS = VecRef; + return(_Unary>(VecLHS::ref(a),_NoOp())); +}; + +template(),bool>::type = true> +inline auto copy(const LHS &a) +{ + using Scalar = typename traits::Scalar; + using VecLHS = VecRef; + return(_Unary>(VecLHS::ref(a),_NoOp())); +}; + +template() || + !is_scalar()) && + SameElementType::value && + same_static_length(),bool>::type = true> +inline auto operator-(const LHS &a,const RHS &b) +{ + using Scalar = typename traits::Scalar; + using VecLHS = VecRef; + using VecRHS = VecRef; + + return(_Binary>( + VecLHS::ref(a),VecRHS::ref(b),_SubOp())); +}; + +template(),bool>::type = true> +inline auto operator-(const LHS &a) +{ + using Scalar = typename traits::Scalar; + using VecLHS = VecRef; + + return(_Unary>(VecLHS::ref(a),_NegOp())); +}; + + +template() || + !is_scalar()) && + SameElementType::value && + same_static_length(),bool>::type = true> +inline auto operator*(const LHS &a,const RHS &b) +{ + using Scalar = typename traits::Scalar; + using VecLHS = VecRef; + using VecRHS = VecRef; + + return(_Binary>( + VecLHS::ref(a),VecRHS::ref(b),_MulOp())); +}; + + + +#if 0 +template::value && + IsVector::value && + SameElementType::value && + (same_static_length(StaticLength::value , StaticLength::value)),bool>::type = true> +inline _Expr operator+(const VA &a, + const VB &b) +{ + + return(_Add(a,b)); +}; +#endif + +/* + +Core algorithms that cannot be expressed only with high level +abstractions and need intrinsincs. + +*/ +#include "Helium/matrix_multiply.hpp" +#include "DSP/matrix_multiply.hpp" +#include "Scalar/matrix_multiply.hpp" + +/*! @} */ + +} diff --git a/dsppp/Include/dsppp/vector_impl.hpp b/dsppp/Include/dsppp/vector_impl.hpp new file mode 100644 index 000000000..6978e2db6 --- /dev/null +++ b/dsppp/Include/dsppp/vector_impl.hpp @@ -0,0 +1,576 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#include +#include +#include +#include +#include "common.hpp" +#include "arch.hpp" +#include +#include "number.hpp" +#include "forward.hpp" +#include "fusion.hpp" +#include "unroll.hpp" +#include "algorithms.hpp" + +namespace arm_cmsis_dsp { + +/** \addtogroup VECTOR + * @{ + */ + +/* + +Generic evaluators. + +*/ +#include "Scalar/basic.hpp" +#include "DSP/basic.hpp" +#include "Helium/basic.hpp" +#include "Neon/basic.hpp" + + +template +struct Vector_Base { + + typedef P element_type; + + vector_length_t length() const {return(length_);}; + + P* ptr() const {return(values_);} + P* ptr(const index_t i) const {return(&values_[i]);} + + const P* const_ptr() const {return(values_);} + const P* const_ptr(const index_t i) const {return(&values_[i]);} + + + P* begin() const {return(values_);} + P* end() const {return(values_+length_);} + + + friend std::ostream& operator<< (std::ostream& stream, const Vector_Base
<P>
& other) { + constexpr int nb = 10; + int i=0; + for(index_t k=0;k::vector; + + template::has_vector,bool>::type = true> + void vector_store(const index_t i,const Vector val) const + { + inner::vstore1<1>((typename std::remove_cv
<P>
::type*)(&values_[i]),val); + } + +#if defined(HAS_PREDICATED_LOOP) + void vector_store_tail(const index_t i,const vector_length_t remaining,const Vector val) const + { + inner::vstore1_z<1>((typename std::remove_cv
<P>
::type*)(&values_[i]),val,remaining,inner::vctpq
<P>
::mk(remaining)); + } + + Vector const vector_op_tail(const index_t i,const vector_length_t remaining) const + { + return(inner::vload1_z<1>((typename std::remove_cv
<P>
::type*)(&values_[i]),remaining,inner::vctpq
<P>
::mk(remaining))); + } +#endif + + Vector const vector_op(const index_t i) const + { + return(inner::vload1<1>((typename std::remove_cv
<P>
::type*)(&values_[i]))); + } + +#endif + + + +protected: + + //Vector_Base():length_(0),values_(nullptr){}; + Vector_Base() = delete; + + explicit Vector_Base(vector_length_t length, char *val): + length_(length), + values_(reinterpret_cast(val)){}; + + explicit Vector_Base(vector_length_t length, char *val,P init_val): + length_(length), + values_(reinterpret_cast(val)){ + _Fill(*this,init_val,length,CURRENT_ARCH); + }; + + + Vector_Base& operator=(const Vector_Base& other) + { + if ((length_ == other.length_) && (this != &other)) + { + _Fill(*this,other,other.length_,CURRENT_ARCH); + //std::memcpy(values_,other.values_,sizeof(P)*length_); + } + return(*this); + } + + // Done in derivated classes since we need + // the allocator destroy + Vector_Base& operator=(Vector_Base&& other) = delete; + + + + + vector_length_t length_; + P* values_; +}; + + +template +struct traits> +{ + typedef T Scalar; +#if defined(HAS_VECTOR) + typedef typename vector_traits::vector Vector; +#endif +}; + +/** @brief Vector template + * @tparam P Type of the scalar + * @tparam L Vector length in number of elements. + * Negative if length not known at build time. It is the default value + * @tparam Allocator Memory allocator to use. By default it is the macro `TMP_ALLOC` + */ +template typename Allocator = TMP_ALLOC> +struct Vector:Vector_Base
<P>
{ + + + //! Type of vector elements + using element_type = P; + + //! Length of the vector when known at build time. + constexpr static vector_length_t vector_size = sizeof(P)*L; + + /** + * @brief Allocate a buffer for this vector using the memory allocator + * + * @return A new memory buffer + */ + static char* allocate(){return(Allocator::allocate());}; + + /** + * @brief Construct a new vector + * + * The length is known at build time. + * + */ + Vector():Vector_Base
<P>
(L,Vector::allocate()){}; + + /** + * @brief Construct a new vector and initialize it + * + * The length is known at build time. + * + * @param init_val Initialization value + */ + explicit Vector(P init_val):Vector_Base
<P>
(L,Vector::allocate(),init_val){ + }; + + /** + * @brief Construct a new vector and initialize it with a list + * + * The length is known at build time. + * + * @param l Initialization list + */ + Vector(const std::initializer_list
<P>
&l) + :Vector_Base
<P>
(L,Vector::allocate()){ + std::memcpy(Vector_Base
<P>
::values_,l.data(),vector_size); + }; + + Vector(Vector&& other) = default; + + Vector(const Vector& other):Vector_Base
<P>
(L,Vector::allocate()) + { + eval(*this,+other,(vector_length_t)L,CURRENT_ARCH); + + //std::memcpy(Vector_Base
<P>
::values_,other.values_,vector_size); + }; + + template typename OtherAllocator> + explicit Vector(const Vector& other):Vector_Base
<P>
(L,Vector::allocate()) + { + eval(*this,+other,(vector_length_t)L,CURRENT_ARCH); + }; + + template typename OtherAllocator> + explicit Vector(const Vector& other):Vector_Base
<P>
(L,Vector::allocate()) + { + if (other.length() == Vector_Base
<P>
::length()) + { + eval(*this,+other,(vector_length_t)L,CURRENT_ARCH); + } + }; + + template + explicit Vector(const VectorView& other):Vector_Base
<P>
(L,Vector::allocate()) + { + eval(*this,+other,(vector_length_t)L,CURRENT_ARCH); + }; + + + + + template + Vector(const _Expr& other):Vector_Base
<P>
(L,Vector::allocate()) + { + eval(*this,other.derived(),(vector_length_t)L,CURRENT_ARCH); + }; + + + Vector& operator=(const Vector& other) = default; + + Vector& operator=(Vector&& other) + { + if (this != &other) + { + if (Vector_Base
<P>
::values_!=nullptr) + { + Allocator::destroy(reinterpret_cast(Vector_Base
<P>
::values_)); + } + + Vector_Base
<P>
::length_= other.length_; + Vector_Base
<P>
::values_ = other.values_; + other.values_=nullptr; + other.length_ = 0; + } + + return(*this); + } + + template + Vector& operator=(const _Expr&other) + { + eval(*this,other.derived(),(vector_length_t)L,CURRENT_ARCH); + return(*this); + } + + + template(),bool>::type = true> + Vector& operator=(const T other) + { + _Fill(*this,other,L,CURRENT_ARCH); + return(*this); + } + + + + + + template + Vector& operator +=(const _Expr& other) + { + eval(*this,*this + other.derived(),(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + Vector& operator +=(const Vector& other) + { + eval(*this,*this + other,(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + Vector& operator +=(const P other) + { + eval(*this,*this + other,(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + template + Vector& operator -=(const _Expr& other) + { + eval(*this,*this - other.derived(),(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + Vector& operator -=(const Vector& other) + { + eval(*this,*this - other,(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + Vector& operator -=(const P other) + { + eval(*this,*this - other,(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + template + Vector& operator *=(const _Expr& other) + { + eval(*this,*this * other.derived(),(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + Vector& operator *=(const Vector& other) + { + eval(*this,*this * other,(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + Vector& operator *=(const P other) + { + eval(*this,*this * other,(vector_length_t)L,CURRENT_ARCH); + return(*this); + }; + + + + + template + VectorView sub(const index_t start=0,const index_t stop=L) + { + return(VectorView(*this,start,stop)); + } + + template + const VectorView sub(const index_t start=0,const index_t stop=L) const + { + return(VectorView(*this,start,stop)); + } + + + virtual ~Vector() { + if (Vector_Base
<P>
::values_!=nullptr) + { + Allocator::destroy(reinterpret_cast(Vector_Base
<P>
::values_)); + } + } + + + + +}; + + + +template typename Allocator> +struct Vector:Vector_Base
<P>
{ + + static char* allocate(vector_length_t length){return(Allocator::allocate(sizeof(P)*length));}; + + Vector() = delete; + + explicit Vector(vector_length_t length,P init_val): + Vector_Base
<P>
(length,Vector::allocate(length),init_val){}; + + explicit Vector(vector_length_t length): + Vector_Base
<P>
(length,Vector::allocate(length)){}; + + explicit Vector(const std::initializer_list
<P>
&l) + :Vector_Base
<P>
(l.size(),Vector::allocate(l.size())){ + std::memcpy(Vector_Base
<P>
::values_,l.data(),sizeof(P)*l.size()); + }; + + template typename OtherAllocator> + explicit Vector(const Vector& other):Vector_Base
<P>
(other.length(),Vector::allocate(other.length())) + { + eval(*this,+other,Vector_Base
<P>
::length(),CURRENT_ARCH); + }; + + + Vector(const Vector& other):Vector_Base
<P>
(other.length(),Vector::allocate(other.length())) + { + eval(*this,+other,Vector_Base
<P>
::length(),CURRENT_ARCH); + + //std::memcpy(Vector_Base
<P>
::values_,other.values_,vector_size); + }; + + template + explicit Vector(const VectorView& other):Vector_Base
<P>
(other.length(),Vector::allocate(other.length())) + { + eval(*this,+other,Vector_Base
<P>
::length(),CURRENT_ARCH); + }; + + template + Vector(const _Expr& other):Vector_Base
<P>
(other.length(),Vector::allocate(other.length())) + { + eval(*this,other.derived(),Vector_Base
<P>
::length(),CURRENT_ARCH); + }; + + Vector(Vector&& other) = default; + + + + Vector& operator=(const Vector& other) = default; + + Vector& operator=(Vector&& other) + { + if (this != &other) + { + if (Vector_Base
<P>
::values_!=nullptr) + { + Allocator::destroy(reinterpret_cast(Vector_Base
<P>
::values_)); + } + + Vector_Base
<P>
::length_= other.length_; + Vector_Base
<P>
::values_ = other.values_; + other.values_=nullptr; + other.length_ = 0; + } + + return(*this); + } + + template + Vector& operator=(const _Expr&other) + { + eval(*this,other.derived(),Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + } + + template(),bool>::type = true> + Vector& operator=(const T other) + { + _Fill(*this,other,Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + } + + template + Vector& operator +=(const _Expr& other) + { + eval(*this,*this + other.derived(),Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + Vector& operator +=(const Vector& other) + { + eval(*this,*this + other,Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + Vector& operator +=(const P other) + { + eval(*this,*this + other,Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + template + Vector& operator -=(const _Expr& other) + { + eval(*this,*this - other.derived(),Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + Vector& operator -=(const Vector& other) + { + eval(*this,*this - other,Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + Vector& operator -=(const P other) + { + eval(*this,*this - other,Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + template + Vector& operator *=(const _Expr& other) + { + eval(*this,*this * other.derived(),Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + Vector& operator *=(const Vector& other) + { + eval(*this,*this * other,Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + + Vector& operator *=(const P other) + { + eval(*this,*this * other,Vector_Base
<P>
::length(),CURRENT_ARCH); + return(*this); + }; + + template + VectorView sub(const index_t start=0,const index_t stop=-1) + { + if (stop<0) + { + return(VectorView(*this,start,Vector_Base
<P>
::length())); + } + else + { + return(VectorView(*this,start,stop)); + } + } + + template + const VectorView sub(const index_t start=0,const index_t stop=-1) const + { + if (stop<0) + { + return(VectorView(*this,start,Vector_Base
<P>
::length())); + } + else + { + return(VectorView(*this,start,stop)); + } + } + + + + virtual ~Vector() { + if (Vector_Base
<P>
::values_!=nullptr) + { + Allocator::destroy(reinterpret_cast(Vector_Base
<P>
::values_)); + } + } + +}; + +/*! @} */ + +} + diff --git a/dsppp/Include/dsppp/vector_view.hpp b/dsppp/Include/dsppp/vector_view.hpp new file mode 100644 index 000000000..b3ddc9c6f --- /dev/null +++ b/dsppp/Include/dsppp/vector_view.hpp @@ -0,0 +1,449 @@ +// -*- C++ -*- +/** @file */ +#pragma once + +#include +#include +#include +#include +#include "common.hpp" +#include "arch.hpp" +#include +#include "number.hpp" +#include "forward.hpp" +#include "fusion.hpp" +#include "unroll.hpp" +#include "algorithms.hpp" +#include "vector_impl.hpp" + +namespace arm_cmsis_dsp { + +/** \addtogroup VECTOR + * @{ + */ + +template +struct VectorView +{ + + /* + + Start and stop are the position in the raw Vector_base pointer. + Stop is the first sample outside of the vector + + */ + VectorView() = delete; + + constexpr static vector_length_t compute_length(const index_t start,const index_t stop) + { + return(1+(stop-1 -start)/stride); + } + + explicit VectorView(T *v,const vector_length_t start,const vector_length_t stop): + v_(v+start),nb_samples_(compute_length(start,stop)){}; + + explicit VectorView(const Vector_Base &v): + v_(v.ptr()),nb_samples_(compute_length(0,v.length())){}; + + explicit VectorView(const Vector_Base &v,const index_t start,const index_t stop): + v_(v.ptr()+start),nb_samples_(compute_length(start,stop)){}; + + vector_length_t length() const {return(nb_samples_);}; + + + T* ptr() const {return(v_);} + T* ptr(const index_t i) const {return(&v_[i*stride]);} + + const T* const_ptr() const {return(v_);} + const T* const_ptr(const index_t i) const {return(&v_[i*stride]);} + + T& operator[](const index_t i) + { + return(v_[i*stride]); + } + + T& operator[](const index_t i) const + { + return(v_[i*stride]); + } + +#if defined(HAS_VECTOR) + using Vector = typename vector_traits::vector; + void vector_store(const index_t i,const Vector val) + { + inner::vstore1((typename std::remove_cv::type*)(&v_[i*stride]),val); + } + +#if defined(HAS_PREDICATED_LOOP) + void vector_store_tail(const index_t i,const vector_length_t remaining,const Vector val) + { + inner::vstore1_z((typename std::remove_cv::type*)(&v_[i*stride]),val,remaining,inner::vctpq::mk(remaining)); + } + + Vector const vector_op_tail(const index_t i,const vector_length_t remaining) const + { + return(inner::vload1_z((typename std::remove_cv::type*)(&v_[i*stride]),remaining,inner::vctpq::mk(remaining))); + } +#endif + + Vector const vector_op(const index_t i) const + { + return(inner::vload1((typename std::remove_cv::type*)(&v_[i*stride]))); + } +#endif + + virtual ~VectorView() {}; + + VectorView(const VectorView& other): + v_(other.v_),nb_samples_(other.nb_samples_){}; + + + VectorView(VectorView&& other) : + v_(std::move(other.v_)),nb_samples_(other.nb_samples_) + { + other.v_ = nullptr; + }; + +VectorView& operator=(const VectorView& other) = delete; +VectorView& operator=(VectorView&& other) = delete; + + + + template + VectorView& operator=(const _Expr&other) + { + eval(*this,other.derived(),length(),CURRENT_ARCH); + return(*this); + } + + + VectorView& operator=(const T val) + { + _Fill(*this,val,length(),CURRENT_ARCH); + + return(*this); + } + + template + VectorView& operator +=(const _Expr& other) + { + eval(*this,*this + other.derived(),length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator +=(const VectorView& other) + { + eval(*this,*this + other,length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator +=(const T other) + { + eval(*this,*this + other,length(),CURRENT_ARCH); + return(*this); + 
}; + + template + VectorView& operator -=(const _Expr& other) + { + eval(*this,*this - other.derived(),length(),CURRENT_ARCH); + return(*this); + }; + + + VectorView& operator -=(const VectorView& other) + { + eval(*this,*this - other,length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator -=(const T other) + { + eval(*this,*this - other,length(),CURRENT_ARCH); + return(*this); + }; + + template + VectorView& operator *=(const _Expr& other) + { + eval(*this,*this * other.derived(),length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator *=(const VectorView& other) + { + eval(*this,*this * other,length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator *=(const T other) + { + eval(*this,*this * other,length(),CURRENT_ARCH); + return(*this); + }; + + friend std::ostream& operator<< (std::ostream& stream, const VectorView& other) { + constexpr int nb = 10; + int i=0; + for(index_t k=0;k + VectorView sub(const index_t start=0,const index_t stop=-1) + { + if (stop < 0) + { + return(VectorView(v_,stride*start,stride*length())); + } + else + { + return(VectorView(v_,stride*start,stride*stop)); + } + } + + template + const VectorView sub(const index_t start=0,const index_t stop=-1) const + { + if (stop < 0) + { + return(VectorView(v_,stride*start,stride*length())); + } + else + { + return(VectorView(v_,stride*start,stride*stop)); + } + } + + +protected: + T* const v_; + const vector_length_t nb_samples_; +}; + +template +struct VectorView +{ + + /* + + Start and stop are the position in the raw Vector_base pointer. + Stop is the first sample outside of the vector + + */ + VectorView() = delete; + + vector_length_t compute_length(const index_t start,const index_t stop,const index_t stride) const + { + return(1+(stop-1 -start)/stride); + } + + explicit VectorView(T *v,const index_t start,const index_t stop,const index_t stride): + v_(v+start),nb_samples_(compute_length(start,stop,stride)),stride_(stride){}; + + explicit VectorView(const Vector_Base &v,const index_t stride): + v_(v.ptr()),nb_samples_(compute_length(0,v.length(),stride)),stride_(stride){}; + + explicit VectorView(const Vector_Base &v,const index_t start,const index_t stop,const index_t stride): + v_(v.ptr()+start),nb_samples_(compute_length(start,stop,stride)),stride_(stride){}; + + vector_length_t length() const {return(nb_samples_);}; + + + T* ptr() const {return(v_);} + T* ptr(const index_t i) const {return(&v_[i*stride_]);} + + const T* const_ptr() const {return(v_);} + const T* const_ptr(const index_t i) const {return(&v_[i*stride_]);} + + T& operator[](index_t i) + { + return(v_[i*stride_]); + } + + T& operator[](index_t i) const + { + return(v_[i*stride_]); + } + +#if defined(HAS_VECTOR) + using Vector = typename vector_traits::vector; + void vector_store(index_t i,Vector val) + { + inner::vstore1((typename std::remove_cv::type*)(&v_[i*stride_]),stride_,val); + } + +#if defined(HAS_PREDICATED_LOOP) + void vector_store_tail(index_t i,vector_length_t remaining,Vector val) + { + inner::vstore1_z((typename std::remove_cv::type*)(&v_[i*stride_]),stride_,val,remaining,inner::vctpq::mk(remaining)); + } + + Vector const vector_op_tail(index_t i,vector_length_t remaining) const + { + return(inner::vload1_z((typename std::remove_cv::type*)(&v_[i*stride_]),stride_,remaining,inner::vctpq::mk(remaining))); + } +#endif + + Vector const vector_op(index_t i) const + { + return(inner::vload1((typename std::remove_cv::type*)(&v_[i*stride_]),stride_)); + } +#endif + + virtual ~VectorView() {}; + + 
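+
+  /* Usage sketch (illustrative only; assumes the dynamic stride tag is
+     named DYNAMIC as elsewhere in the library):
+
+        Vector<float32_t> v(16);
+        VectorView<float32_t,DYNAMIC> odd(v,1,16,2); // samples 1,3,...,15
+        odd = 0.0f;                                  // writes through to v
+  */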
VectorView(const VectorView& other): + v_(other.v_),nb_samples_(other.nb_samples_),stride_(other.stride_){}; + + + VectorView(VectorView&& other) : + v_(std::move(other.v_)),nb_samples_(other.nb_samples_),stride_(other.stride_) + { + other.v_ = nullptr; + }; + +VectorView& operator=(const VectorView& other) = delete; +VectorView& operator=(VectorView&& other) = delete; + + + + template + VectorView& operator=(const _Expr&other) + { + eval(*this,other.derived(),length(),CURRENT_ARCH); + return(*this); + } + + + VectorView& operator=(const T val) + { + _Fill(*this,val,length(),CURRENT_ARCH); + + return(*this); + } + + template + VectorView& operator +=(const _Expr& other) + { + eval(*this,*this + other.derived(),length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator +=(const VectorView& other) + { + eval(*this,*this + other,length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator +=(const T other) + { + eval(*this,*this + other,length(),CURRENT_ARCH); + return(*this); + }; + + template + VectorView& operator -=(const _Expr& other) + { + eval(*this,*this - other.derived(),length(),CURRENT_ARCH); + return(*this); + }; + + + VectorView& operator -=(const VectorView& other) + { + eval(*this,*this - other,length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator -=(const T other) + { + eval(*this,*this - other,length(),CURRENT_ARCH); + return(*this); + }; + + template + VectorView& operator *=(const _Expr& other) + { + eval(*this,*this * other.derived(),length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator *=(const VectorView& other) + { + eval(*this,*this * other,length(),CURRENT_ARCH); + return(*this); + }; + + VectorView& operator *=(const T other) + { + eval(*this,*this * other,length(),CURRENT_ARCH); + return(*this); + }; + + friend std::ostream& operator<< (std::ostream& stream, const VectorView& other) { + constexpr int nb = 10; + int i=0; + for(index_t k=0;k + VectorView sub(const index_t start=0,const index_t stop=-1) + { + if (stop<0) + { + return(VectorView(v_,stride()*start,stride()*length(),stride()*S)); + } + else + { + return(VectorView(v_,stride()*start,stride()*stop,stride()*S)); + } + } + + template + const VectorView sub(const index_t start=0,const index_t stop=-1) const + { + if (stop<0) + { + return(VectorView(v_,stride()*start,stride()*length(),stride()*S)); + } + else + { + return(VectorView(v_,stride()*start,stride()*stop,stride()*S)); + } + } + +protected: + T* const v_; + const vector_length_t nb_samples_; + const index_t stride_; +}; + +/*! @} */ + +} + diff --git a/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct b/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct new file mode 100644 index 000000000..0f499b2cc --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct @@ -0,0 +1,80 @@ +#! armclang -E --target=arm-arm-none-eabi -mcpu=cortex-m0+ -xc +; command above MUST be in first line (no comment above!) 
+ +/* +;-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- +*/ + +/*--------------------- Flash Configuration ---------------------------------- +; Flash Configuration +; Flash Base Address <0x0-0xFFFFFFFF:8> +; Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __ROM_BASE 0x00000000 +#define __ROM_SIZE 0x00080000 + +/*--------------------- Embedded RAM Configuration --------------------------- +; RAM Configuration +; RAM Base Address <0x0-0xFFFFFFFF:8> +; RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __RAM_BASE 0x20000000 +#define __RAM_SIZE 0x00040000 + +/*--------------------- Stack / Heap Configuration --------------------------- +; Stack / Heap Configuration +; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __STACK_SIZE 0x00000200 +#define __HEAP_SIZE 0x00000C00 + +/* +;------------- <<< end of configuration section >>> --------------------------- +*/ + + +/*---------------------------------------------------------------------------- + User Stack & Heap boundary definition + *----------------------------------------------------------------------------*/ +#define __STACK_TOP (__RAM_BASE + __RAM_SIZE) /* starts at end of RAM */ +#define __HEAP_BASE (AlignExpr(+0, 8)) /* starts after RW_RAM section, 8 byte aligned */ + + +/*---------------------------------------------------------------------------- + Scatter File Definitions definition + *----------------------------------------------------------------------------*/ +#define __RO_BASE __ROM_BASE +#define __RO_SIZE __ROM_SIZE + +#define __RW_BASE __RAM_BASE +#define __RW_SIZE (__RAM_SIZE - __STACK_SIZE - __HEAP_SIZE) + + +LR_ROM __RO_BASE __RO_SIZE { ; load region size_region + ER_ROM __RO_BASE __RO_SIZE { ; load address = execution address + *.o (RESET, +First) + *(InRoot$$Sections) + .ANY (+RO) + .ANY (+XO) + } + + RW_NOINIT __RW_BASE UNINIT __RW_SIZE { + *(.bss.noinit) + } + + RW_RAM AlignExpr(+0, 8) (__RW_SIZE - AlignExpr(ImageLength(RW_NOINIT), 8)) { + *(+RW +ZI) + } + +#if __HEAP_SIZE > 0 + ARM_LIB_HEAP __HEAP_BASE EMPTY __HEAP_SIZE { ; Reserve empty region for heap + } +#endif + + ARM_LIB_STACK __STACK_TOP EMPTY -__STACK_SIZE { ; Reserve empty region for stack + } +} diff --git a/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct.base@1.0.0 b/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct.base@1.0.0 new file mode 100644 index 000000000..0f499b2cc --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_ac6.sct.base@1.0.0 @@ -0,0 +1,80 @@ +#! armclang -E --target=arm-arm-none-eabi -mcpu=cortex-m0+ -xc +; command above MUST be in first line (no comment above!) 
+ +/* +;-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- +*/ + +/*--------------------- Flash Configuration ---------------------------------- +; Flash Configuration +; Flash Base Address <0x0-0xFFFFFFFF:8> +; Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __ROM_BASE 0x00000000 +#define __ROM_SIZE 0x00080000 + +/*--------------------- Embedded RAM Configuration --------------------------- +; RAM Configuration +; RAM Base Address <0x0-0xFFFFFFFF:8> +; RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __RAM_BASE 0x20000000 +#define __RAM_SIZE 0x00040000 + +/*--------------------- Stack / Heap Configuration --------------------------- +; Stack / Heap Configuration +; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __STACK_SIZE 0x00000200 +#define __HEAP_SIZE 0x00000C00 + +/* +;------------- <<< end of configuration section >>> --------------------------- +*/ + + +/*---------------------------------------------------------------------------- + User Stack & Heap boundary definition + *----------------------------------------------------------------------------*/ +#define __STACK_TOP (__RAM_BASE + __RAM_SIZE) /* starts at end of RAM */ +#define __HEAP_BASE (AlignExpr(+0, 8)) /* starts after RW_RAM section, 8 byte aligned */ + + +/*---------------------------------------------------------------------------- + Scatter File Definitions definition + *----------------------------------------------------------------------------*/ +#define __RO_BASE __ROM_BASE +#define __RO_SIZE __ROM_SIZE + +#define __RW_BASE __RAM_BASE +#define __RW_SIZE (__RAM_SIZE - __STACK_SIZE - __HEAP_SIZE) + + +LR_ROM __RO_BASE __RO_SIZE { ; load region size_region + ER_ROM __RO_BASE __RO_SIZE { ; load address = execution address + *.o (RESET, +First) + *(InRoot$$Sections) + .ANY (+RO) + .ANY (+XO) + } + + RW_NOINIT __RW_BASE UNINIT __RW_SIZE { + *(.bss.noinit) + } + + RW_RAM AlignExpr(+0, 8) (__RW_SIZE - AlignExpr(ImageLength(RW_NOINIT), 8)) { + *(+RW +ZI) + } + +#if __HEAP_SIZE > 0 + ARM_LIB_HEAP __HEAP_BASE EMPTY __HEAP_SIZE { ; Reserve empty region for heap + } +#endif + + ARM_LIB_STACK __STACK_TOP EMPTY -__STACK_SIZE { ; Reserve empty region for stack + } +} diff --git a/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld b/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld new file mode 100644 index 000000000..93ed813c8 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld @@ -0,0 +1,263 @@ +/* + *-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- + */ + +/*---------------------- Flash Configuration ---------------------------------- + Flash Configuration + Flash Base Address <0x0-0xFFFFFFFF:8> + Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__ROM_BASE = 0x00000000; +__ROM_SIZE = 0x00040000; + +/*--------------------- Embedded RAM Configuration ---------------------------- + RAM Configuration + RAM Base Address <0x0-0xFFFFFFFF:8> + RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__RAM_BASE = 0x20000000; +__RAM_SIZE = 0x00020000; + +/*--------------------- Stack / Heap Configuration ---------------------------- + Stack / Heap 
Configuration + Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> + Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__STACK_SIZE = 0x00000400; +__HEAP_SIZE = 0x00000C00; + +/* + *-------------------- <<< end of configuration section >>> ------------------- + */ + +MEMORY +{ + FLASH (rx) : ORIGIN = __ROM_BASE, LENGTH = __ROM_SIZE + RAM (rwx) : ORIGIN = __RAM_BASE, LENGTH = __RAM_SIZE +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > FLASH + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > FLASH + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. + */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM AT > FLASH + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. 
+ */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM2 AT > FLASH +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM AT > RAM + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM2 AT > RAM2 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM + + .stack (ORIGIN(RAM) + LENGTH(RAM) - __STACK_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM + PROVIDE(__stack = __StackTop); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld.base@2.2.0 b/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld.base@2.2.0 new file mode 100644 index 000000000..93ed813c8 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/ARMCM0plus_gcc.ld.base@2.2.0 @@ -0,0 +1,263 @@ +/* + *-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- + */ + +/*---------------------- Flash Configuration ---------------------------------- + Flash Configuration + Flash Base Address <0x0-0xFFFFFFFF:8> + Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__ROM_BASE = 0x00000000; +__ROM_SIZE = 0x00040000; + +/*--------------------- Embedded RAM Configuration ---------------------------- + RAM Configuration + RAM Base Address <0x0-0xFFFFFFFF:8> + RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__RAM_BASE = 0x20000000; +__RAM_SIZE = 0x00020000; + +/*--------------------- Stack / Heap Configuration ---------------------------- + Stack / Heap Configuration + Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> + Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__STACK_SIZE = 0x00000400; +__HEAP_SIZE = 0x00000C00; + +/* + *-------------------- <<< end of configuration section >>> ------------------- + */ + +MEMORY +{ + FLASH (rx) : ORIGIN = __ROM_BASE, LENGTH = __ROM_SIZE + RAM (rwx) : ORIGIN = __RAM_BASE, LENGTH = __RAM_SIZE +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. 
+ * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > FLASH + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > FLASH + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. + */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM AT > FLASH + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. + */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM2 AT > FLASH +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM AT > RAM + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . 
= ALIGN(4); + __bss2_end__ = .; + } > RAM2 AT > RAM2 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM + + .stack (ORIGIN(RAM) + LENGTH(RAM) - __STACK_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM + PROVIDE(__stack = __StackTop); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/RTE/Device/ARMCM0P/ac6_linker_script.sct b/dsppp/RTE/Device/ARMCM0P/ac6_linker_script.sct new file mode 100644 index 000000000..4d6e579d0 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/ac6_linker_script.sct @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/*---------------------------------------------------------------------------- + Scatter File Definitions definition + *----------------------------------------------------------------------------*/ + +LR_ROM0 __ROM0_BASE __ROM0_SIZE { + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + ER_CMSE_VENEER __ROM0_BASE+__ROM0_SIZE -__ROM0_SIZE { + *(Veneer$$CMSE) + } + #define ER_CMSE_VENEER_SIZE AlignExpr(ImageLength(ER_CMSE_VENEER), 8) +#else + #define ER_CMSE_VENEER_SIZE 0 +#endif + + ER_ROM0 __ROM0_BASE (__ROM0_SIZE - ER_CMSE_VENEER_SIZE) { + *.o (RESET, +First) + *(InRoot$$Sections) + *(+RO +XO) + } + + RW_NOINIT __RAM0_BASE UNINIT (__RAM0_SIZE - __HEAP_SIZE - __STACK_SIZE) { + *(.bss.noinit) + } + + RW_RAM0 AlignExpr(+0, 8) (__RAM0_SIZE - __HEAP_SIZE - __STACK_SIZE - AlignExpr(ImageLength(RW_NOINIT), 8)) { + *(+RW +ZI) + } + +#if __HEAP_SIZE > 0 + ARM_LIB_HEAP (AlignExpr(+0, 8)) EMPTY __HEAP_SIZE { ; Reserve empty region for heap + } +#endif + + ARM_LIB_STACK (__RAM0_BASE + __RAM0_SIZE - __STACKSEAL_SIZE) EMPTY -__STACK_SIZE { ; Reserve empty region for stack + } + +#if __STACKSEAL_SIZE > 0 + STACKSEAL +0 EMPTY 8 { ; Reserve empty region for stack seal immediately after stack + } +#endif + +#if __RAM1_SIZE > 0 + RW_RAM1 __RAM1_BASE __RAM1_SIZE { + .ANY (+RW +ZI) + } +#endif + +#if __RAM2_SIZE > 0 + RW_RAM2 __RAM2_BASE __RAM2_SIZE { + .ANY (+RW +ZI) + } +#endif + +#if __RAM3_SIZE > 0 + RW_RAM3 __RAM3_BASE __RAM3_SIZE { + .ANY (+RW +ZI) + } +#endif +} + +#if __ROM1_SIZE > 0 +LR_ROM1 __ROM1_BASE __ROM1_SIZE { + ER_ROM1 +0 __ROM1_SIZE { + .ANY (+RO +XO) + } +} +#endif + +#if __ROM2_SIZE > 0 +LR_ROM2 __ROM2_BASE __ROM2_SIZE { + ER_ROM2 +0 __ROM2_SIZE { + .ANY (+RO +XO) + } +} +#endif + +#if __ROM3_SIZE > 0 +LR_ROM3 
__ROM3_BASE __ROM3_SIZE { + ER_ROM3 +0 __ROM3_SIZE { + .ANY (+RO +XO) + } +} +#endif diff --git a/dsppp/RTE/Device/ARMCM0P/clang_linker_script.ld b/dsppp/RTE/Device/ARMCM0P/clang_linker_script.ld new file mode 100644 index 000000000..40f955c16 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/clang_linker_script.ld @@ -0,0 +1,353 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright © 2019 Keith Packard + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx!w) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx!w) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx!w) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx!w) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (w!rx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (w!rx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (w!rx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (w!rx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +ENTRY(Reset_Handler) + +PHDRS +{ + text PT_LOAD; + ram PT_LOAD; + ram_init PT_LOAD; + tls PT_TLS; +} + +SECTIONS +{ + .init : { + KEEP (*(.vectors)) + KEEP (*(.text.init.enter)) + KEEP (*(.data.init.enter)) + KEEP (*(SORT_BY_NAME(.init) SORT_BY_NAME(.init.*))) + } >ROM0 AT>ROM0 :text + + .text : { + + /* code */ + *(.text.unlikely .text.unlikely.*) + *(.text.startup .text.startup.*) + *(.text .text.* .opd .opd.*) + *(.gnu.linkonce.t.*) + KEEP (*(.fini .fini.*)) + __text_end = .; + + PROVIDE (__etext = __text_end); + PROVIDE (_etext = __text_end); + PROVIDE (etext = __text_end); + + /* read-only data */ + *(.rdata) + *(.rodata .rodata.*) + *(.gnu.linkonce.r.*) + + *(.srodata.cst16) + *(.srodata.cst8) + *(.srodata.cst4) + *(.srodata.cst2) + *(.srodata .srodata.*) + *(.data.rel.ro .data.rel.ro.*) + *(.got .got.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + /* lists of constructors and destructors */ + PROVIDE_HIDDEN ( __preinit_array_start = . ); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN ( __preinit_array_end = . ); + + PROVIDE_HIDDEN ( __init_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array .ctors)) + PROVIDE_HIDDEN ( __init_array_end = . ); + + PROVIDE_HIDDEN ( __fini_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array .dtors)) + PROVIDE_HIDDEN ( __fini_array_end = . ); + + } >ROM0 AT>ROM0 :text + + .toc : { + *(.toc .toc.*) + } >ROM0 AT>ROM0 :text + + /* additional sections when compiling with C++ exception support */ + + .except_ordered : { + *(.gcc_except_table *.gcc_except_table.*) + KEEP (*(.eh_frame .eh_frame.*)) + *(.ARM.extab* .gnu.linkonce.armextab.*) + } >ROM0 AT>ROM0 :text + + .except_unordered : { + . = ALIGN(8); + + PROVIDE(__exidx_start = .); + *(.ARM.exidx*) + PROVIDE(__exidx_end = .); + } >ROM0 AT>ROM0 :text + + + /* + * Data values which are preserved across reset + */ + .preserve (NOLOAD) : { + PROVIDE(__preserve_start__ = .); + KEEP(*(SORT_BY_NAME(.preserve.*))) + KEEP(*(.preserve)) + PROVIDE(__preserve_end__ = .); + } >RAM0 AT>RAM0 :ram + + .data : { + *(.data .data.*) + *(.gnu.linkonce.d.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + PROVIDE( __global_pointer$ = . 
+ 0x800 ); + *(.sdata .sdata.* .sdata2.*) + *(.gnu.linkonce.s.*) + } >RAM0 AT>ROM0 :ram_init + PROVIDE(__data_start = ADDR(.data)); + PROVIDE(__data_source = LOADADDR(.data)); + + /* Thread local initialized data. This gets + * space allocated as it is expected to be placed + * in ram to be used as a template for TLS data blocks + * allocated at runtime. We're slightly abusing that + * by placing the data in flash where it will be copied + * into the allocate ram addresses by the existing + * data initialization code in crt0 + */ + .tdata : { + *(.tdata .tdata.* .gnu.linkonce.td.*) + PROVIDE(__data_end = .); + PROVIDE(__tdata_end = .); + } >RAM0 AT>ROM0 :tls :ram_init + PROVIDE( __tls_base = ADDR(.tdata)); + PROVIDE( __tdata_start = ADDR(.tdata)); + PROVIDE( __tdata_source = LOADADDR(.tdata) ); + PROVIDE( __tdata_source_end = LOADADDR(.tdata) + SIZEOF(.tdata) ); + PROVIDE( __data_source_end = __tdata_source_end ); + PROVIDE( __tdata_size = SIZEOF(.tdata) ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata),ALIGNOF(.tbss)) ); + + PROVIDE( __edata = __data_end ); + PROVIDE( _edata = __data_end ); + PROVIDE( edata = __data_end ); + PROVIDE( __data_size = __data_end - __data_start ); + PROVIDE( __data_source_size = __data_source_end - __data_source ); + + .tbss (NOLOAD) : { + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + PROVIDE( __tls_end = . ); + PROVIDE( __tbss_end = . ); + } >RAM0 AT>RAM0 :tls :ram + PROVIDE( __bss_start = ADDR(.tbss)); + PROVIDE( __tbss_start = ADDR(.tbss)); + PROVIDE( __tbss_offset = ADDR(.tbss) - ADDR(.tdata) ); + PROVIDE( __tbss_size = SIZEOF(.tbss) ); + PROVIDE( __tls_size = __tls_end - __tls_base ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + PROVIDE( __arm64_tls_tcb_offset = MAX(16, __tls_align) ); + + /* + * The linker special cases .tbss segments which are + * identified as segments which are not loaded and are + * thread_local. + * + * For these segments, the linker does not advance 'dot' + * across them. We actually need memory allocated for tbss, + * so we create a special segment here just to make room + */ + /* + .tbss_space (NOLOAD) : { + . = ADDR(.tbss); + . = . + SIZEOF(.tbss); + } >RAM0 AT>RAM0 :ram + */ + + .bss (NOLOAD) : { + *(.sbss*) + *(.gnu.linkonce.sb.*) + *(.bss .bss.*) + *(.gnu.linkonce.b.*) + *(COMMON) + + /* Align the heap */ + . = ALIGN(8); + __bss_end = .; + } >RAM0 AT>RAM0 :ram + PROVIDE( __non_tls_bss_start = ADDR(.bss) ); + PROVIDE( __end = __bss_end ); + PROVIDE( _end = __bss_end ); + PROVIDE( end = __bss_end ); + PROVIDE( __bss_size = __bss_end - __bss_start ); + + /* Make the rest of memory available for heap storage */ + PROVIDE (__heap_start = __end); +#ifdef __HEAP_SIZE + PROVIDE (__heap_end = __heap_start + __HEAP_SIZE); + PROVIDE (__heap_size = __HEAP_SIZE); +#else + PROVIDE (__heap_end = __stack - __STACK_SIZE); + PROVIDE (__heap_size = __heap_end - __heap_start); +#endif + .heap (NOLOAD) : { + . += __heap_size; + } >RAM0 :ram + + /* Define a stack region to make sure it fits in memory */ + PROVIDE(__stack = ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE); + PROVIDE(__stack_limit = __stack - __STACK_SIZE); + .stack (__stack_limit) (NOLOAD) : { + . += __STACK_SIZE; + } >RAM0 :ram + +#if __STACKSEAL_SIZE > 0 + PROVIDE(__stack_seal = __stack) + .stackseal (__stack) (NOLOAD) : + { + . 
+= __STACKSEAL_SIZE; + } >RAM0 :ram +#endif + + /* Throw away C++ exception handling information */ + + /* + + /DISCARD/ : { + *(.note .note.*) + *(.eh_frame .eh_frame.*) + *(.ARM.extab* .gnu.linkonce.armextab.*) + *(.ARM.exidx*) + } + + */ + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1. */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions. */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2. */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2. */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions. */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3. */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF 5. */ + .debug_addr 0 : { *(.debug_addr) } + .debug_line_str 0 : { *(.debug_line_str) } + .debug_loclists 0 : { *(.debug_loclists) } + .debug_macro 0 : { *(.debug_macro) } + .debug_names 0 : { *(.debug_names) } + .debug_rnglists 0 : { *(.debug_rnglists) } + .debug_str_offsets 0 : { *(.debug_str_offsets) } + .debug_sup 0 : { *(.debug_sup) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } +} +/* + * Check that sections that are copied from flash to RAM have matching + * padding, so that a single memcpy() of __data_size copies the correct bytes. + */ +ASSERT( __data_size == __data_source_size, + "ERROR: .data/.tdata flash size does not match RAM size"); diff --git a/dsppp/RTE/Device/ARMCM0P/gcc_linker_script.ld b/dsppp/RTE/Device/ARMCM0P/gcc_linker_script.ld new file mode 100644 index 000000000..a018e5d4e --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/gcc_linker_script.ld @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (rwx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (rwx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (rwx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (rwx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > ROM0 + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + .gnu.sgstubs : + { + . = ALIGN(32); + } > ROM0 +#endif + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > ROM0 + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > ROM0 + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > ROM0 + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > ROM0 + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. 
+ */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM0 AT > ROM0 + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. + */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM1 AT > ROM0 +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM0 AT > RAM0 + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM1 AT > RAM1 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM0 + + .stack (ORIGIN(RAM0) + LENGTH(RAM0) - __STACK_SIZE - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM0 + PROVIDE(__stack = __StackTop); + +#if __STACKSEAL_SIZE > 0 + .stackseal (ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackSeal = .; + . = . + 8; + . = ALIGN(8); + } > RAM0 +#endif + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/RTE/Device/ARMCM0P/regions_ARMCM0P.h b/dsppp/RTE/Device/ARMCM0P/regions_ARMCM0P.h new file mode 100644 index 000000000..c9b457cbc --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/regions_ARMCM0P.h @@ -0,0 +1,60 @@ +#ifndef REGIONS_ARMCM0P_H +#define REGIONS_ARMCM0P_H + + +//-------- <<< Use Configuration Wizard in Context Menu >>> -------------------- + +// Device pack: ARM::Cortex_DFP@1.0.0 +// Device pack used to generate this file + +// ROM Configuration +// ======================= +// ROM=<__ROM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x00000000 +#define __ROM0_BASE 0x00000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00040000 +#define __ROM0_SIZE 0x00040000 +// Default region +// Enables memory region globally for the application. +#define __ROM0_DEFAULT 1 +// Startup +// Selects region to be used for startup code. +#define __ROM0_STARTUP 1 +// + +// + +// RAM Configuration +// ======================= +// RAM=<__RAM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x20000000 +#define __RAM0_BASE 0x20000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. 
+// Default: 0x00020000 +#define __RAM0_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM0_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM0_NOINIT 0 +// + +// + +// Stack / Heap Configuration +// Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +// Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +#define __STACK_SIZE 0x00000200 +#define __HEAP_SIZE 0x00000C00 +// + + +#endif /* REGIONS_ARMCM0P_H */ diff --git a/dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c b/dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c new file mode 100644 index 000000000..25b202457 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c @@ -0,0 +1,146 @@ +/****************************************************************************** + * @file startup_ARMCM0plus.c + * @brief CMSIS-Core(M) Device Startup File for a Cortex-M0+ Device + * @version V3.0.0 + * @date 06. April 2023 + ******************************************************************************/ +/* + * Copyright (c) 2009-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined (ARMCM0P) + #include "ARMCM0plus.h" +#else + #error device not specified! 
+#endif + +/*---------------------------------------------------------------------------- + External References + *----------------------------------------------------------------------------*/ +extern uint32_t __INITIAL_SP; + +extern __NO_RETURN void __PROGRAM_START(void); + +/*---------------------------------------------------------------------------- + Internal References + *----------------------------------------------------------------------------*/ +__NO_RETURN void Reset_Handler (void); + void Default_Handler(void); + +/*---------------------------------------------------------------------------- + Exception / Interrupt Handler + *----------------------------------------------------------------------------*/ +/* Exceptions */ +void NMI_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void HardFault_Handler (void) __attribute__ ((weak)); +void SVC_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void PendSV_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void SysTick_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); + +void Interrupt0_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt1_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt2_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt3_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt4_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt5_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt6_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt7_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt8_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt9_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); + + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +extern const VECTOR_TABLE_Type __VECTOR_TABLE[48]; + const VECTOR_TABLE_Type __VECTOR_TABLE[48] __VECTOR_TABLE_ATTRIBUTE = { + (VECTOR_TABLE_Type)(&__INITIAL_SP), /* Initial Stack Pointer */ + Reset_Handler, /* Reset Handler */ + NMI_Handler, /* -14 NMI Handler */ + HardFault_Handler, /* -13 Hard Fault Handler */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + SVC_Handler, /* -5 SVCall Handler */ + 0, /* Reserved */ + 0, /* Reserved */ + PendSV_Handler, /* -2 PendSV Handler */ + SysTick_Handler, /* -1 SysTick Handler */ + + /* Interrupts */ + Interrupt0_Handler, /* 0 Interrupt 0 */ + Interrupt1_Handler, /* 1 Interrupt 1 */ + Interrupt2_Handler, /* 2 Interrupt 2 */ + Interrupt3_Handler, /* 3 Interrupt 3 */ + Interrupt4_Handler, /* 4 Interrupt 4 */ + Interrupt5_Handler, /* 5 Interrupt 5 */ + Interrupt6_Handler, /* 6 Interrupt 6 */ + Interrupt7_Handler, /* 7 Interrupt 7 */ + Interrupt8_Handler, /* 8 Interrupt 8 */ + Interrupt9_Handler /* 9 Interrupt 9 */ + /* Interrupts 10..31 are left out */ +}; + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic pop +#endif + +/*---------------------------------------------------------------------------- + Reset Handler called on controller reset + 
*----------------------------------------------------------------------------*/ +__NO_RETURN void Reset_Handler(void) +{ + SystemInit(); /* CMSIS System Initialization */ + __PROGRAM_START(); /* Enter PreMain (C library entry point) */ +} + + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wmissing-noreturn" +#endif + +/*---------------------------------------------------------------------------- + Hard Fault Handler + *----------------------------------------------------------------------------*/ +void HardFault_Handler(void) +{ + while(1); +} + +/*---------------------------------------------------------------------------- + Default Handler for Exceptions / Interrupts + *----------------------------------------------------------------------------*/ +void Default_Handler(void) +{ + while(1); +} + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang diagnostic pop +#endif + diff --git a/dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c.base@3.0.0 b/dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c.base@3.0.0 new file mode 100644 index 000000000..25b202457 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/startup_ARMCM0plus.c.base@3.0.0 @@ -0,0 +1,146 @@ +/****************************************************************************** + * @file startup_ARMCM0plus.c + * @brief CMSIS-Core(M) Device Startup File for a Cortex-M0+ Device + * @version V3.0.0 + * @date 06. April 2023 + ******************************************************************************/ +/* + * Copyright (c) 2009-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined (ARMCM0P) + #include "ARMCM0plus.h" +#else + #error device not specified! 
+#endif + +/*---------------------------------------------------------------------------- + External References + *----------------------------------------------------------------------------*/ +extern uint32_t __INITIAL_SP; + +extern __NO_RETURN void __PROGRAM_START(void); + +/*---------------------------------------------------------------------------- + Internal References + *----------------------------------------------------------------------------*/ +__NO_RETURN void Reset_Handler (void); + void Default_Handler(void); + +/*---------------------------------------------------------------------------- + Exception / Interrupt Handler + *----------------------------------------------------------------------------*/ +/* Exceptions */ +void NMI_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void HardFault_Handler (void) __attribute__ ((weak)); +void SVC_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void PendSV_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void SysTick_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); + +void Interrupt0_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt1_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt2_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt3_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt4_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt5_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt6_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt7_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt8_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt9_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); + + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +extern const VECTOR_TABLE_Type __VECTOR_TABLE[48]; + const VECTOR_TABLE_Type __VECTOR_TABLE[48] __VECTOR_TABLE_ATTRIBUTE = { + (VECTOR_TABLE_Type)(&__INITIAL_SP), /* Initial Stack Pointer */ + Reset_Handler, /* Reset Handler */ + NMI_Handler, /* -14 NMI Handler */ + HardFault_Handler, /* -13 Hard Fault Handler */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + SVC_Handler, /* -5 SVCall Handler */ + 0, /* Reserved */ + 0, /* Reserved */ + PendSV_Handler, /* -2 PendSV Handler */ + SysTick_Handler, /* -1 SysTick Handler */ + + /* Interrupts */ + Interrupt0_Handler, /* 0 Interrupt 0 */ + Interrupt1_Handler, /* 1 Interrupt 1 */ + Interrupt2_Handler, /* 2 Interrupt 2 */ + Interrupt3_Handler, /* 3 Interrupt 3 */ + Interrupt4_Handler, /* 4 Interrupt 4 */ + Interrupt5_Handler, /* 5 Interrupt 5 */ + Interrupt6_Handler, /* 6 Interrupt 6 */ + Interrupt7_Handler, /* 7 Interrupt 7 */ + Interrupt8_Handler, /* 8 Interrupt 8 */ + Interrupt9_Handler /* 9 Interrupt 9 */ + /* Interrupts 10..31 are left out */ +}; + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic pop +#endif + +/*---------------------------------------------------------------------------- + Reset Handler called on controller reset + 
*----------------------------------------------------------------------------*/ +__NO_RETURN void Reset_Handler(void) +{ + SystemInit(); /* CMSIS System Initialization */ + __PROGRAM_START(); /* Enter PreMain (C library entry point) */ +} + + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wmissing-noreturn" +#endif + +/*---------------------------------------------------------------------------- + Hard Fault Handler + *----------------------------------------------------------------------------*/ +void HardFault_Handler(void) +{ + while(1); +} + +/*---------------------------------------------------------------------------- + Default Handler for Exceptions / Interrupts + *----------------------------------------------------------------------------*/ +void Default_Handler(void) +{ + while(1); +} + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang diagnostic pop +#endif + diff --git a/dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c b/dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c new file mode 100644 index 000000000..164d16da0 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c @@ -0,0 +1,69 @@ +/**************************************************************************//** + * @file system_ARMCM0plus.c + * @brief CMSIS Device System Source File for + * ARMCM0plus Device + * @version V2.0.0 + * @date 06. April 2023 + ******************************************************************************/ +/* + * Copyright (c) 2009-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined (ARMCM0P) + #include "ARMCM0plus.h" +#else + #error device not specified! 
+#endif + +/*---------------------------------------------------------------------------- + Define clocks + *----------------------------------------------------------------------------*/ +#define XTAL (50000000UL) /* Oscillator frequency */ + +#define SYSTEM_CLOCK (XTAL / 2U) + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ +extern const VECTOR_TABLE_Type __VECTOR_TABLE[48]; + +/*---------------------------------------------------------------------------- + System Core Clock Variable + *----------------------------------------------------------------------------*/ +uint32_t SystemCoreClock = SYSTEM_CLOCK; /* System Core Clock Frequency */ + + +/*---------------------------------------------------------------------------- + System Core Clock update function + *----------------------------------------------------------------------------*/ +void SystemCoreClockUpdate (void) +{ + SystemCoreClock = SYSTEM_CLOCK; +} + +/*---------------------------------------------------------------------------- + System initialization function + *----------------------------------------------------------------------------*/ +void SystemInit (void) +{ + +#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1U) + SCB->VTOR = (uint32_t) &(__VECTOR_TABLE[0]); +#endif + + SystemCoreClock = SYSTEM_CLOCK; +} diff --git a/dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c.base@2.0.0 b/dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c.base@2.0.0 new file mode 100644 index 000000000..164d16da0 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM0P/system_ARMCM0plus.c.base@2.0.0 @@ -0,0 +1,69 @@ +/**************************************************************************//** + * @file system_ARMCM0plus.c + * @brief CMSIS Device System Source File for + * ARMCM0plus Device + * @version V2.0.0 + * @date 06. April 2023 + ******************************************************************************/ +/* + * Copyright (c) 2009-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined (ARMCM0P) + #include "ARMCM0plus.h" +#else + #error device not specified! 
+#endif + +/*---------------------------------------------------------------------------- + Define clocks + *----------------------------------------------------------------------------*/ +#define XTAL (50000000UL) /* Oscillator frequency */ + +#define SYSTEM_CLOCK (XTAL / 2U) + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ +extern const VECTOR_TABLE_Type __VECTOR_TABLE[48]; + +/*---------------------------------------------------------------------------- + System Core Clock Variable + *----------------------------------------------------------------------------*/ +uint32_t SystemCoreClock = SYSTEM_CLOCK; /* System Core Clock Frequency */ + + +/*---------------------------------------------------------------------------- + System Core Clock update function + *----------------------------------------------------------------------------*/ +void SystemCoreClockUpdate (void) +{ + SystemCoreClock = SYSTEM_CLOCK; +} + +/*---------------------------------------------------------------------------- + System initialization function + *----------------------------------------------------------------------------*/ +void SystemInit (void) +{ + +#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1U) + SCB->VTOR = (uint32_t) &(__VECTOR_TABLE[0]); +#endif + + SystemCoreClock = SYSTEM_CLOCK; +} diff --git a/dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct b/dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct new file mode 100644 index 000000000..eb67b5fe6 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct @@ -0,0 +1,80 @@ +#! armclang -E --target=arm-arm-none-eabi -mcpu=cortex-m4 -xc +; command above MUST be in first line (no comment above!) 
+ +/* +;-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- +*/ + +/*--------------------- Flash Configuration ---------------------------------- +; Flash Configuration +; Flash Base Address <0x0-0xFFFFFFFF:8> +; Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __ROM_BASE 0x00000000 +#define __ROM_SIZE 0x00080000 + +/*--------------------- Embedded RAM Configuration --------------------------- +; RAM Configuration +; RAM Base Address <0x0-0xFFFFFFFF:8> +; RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __RAM_BASE 0x20000000 +#define __RAM_SIZE 0x00040000 + +/*--------------------- Stack / Heap Configuration --------------------------- +; Stack / Heap Configuration +; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __STACK_SIZE 0x00000200 +#define __HEAP_SIZE 0x00000C00 + +/* +;------------- <<< end of configuration section >>> --------------------------- +*/ + + +/*---------------------------------------------------------------------------- + User Stack & Heap boundary definition + *----------------------------------------------------------------------------*/ +#define __STACK_TOP (__RAM_BASE + __RAM_SIZE) /* starts at end of RAM */ +#define __HEAP_BASE (AlignExpr(+0, 8)) /* starts after RW_RAM section, 8 byte aligned */ + + +/*---------------------------------------------------------------------------- + Scatter File Definitions definition + *----------------------------------------------------------------------------*/ +#define __RO_BASE __ROM_BASE +#define __RO_SIZE __ROM_SIZE + +#define __RW_BASE __RAM_BASE +#define __RW_SIZE (__RAM_SIZE - __STACK_SIZE - __HEAP_SIZE) + + +LR_ROM __RO_BASE __RO_SIZE { ; load region size_region + ER_ROM __RO_BASE __RO_SIZE { ; load address = execution address + *.o (RESET, +First) + *(InRoot$$Sections) + .ANY (+RO) + .ANY (+XO) + } + + RW_NOINIT __RW_BASE UNINIT __RW_SIZE { + *(.bss.noinit) + } + + RW_RAM AlignExpr(+0, 8) (__RW_SIZE - AlignExpr(ImageLength(RW_NOINIT), 8)) { + *(+RW +ZI) + } + +#if __HEAP_SIZE > 0 + ARM_LIB_HEAP __HEAP_BASE EMPTY __HEAP_SIZE { ; Reserve empty region for heap + } +#endif + + ARM_LIB_STACK __STACK_TOP EMPTY -__STACK_SIZE { ; Reserve empty region for stack + } +} diff --git a/dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct.base@1.0.0 b/dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct.base@1.0.0 new file mode 100644 index 000000000..eb67b5fe6 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/ARMCM4_ac6.sct.base@1.0.0 @@ -0,0 +1,80 @@ +#! armclang -E --target=arm-arm-none-eabi -mcpu=cortex-m4 -xc +; command above MUST be in first line (no comment above!) 
+ +/* +;-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- +*/ + +/*--------------------- Flash Configuration ---------------------------------- +; Flash Configuration +; Flash Base Address <0x0-0xFFFFFFFF:8> +; Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __ROM_BASE 0x00000000 +#define __ROM_SIZE 0x00080000 + +/*--------------------- Embedded RAM Configuration --------------------------- +; RAM Configuration +; RAM Base Address <0x0-0xFFFFFFFF:8> +; RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __RAM_BASE 0x20000000 +#define __RAM_SIZE 0x00040000 + +/*--------------------- Stack / Heap Configuration --------------------------- +; Stack / Heap Configuration +; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +; + *----------------------------------------------------------------------------*/ +#define __STACK_SIZE 0x00000200 +#define __HEAP_SIZE 0x00000C00 + +/* +;------------- <<< end of configuration section >>> --------------------------- +*/ + + +/*---------------------------------------------------------------------------- + User Stack & Heap boundary definition + *----------------------------------------------------------------------------*/ +#define __STACK_TOP (__RAM_BASE + __RAM_SIZE) /* starts at end of RAM */ +#define __HEAP_BASE (AlignExpr(+0, 8)) /* starts after RW_RAM section, 8 byte aligned */ + + +/*---------------------------------------------------------------------------- + Scatter File Definitions definition + *----------------------------------------------------------------------------*/ +#define __RO_BASE __ROM_BASE +#define __RO_SIZE __ROM_SIZE + +#define __RW_BASE __RAM_BASE +#define __RW_SIZE (__RAM_SIZE - __STACK_SIZE - __HEAP_SIZE) + + +LR_ROM __RO_BASE __RO_SIZE { ; load region size_region + ER_ROM __RO_BASE __RO_SIZE { ; load address = execution address + *.o (RESET, +First) + *(InRoot$$Sections) + .ANY (+RO) + .ANY (+XO) + } + + RW_NOINIT __RW_BASE UNINIT __RW_SIZE { + *(.bss.noinit) + } + + RW_RAM AlignExpr(+0, 8) (__RW_SIZE - AlignExpr(ImageLength(RW_NOINIT), 8)) { + *(+RW +ZI) + } + +#if __HEAP_SIZE > 0 + ARM_LIB_HEAP __HEAP_BASE EMPTY __HEAP_SIZE { ; Reserve empty region for heap + } +#endif + + ARM_LIB_STACK __STACK_TOP EMPTY -__STACK_SIZE { ; Reserve empty region for stack + } +} diff --git a/dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld b/dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld new file mode 100644 index 000000000..93ed813c8 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld @@ -0,0 +1,263 @@ +/* + *-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- + */ + +/*---------------------- Flash Configuration ---------------------------------- + Flash Configuration + Flash Base Address <0x0-0xFFFFFFFF:8> + Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__ROM_BASE = 0x00000000; +__ROM_SIZE = 0x00040000; + +/*--------------------- Embedded RAM Configuration ---------------------------- + RAM Configuration + RAM Base Address <0x0-0xFFFFFFFF:8> + RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__RAM_BASE = 0x20000000; +__RAM_SIZE = 0x00020000; + +/*--------------------- Stack / Heap Configuration ---------------------------- + Stack / Heap Configuration + 
Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> + Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__STACK_SIZE = 0x00000400; +__HEAP_SIZE = 0x00000C00; + +/* + *-------------------- <<< end of configuration section >>> ------------------- + */ + +MEMORY +{ + FLASH (rx) : ORIGIN = __ROM_BASE, LENGTH = __ROM_SIZE + RAM (rwx) : ORIGIN = __RAM_BASE, LENGTH = __RAM_SIZE +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > FLASH + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > FLASH + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. + */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM AT > FLASH + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. 
+ */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM2 AT > FLASH +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM AT > RAM + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM2 AT > RAM2 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM + + .stack (ORIGIN(RAM) + LENGTH(RAM) - __STACK_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM + PROVIDE(__stack = __StackTop); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld.base@2.2.0 b/dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld.base@2.2.0 new file mode 100644 index 000000000..93ed813c8 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/ARMCM4_gcc.ld.base@2.2.0 @@ -0,0 +1,263 @@ +/* + *-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- + */ + +/*---------------------- Flash Configuration ---------------------------------- + Flash Configuration + Flash Base Address <0x0-0xFFFFFFFF:8> + Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__ROM_BASE = 0x00000000; +__ROM_SIZE = 0x00040000; + +/*--------------------- Embedded RAM Configuration ---------------------------- + RAM Configuration + RAM Base Address <0x0-0xFFFFFFFF:8> + RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__RAM_BASE = 0x20000000; +__RAM_SIZE = 0x00020000; + +/*--------------------- Stack / Heap Configuration ---------------------------- + Stack / Heap Configuration + Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> + Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__STACK_SIZE = 0x00000400; +__HEAP_SIZE = 0x00000C00; + +/* + *-------------------- <<< end of configuration section >>> ------------------- + */ + +MEMORY +{ + FLASH (rx) : ORIGIN = __ROM_BASE, LENGTH = __ROM_SIZE + RAM (rwx) : ORIGIN = __RAM_BASE, LENGTH = __RAM_SIZE +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. 
+ * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > FLASH + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > FLASH + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. + */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM AT > FLASH + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. + */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM2 AT > FLASH +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM AT > RAM + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . 
= ALIGN(4); + __bss2_end__ = .; + } > RAM2 AT > RAM2 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM + + .stack (ORIGIN(RAM) + LENGTH(RAM) - __STACK_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM + PROVIDE(__stack = __StackTop); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/RTE/Device/ARMCM4/clang_linker_script.ld b/dsppp/RTE/Device/ARMCM4/clang_linker_script.ld new file mode 100644 index 000000000..40f955c16 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/clang_linker_script.ld @@ -0,0 +1,353 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright © 2019 Keith Packard + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx!w) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx!w) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx!w) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx!w) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (w!rx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (w!rx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (w!rx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (w!rx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +ENTRY(Reset_Handler) + +PHDRS +{ + text PT_LOAD; + ram PT_LOAD; + ram_init PT_LOAD; + tls PT_TLS; +} + +SECTIONS +{ + .init : { + KEEP (*(.vectors)) + KEEP (*(.text.init.enter)) + KEEP (*(.data.init.enter)) + KEEP (*(SORT_BY_NAME(.init) SORT_BY_NAME(.init.*))) + } >ROM0 AT>ROM0 :text + + .text : { + + /* code */ + *(.text.unlikely .text.unlikely.*) + *(.text.startup .text.startup.*) + *(.text .text.* .opd .opd.*) + *(.gnu.linkonce.t.*) + KEEP (*(.fini .fini.*)) + __text_end = .; + + PROVIDE (__etext = __text_end); + PROVIDE (_etext = __text_end); + PROVIDE (etext = __text_end); + + /* read-only data */ + *(.rdata) + *(.rodata .rodata.*) + *(.gnu.linkonce.r.*) + + *(.srodata.cst16) + *(.srodata.cst8) + *(.srodata.cst4) + *(.srodata.cst2) + *(.srodata .srodata.*) + *(.data.rel.ro .data.rel.ro.*) + *(.got .got.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + /* lists of constructors and destructors */ + PROVIDE_HIDDEN ( __preinit_array_start = . ); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN ( __preinit_array_end = . ); + + PROVIDE_HIDDEN ( __init_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array .ctors)) + PROVIDE_HIDDEN ( __init_array_end = . ); + + PROVIDE_HIDDEN ( __fini_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array .dtors)) + PROVIDE_HIDDEN ( __fini_array_end = . ); + + } >ROM0 AT>ROM0 :text + + .toc : { + *(.toc .toc.*) + } >ROM0 AT>ROM0 :text + + /* additional sections when compiling with C++ exception support */ + + .except_ordered : { + *(.gcc_except_table *.gcc_except_table.*) + KEEP (*(.eh_frame .eh_frame.*)) + *(.ARM.extab* .gnu.linkonce.armextab.*) + } >ROM0 AT>ROM0 :text + + .except_unordered : { + . = ALIGN(8); + + PROVIDE(__exidx_start = .); + *(.ARM.exidx*) + PROVIDE(__exidx_end = .); + } >ROM0 AT>ROM0 :text + + + /* + * Data values which are preserved across reset + */ + .preserve (NOLOAD) : { + PROVIDE(__preserve_start__ = .); + KEEP(*(SORT_BY_NAME(.preserve.*))) + KEEP(*(.preserve)) + PROVIDE(__preserve_end__ = .); + } >RAM0 AT>RAM0 :ram + + .data : { + *(.data .data.*) + *(.gnu.linkonce.d.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + PROVIDE( __global_pointer$ = . 
+ 0x800 ); + *(.sdata .sdata.* .sdata2.*) + *(.gnu.linkonce.s.*) + } >RAM0 AT>ROM0 :ram_init + PROVIDE(__data_start = ADDR(.data)); + PROVIDE(__data_source = LOADADDR(.data)); + + /* Thread local initialized data. This gets + * space allocated as it is expected to be placed + * in ram to be used as a template for TLS data blocks + * allocated at runtime. We're slightly abusing that + * by placing the data in flash where it will be copied + * into the allocate ram addresses by the existing + * data initialization code in crt0 + */ + .tdata : { + *(.tdata .tdata.* .gnu.linkonce.td.*) + PROVIDE(__data_end = .); + PROVIDE(__tdata_end = .); + } >RAM0 AT>ROM0 :tls :ram_init + PROVIDE( __tls_base = ADDR(.tdata)); + PROVIDE( __tdata_start = ADDR(.tdata)); + PROVIDE( __tdata_source = LOADADDR(.tdata) ); + PROVIDE( __tdata_source_end = LOADADDR(.tdata) + SIZEOF(.tdata) ); + PROVIDE( __data_source_end = __tdata_source_end ); + PROVIDE( __tdata_size = SIZEOF(.tdata) ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata),ALIGNOF(.tbss)) ); + + PROVIDE( __edata = __data_end ); + PROVIDE( _edata = __data_end ); + PROVIDE( edata = __data_end ); + PROVIDE( __data_size = __data_end - __data_start ); + PROVIDE( __data_source_size = __data_source_end - __data_source ); + + .tbss (NOLOAD) : { + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + PROVIDE( __tls_end = . ); + PROVIDE( __tbss_end = . ); + } >RAM0 AT>RAM0 :tls :ram + PROVIDE( __bss_start = ADDR(.tbss)); + PROVIDE( __tbss_start = ADDR(.tbss)); + PROVIDE( __tbss_offset = ADDR(.tbss) - ADDR(.tdata) ); + PROVIDE( __tbss_size = SIZEOF(.tbss) ); + PROVIDE( __tls_size = __tls_end - __tls_base ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + PROVIDE( __arm64_tls_tcb_offset = MAX(16, __tls_align) ); + + /* + * The linker special cases .tbss segments which are + * identified as segments which are not loaded and are + * thread_local. + * + * For these segments, the linker does not advance 'dot' + * across them. We actually need memory allocated for tbss, + * so we create a special segment here just to make room + */ + /* + .tbss_space (NOLOAD) : { + . = ADDR(.tbss); + . = . + SIZEOF(.tbss); + } >RAM0 AT>RAM0 :ram + */ + + .bss (NOLOAD) : { + *(.sbss*) + *(.gnu.linkonce.sb.*) + *(.bss .bss.*) + *(.gnu.linkonce.b.*) + *(COMMON) + + /* Align the heap */ + . = ALIGN(8); + __bss_end = .; + } >RAM0 AT>RAM0 :ram + PROVIDE( __non_tls_bss_start = ADDR(.bss) ); + PROVIDE( __end = __bss_end ); + PROVIDE( _end = __bss_end ); + PROVIDE( end = __bss_end ); + PROVIDE( __bss_size = __bss_end - __bss_start ); + + /* Make the rest of memory available for heap storage */ + PROVIDE (__heap_start = __end); +#ifdef __HEAP_SIZE + PROVIDE (__heap_end = __heap_start + __HEAP_SIZE); + PROVIDE (__heap_size = __HEAP_SIZE); +#else + PROVIDE (__heap_end = __stack - __STACK_SIZE); + PROVIDE (__heap_size = __heap_end - __heap_start); +#endif + .heap (NOLOAD) : { + . += __heap_size; + } >RAM0 :ram + + /* Define a stack region to make sure it fits in memory */ + PROVIDE(__stack = ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE); + PROVIDE(__stack_limit = __stack - __STACK_SIZE); + .stack (__stack_limit) (NOLOAD) : { + . += __STACK_SIZE; + } >RAM0 :ram + +#if __STACKSEAL_SIZE > 0 + PROVIDE(__stack_seal = __stack) + .stackseal (__stack) (NOLOAD) : + { + . 
+= __STACKSEAL_SIZE; + } >RAM0 :ram +#endif + + /* Throw away C++ exception handling information */ + + /* + + /DISCARD/ : { + *(.note .note.*) + *(.eh_frame .eh_frame.*) + *(.ARM.extab* .gnu.linkonce.armextab.*) + *(.ARM.exidx*) + } + + */ + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1. */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions. */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2. */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2. */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions. */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3. */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF 5. */ + .debug_addr 0 : { *(.debug_addr) } + .debug_line_str 0 : { *(.debug_line_str) } + .debug_loclists 0 : { *(.debug_loclists) } + .debug_macro 0 : { *(.debug_macro) } + .debug_names 0 : { *(.debug_names) } + .debug_rnglists 0 : { *(.debug_rnglists) } + .debug_str_offsets 0 : { *(.debug_str_offsets) } + .debug_sup 0 : { *(.debug_sup) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } +} +/* + * Check that sections that are copied from flash to RAM have matching + * padding, so that a single memcpy() of __data_size copies the correct bytes. + */ +ASSERT( __data_size == __data_source_size, + "ERROR: .data/.tdata flash size does not match RAM size"); diff --git a/dsppp/RTE/Device/ARMCM4/regions_ARMCM4.h b/dsppp/RTE/Device/ARMCM4/regions_ARMCM4.h new file mode 100644 index 000000000..3ee4d4228 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/regions_ARMCM4.h @@ -0,0 +1,60 @@ +#ifndef REGIONS_ARMCM4_H +#define REGIONS_ARMCM4_H + + +//-------- <<< Use Configuration Wizard in Context Menu >>> -------------------- + +// Device pack: ARM::Cortex_DFP@1.0.0 +// Device pack used to generate this file + +// ROM Configuration +// ======================= +// ROM=<__ROM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x00000000 +#define __ROM0_BASE 0x00000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00040000 +#define __ROM0_SIZE 0x00040000 +// Default region +// Enables memory region globally for the application. +#define __ROM0_DEFAULT 1 +// Startup +// Selects region to be used for startup code. +#define __ROM0_STARTUP 1 +// + +// + +// RAM Configuration +// ======================= +// RAM=<__RAM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. 
+// Default: 0x20000000 +#define __RAM0_BASE 0x20000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM0_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM0_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM0_NOINIT 0 +// + +// + +// Stack / Heap Configuration +// Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +// Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +#define __STACK_SIZE 0x00000200 +#define __HEAP_SIZE 0x00000C00 +// + + +#endif /* REGIONS_ARMCM4_H */ diff --git a/dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c b/dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c new file mode 100644 index 000000000..9d5777366 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c @@ -0,0 +1,150 @@ +/****************************************************************************** + * @file startup_ARMCM4.c + * @brief CMSIS-Core(M) Device Startup File for a Cortex-M4 Device + * @version V3.0.0 + * @date 06. April 2023 + ******************************************************************************/ +/* + * Copyright (c) 2009-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined (ARMCM4) + #include "ARMCM4.h" +#else + #error device not specified! 
+#endif + +/*---------------------------------------------------------------------------- + External References + *----------------------------------------------------------------------------*/ +extern uint32_t __INITIAL_SP; + +extern __NO_RETURN void __PROGRAM_START(void); + +/*---------------------------------------------------------------------------- + Internal References + *----------------------------------------------------------------------------*/ +__NO_RETURN void Reset_Handler (void); + void Default_Handler(void); + +/*---------------------------------------------------------------------------- + Exception / Interrupt Handler + *----------------------------------------------------------------------------*/ +/* Exceptions */ +void NMI_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void HardFault_Handler (void) __attribute__ ((weak)); +void MemManage_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void BusFault_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void UsageFault_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void SVC_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void DebugMon_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void PendSV_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void SysTick_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); + +void Interrupt0_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt1_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt2_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt3_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt4_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt5_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt6_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt7_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt8_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt9_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); + + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +extern const VECTOR_TABLE_Type __VECTOR_TABLE[240]; + const VECTOR_TABLE_Type __VECTOR_TABLE[240] __VECTOR_TABLE_ATTRIBUTE = { + (VECTOR_TABLE_Type)(&__INITIAL_SP), /* Initial Stack Pointer */ + Reset_Handler, /* Reset Handler */ + NMI_Handler, /* -14 NMI Handler */ + HardFault_Handler, /* -13 Hard Fault Handler */ + MemManage_Handler, /* -12 MPU Fault Handler */ + BusFault_Handler, /* -11 Bus Fault Handler */ + UsageFault_Handler, /* -10 Usage Fault Handler */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + SVC_Handler, /* -5 SVC Handler */ + DebugMon_Handler, /* -4 Debug Monitor Handler */ + 0, /* Reserved */ + PendSV_Handler, /* -2 PendSV Handler */ + SysTick_Handler, /* -1 SysTick Handler */ + + /* Interrupts */ + Interrupt0_Handler, /* 0 Interrupt 0 */ + Interrupt1_Handler, /* 1 Interrupt 1 */ + Interrupt2_Handler, /* 2 Interrupt 2 */ + Interrupt3_Handler, /* 3 Interrupt 3 */ + Interrupt4_Handler, /* 4 Interrupt 4 */ + 
Interrupt5_Handler, /* 5 Interrupt 5 */ + Interrupt6_Handler, /* 6 Interrupt 6 */ + Interrupt7_Handler, /* 7 Interrupt 7 */ + Interrupt8_Handler, /* 8 Interrupt 8 */ + Interrupt9_Handler /* 9 Interrupt 9 */ + /* Interrupts 10 .. 223 are left out */ +}; + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic pop +#endif + +/*---------------------------------------------------------------------------- + Reset Handler called on controller reset + *----------------------------------------------------------------------------*/ +__NO_RETURN void Reset_Handler(void) +{ + SystemInit(); /* CMSIS System Initialization */ + __PROGRAM_START(); /* Enter PreMain (C library entry point) */ +} + + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wmissing-noreturn" +#endif + +/*---------------------------------------------------------------------------- + Hard Fault Handler + *----------------------------------------------------------------------------*/ +void HardFault_Handler(void) +{ + while(1); +} + +/*---------------------------------------------------------------------------- + Default Handler for Exceptions / Interrupts + *----------------------------------------------------------------------------*/ +void Default_Handler(void) +{ + while(1); +} + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang diagnostic pop +#endif + diff --git a/dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c.base@3.0.0 b/dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c.base@3.0.0 new file mode 100644 index 000000000..9d5777366 --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/startup_ARMCM4.c.base@3.0.0 @@ -0,0 +1,150 @@ +/****************************************************************************** + * @file startup_ARMCM4.c + * @brief CMSIS-Core(M) Device Startup File for a Cortex-M4 Device + * @version V3.0.0 + * @date 06. April 2023 + ******************************************************************************/ +/* + * Copyright (c) 2009-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined (ARMCM4) + #include "ARMCM4.h" +#else + #error device not specified! 
+#endif + +/*---------------------------------------------------------------------------- + External References + *----------------------------------------------------------------------------*/ +extern uint32_t __INITIAL_SP; + +extern __NO_RETURN void __PROGRAM_START(void); + +/*---------------------------------------------------------------------------- + Internal References + *----------------------------------------------------------------------------*/ +__NO_RETURN void Reset_Handler (void); + void Default_Handler(void); + +/*---------------------------------------------------------------------------- + Exception / Interrupt Handler + *----------------------------------------------------------------------------*/ +/* Exceptions */ +void NMI_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void HardFault_Handler (void) __attribute__ ((weak)); +void MemManage_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void BusFault_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void UsageFault_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void SVC_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void DebugMon_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void PendSV_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void SysTick_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); + +void Interrupt0_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt1_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt2_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt3_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt4_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt5_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt6_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt7_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt8_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); +void Interrupt9_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); + + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +extern const VECTOR_TABLE_Type __VECTOR_TABLE[240]; + const VECTOR_TABLE_Type __VECTOR_TABLE[240] __VECTOR_TABLE_ATTRIBUTE = { + (VECTOR_TABLE_Type)(&__INITIAL_SP), /* Initial Stack Pointer */ + Reset_Handler, /* Reset Handler */ + NMI_Handler, /* -14 NMI Handler */ + HardFault_Handler, /* -13 Hard Fault Handler */ + MemManage_Handler, /* -12 MPU Fault Handler */ + BusFault_Handler, /* -11 Bus Fault Handler */ + UsageFault_Handler, /* -10 Usage Fault Handler */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + SVC_Handler, /* -5 SVC Handler */ + DebugMon_Handler, /* -4 Debug Monitor Handler */ + 0, /* Reserved */ + PendSV_Handler, /* -2 PendSV Handler */ + SysTick_Handler, /* -1 SysTick Handler */ + + /* Interrupts */ + Interrupt0_Handler, /* 0 Interrupt 0 */ + Interrupt1_Handler, /* 1 Interrupt 1 */ + Interrupt2_Handler, /* 2 Interrupt 2 */ + Interrupt3_Handler, /* 3 Interrupt 3 */ + Interrupt4_Handler, /* 4 Interrupt 4 */ + 
Interrupt5_Handler, /* 5 Interrupt 5 */ + Interrupt6_Handler, /* 6 Interrupt 6 */ + Interrupt7_Handler, /* 7 Interrupt 7 */ + Interrupt8_Handler, /* 8 Interrupt 8 */ + Interrupt9_Handler /* 9 Interrupt 9 */ + /* Interrupts 10 .. 223 are left out */ +}; + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic pop +#endif + +/*---------------------------------------------------------------------------- + Reset Handler called on controller reset + *----------------------------------------------------------------------------*/ +__NO_RETURN void Reset_Handler(void) +{ + SystemInit(); /* CMSIS System Initialization */ + __PROGRAM_START(); /* Enter PreMain (C library entry point) */ +} + + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wmissing-noreturn" +#endif + +/*---------------------------------------------------------------------------- + Hard Fault Handler + *----------------------------------------------------------------------------*/ +void HardFault_Handler(void) +{ + while(1); +} + +/*---------------------------------------------------------------------------- + Default Handler for Exceptions / Interrupts + *----------------------------------------------------------------------------*/ +void Default_Handler(void) +{ + while(1); +} + +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #pragma clang diagnostic pop +#endif + diff --git a/dsppp/RTE/Device/ARMCM4/system_ARMCM4.c b/dsppp/RTE/Device/ARMCM4/system_ARMCM4.c new file mode 100644 index 000000000..803d4fc3e --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/system_ARMCM4.c @@ -0,0 +1,79 @@ +/**************************************************************************//** + * @file system_ARMCM4.c + * @brief CMSIS Device System Source File for + * ARMCM4 Device + * @version V2.0.0 + * @date 06. April 2023 + ******************************************************************************/ +/* + * Copyright (c) 2009-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined (ARMCM4) + #include "ARMCM4.h" +#else + #error device not specified! 
+#endif + +/*---------------------------------------------------------------------------- + Define clocks + *----------------------------------------------------------------------------*/ +#define XTAL (50000000UL) /* Oscillator frequency */ + +#define SYSTEM_CLOCK (XTAL / 2U) + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ +extern const VECTOR_TABLE_Type __VECTOR_TABLE[240]; + + +/*---------------------------------------------------------------------------- + System Core Clock Variable + *----------------------------------------------------------------------------*/ +uint32_t SystemCoreClock = SYSTEM_CLOCK; /* System Core Clock Frequency */ + + +/*---------------------------------------------------------------------------- + System Core Clock update function + *----------------------------------------------------------------------------*/ +void SystemCoreClockUpdate (void) +{ + SystemCoreClock = SYSTEM_CLOCK; +} + +/*---------------------------------------------------------------------------- + System initialization function + *----------------------------------------------------------------------------*/ +void SystemInit (void) +{ + +#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1U) + SCB->VTOR = (uint32_t) &(__VECTOR_TABLE[0]); +#endif + +#if defined (__FPU_USED) && (__FPU_USED == 1U) + SCB->CPACR |= ((3U << 10U*2U) | /* enable CP10 Full Access */ + (3U << 11U*2U) ); /* enable CP11 Full Access */ +#endif + +#ifdef UNALIGNED_SUPPORT_DISABLE + SCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk; +#endif + + SystemCoreClock = SYSTEM_CLOCK; +} diff --git a/dsppp/RTE/Device/ARMCM4/system_ARMCM4.c.base@2.0.0 b/dsppp/RTE/Device/ARMCM4/system_ARMCM4.c.base@2.0.0 new file mode 100644 index 000000000..803d4fc3e --- /dev/null +++ b/dsppp/RTE/Device/ARMCM4/system_ARMCM4.c.base@2.0.0 @@ -0,0 +1,79 @@ +/**************************************************************************//** + * @file system_ARMCM4.c + * @brief CMSIS Device System Source File for + * ARMCM4 Device + * @version V2.0.0 + * @date 06. April 2023 + ******************************************************************************/ +/* + * Copyright (c) 2009-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined (ARMCM4) + #include "ARMCM4.h" +#else + #error device not specified! 
+#endif + +/*---------------------------------------------------------------------------- + Define clocks + *----------------------------------------------------------------------------*/ +#define XTAL (50000000UL) /* Oscillator frequency */ + +#define SYSTEM_CLOCK (XTAL / 2U) + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ +extern const VECTOR_TABLE_Type __VECTOR_TABLE[240]; + + +/*---------------------------------------------------------------------------- + System Core Clock Variable + *----------------------------------------------------------------------------*/ +uint32_t SystemCoreClock = SYSTEM_CLOCK; /* System Core Clock Frequency */ + + +/*---------------------------------------------------------------------------- + System Core Clock update function + *----------------------------------------------------------------------------*/ +void SystemCoreClockUpdate (void) +{ + SystemCoreClock = SYSTEM_CLOCK; +} + +/*---------------------------------------------------------------------------- + System initialization function + *----------------------------------------------------------------------------*/ +void SystemInit (void) +{ + +#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1U) + SCB->VTOR = (uint32_t) &(__VECTOR_TABLE[0]); +#endif + +#if defined (__FPU_USED) && (__FPU_USED == 1U) + SCB->CPACR |= ((3U << 10U*2U) | /* enable CP10 Full Access */ + (3U << 11U*2U) ); /* enable CP11 Full Access */ +#endif + +#ifdef UNALIGNED_SUPPORT_DISABLE + SCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk; +#endif + + SystemCoreClock = SYSTEM_CLOCK; +} diff --git a/dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h b/dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h new file mode 100644 index 000000000..31255472f --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019-2022 Arm Limited. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __RTE_DEVICE_H +#define __RTE_DEVICE_H + +// USART (Universal synchronous - asynchronous receiver transmitter) [Driver_USART0] +// Configuration settings for Driver_USART0 in component ::Drivers:USART +#define RTE_USART0 1 + +// USART (Universal synchronous - asynchronous receiver transmitter) [Driver_USART1] +// Configuration settings for Driver_USART1 in component ::Drivers:USART +#define RTE_USART1 1 + +// MPC (Memory Protection Controller) [Driver_ISRAM0_MPC] +// Configuration settings for Driver_ISRAM0_MPC in component ::Drivers:MPC +#define RTE_ISRAM0_MPC 0 + +// MPC (Memory Protection Controller) [Driver_ISRAM1_MPC] +// Configuration settings for Driver_ISRAM1_MPC in component ::Drivers:MPC +#define RTE_ISRAM1_MPC 0 + +// MPC (Memory Protection Controller) [Driver_SRAM_MPC] +// Configuration settings for Driver_SRAM_MPC in component ::Drivers:MPC +#define RTE_SRAM_MPC 0 + +// MPC (Memory Protection Controller) [Driver_QSPI_MPC] +// Configuration settings for Driver_QSPI_MPC in component ::Drivers:MPC +#define RTE_QSPI_MPC 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_MAIN0] +// Configuration settings for Driver_PPC_SSE300_MAIN0 in component ::Drivers:PPC +#define RTE_PPC_SSE300_MAIN0 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_MAIN_EXP0] +// Configuration settings for Driver_PPC_SSE300_MAIN_EXP0 in component ::Drivers:PPC +#define RTE_PPC_SSE300_MAIN_EXP0 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_MAIN_EXP1] +// Configuration settings for Driver_PPC_SSE300_MAIN_EXP1 in component ::Drivers:PPC +#define RTE_PPC_SSE300_MAIN_EXP1 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH0] +// Configuration settings for Driver_PPC_SSE300_PERIPH0 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH0 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH1] +// Configuration settings for Driver_PPC_SSE300_PERIPH1 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH1 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH_EXP0] +// Configuration settings for Driver_PPC_SSE300_PERIPH_EXP0 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH_EXP0 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH_EXP1] +// Configuration settings for Driver_PPC_SSE300_PERIPH_EXP1 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH_EXP1 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH_EXP2] +// Configuration settings for Driver_PPC_SSE300_PERIPH_EXP2 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH_EXP2 0 + +// Flash device emulated by SRAM [Driver_Flash0] +// Configuration settings for Driver_Flash0 in component ::Drivers:Flash +#define RTE_FLASH0 1 + +// I2C SBCon [Driver_I2C0] +// Configuration settings for Driver_I2C0 in component ::Drivers:I2C +#define RTE_I2C0 1 + +#endif /* __RTE_DEVICE_H */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h.base@1.1.0 b/dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h.base@1.1.0 new file mode 100644 index 000000000..31255472f --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/RTE_Device.h.base@1.1.0 @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019-2022 Arm Limited. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RTE_DEVICE_H +#define __RTE_DEVICE_H + +// USART (Universal synchronous - asynchronous receiver transmitter) [Driver_USART0] +// Configuration settings for Driver_USART0 in component ::Drivers:USART +#define RTE_USART0 1 + +// USART (Universal synchronous - asynchronous receiver transmitter) [Driver_USART1] +// Configuration settings for Driver_USART1 in component ::Drivers:USART +#define RTE_USART1 1 + +// MPC (Memory Protection Controller) [Driver_ISRAM0_MPC] +// Configuration settings for Driver_ISRAM0_MPC in component ::Drivers:MPC +#define RTE_ISRAM0_MPC 0 + +// MPC (Memory Protection Controller) [Driver_ISRAM1_MPC] +// Configuration settings for Driver_ISRAM1_MPC in component ::Drivers:MPC +#define RTE_ISRAM1_MPC 0 + +// MPC (Memory Protection Controller) [Driver_SRAM_MPC] +// Configuration settings for Driver_SRAM_MPC in component ::Drivers:MPC +#define RTE_SRAM_MPC 0 + +// MPC (Memory Protection Controller) [Driver_QSPI_MPC] +// Configuration settings for Driver_QSPI_MPC in component ::Drivers:MPC +#define RTE_QSPI_MPC 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_MAIN0] +// Configuration settings for Driver_PPC_SSE300_MAIN0 in component ::Drivers:PPC +#define RTE_PPC_SSE300_MAIN0 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_MAIN_EXP0] +// Configuration settings for Driver_PPC_SSE300_MAIN_EXP0 in component ::Drivers:PPC +#define RTE_PPC_SSE300_MAIN_EXP0 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_MAIN_EXP1] +// Configuration settings for Driver_PPC_SSE300_MAIN_EXP1 in component ::Drivers:PPC +#define RTE_PPC_SSE300_MAIN_EXP1 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH0] +// Configuration settings for Driver_PPC_SSE300_PERIPH0 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH0 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH1] +// Configuration settings for Driver_PPC_SSE300_PERIPH1 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH1 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH_EXP0] +// Configuration settings for Driver_PPC_SSE300_PERIPH_EXP0 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH_EXP0 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH_EXP1] +// Configuration settings for Driver_PPC_SSE300_PERIPH_EXP1 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH_EXP1 0 + +// PPC (Peripheral Protection Controller) [PPC_SSE300_PERIPH_EXP2] +// Configuration settings for Driver_PPC_SSE300_PERIPH_EXP2 in component ::Drivers:PPC +#define RTE_PPC_SSE300_PERIPH_EXP2 0 + +// Flash device emulated by SRAM [Driver_Flash0] +// Configuration settings for Driver_Flash0 in component ::Drivers:Flash +#define RTE_FLASH0 1 + +// I2C SBCon [Driver_I2C0] +// Configuration settings for Driver_I2C0 in component ::Drivers:I2C +#define RTE_I2C0 1 + +#endif /* __RTE_DEVICE_H */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/cmsis_driver_config.h b/dsppp/RTE/Device/SSE-300-MPS3/cmsis_driver_config.h new file mode 100644 index 000000000..bfc348f47 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/cmsis_driver_config.h @@ -0,0 +1,25 
@@ +/* + * Copyright (c) 2019-2022 Arm Limited. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CMSIS_DRIVER_CONFIG_H__ +#define __CMSIS_DRIVER_CONFIG_H__ + +#include "system_SSE300MPS3.h" +#include "device_cfg.h" +#include "device_definition.h" +#include "platform_base_address.h" + +#endif /* __CMSIS_DRIVER_CONFIG_H__ */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/cmsis_driver_config.h.base@1.1.1 b/dsppp/RTE/Device/SSE-300-MPS3/cmsis_driver_config.h.base@1.1.1 new file mode 100644 index 000000000..bfc348f47 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/cmsis_driver_config.h.base@1.1.1 @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019-2022 Arm Limited. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CMSIS_DRIVER_CONFIG_H__ +#define __CMSIS_DRIVER_CONFIG_H__ + +#include "system_SSE300MPS3.h" +#include "device_cfg.h" +#include "device_definition.h" +#include "platform_base_address.h" + +#endif /* __CMSIS_DRIVER_CONFIG_H__ */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h b/dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h new file mode 100644 index 000000000..2ff3eaa77 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. All rights reserved. + * + * Licensed under the Apache License Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_CFG_H__ +#define __DEVICE_CFG_H__ + +/** + * \file device_cfg.h + * \brief Configuration file native driver re-targeting + * + * \details This file can be used to add native driver specific macro + * definitions to select which peripherals are available in the build. + * + * This is a default device configuration file with all peripherals enabled. 
+ */ + +/* Secure only peripheral configuration */ + +/* ARM MPS3 IO SCC */ +#define MPS3_IO_S +#define MPS3_IO_DEV MPS3_IO_DEV_S + +/* I2C_SBCon */ +#define I2C0_SBCON_S +#define I2C0_SBCON_DEV I2C0_SBCON_DEV_S + +/* I2S */ +#define MPS3_I2S_S +#define MPS3_I2S_DEV MPS3_I2S_DEV_S + +/* ARM UART Controller PL011 */ +#define UART0_CMSDK_S +#define UART0_CMSDK_DEV UART0_CMSDK_DEV_S +#define UART1_CMSDK_S +#define UART1_CMSDK_DEV UART1_CMSDK_DEV_S + +#define DEFAULT_UART_BAUDRATE 115200U + +/* To be used as CODE and DATA sram */ +#define MPC_ISRAM0_S +#define MPC_ISRAM0_DEV MPC_ISRAM0_DEV_S + +#define MPC_ISRAM1_S +#define MPC_ISRAM1_DEV MPC_ISRAM0_DEV_S + +#define MPC_SRAM_S +#define MPC_SRAM_DEV MPC_SRAM_DEV_S + +#define MPC_QSPI_S +#define MPC_QSPI_DEV MPC_QSPI_DEV_S + +/** System Counter Armv8-M */ +#define SYSCOUNTER_CNTRL_ARMV8_M_S +#define SYSCOUNTER_CNTRL_ARMV8_M_DEV SYSCOUNTER_CNTRL_ARMV8_M_DEV_S + +#define SYSCOUNTER_READ_ARMV8_M_S +#define SYSCOUNTER_READ_ARMV8_M_DEV SYSCOUNTER_READ_ARMV8_M_DEV_S +/** + * Arbitrary scaling values for test purposes + */ +#define SYSCOUNTER_ARMV8_M_DEFAULT_SCALE0_INT 1u +#define SYSCOUNTER_ARMV8_M_DEFAULT_SCALE0_FRACT 0u +#define SYSCOUNTER_ARMV8_M_DEFAULT_SCALE1_INT 1u +#define SYSCOUNTER_ARMV8_M_DEFAULT_SCALE1_FRACT 0u + +/* System timer */ +#define SYSTIMER0_ARMV8_M_S +#define SYSTIMER0_ARMV8_M_DEV SYSTIMER0_ARMV8_M_DEV_S +#define SYSTIMER1_ARMV8_M_S +#define SYSTIMER1_ARMV8_M_DEV SYSTIMER1_ARMV8_M_DEV_S +#define SYSTIMER2_ARMV8_M_S +#define SYSTIMER2_ARMV8_M_DEV SYSTIMER2_ARMV8_M_DEV_S +#define SYSTIMER3_ARMV8_M_S +#define SYSTIMER3_ARMV8_M_DEV SYSTIMER3_ARMV8_M_DEV_S + +#define SYSTIMER0_ARMV8M_DEFAULT_FREQ_HZ (25000000ul) +#define SYSTIMER1_ARMV8M_DEFAULT_FREQ_HZ (25000000ul) +#define SYSTIMER2_ARMV8M_DEFAULT_FREQ_HZ (25000000ul) +#define SYSTIMER3_ARMV8M_DEFAULT_FREQ_HZ (25000000ul) + +/* CMSDK GPIO driver structures */ +#define GPIO0_CMSDK_S +#define GPIO0_CMSDK_DEV GPIO0_CMSDK_DEV_S +#define GPIO1_CMSDK_S +#define GPIO1_CMSDK_DEV GPIO1_CMSDK_DEV_S +#define GPIO2_CMSDK_S +#define GPIO2_CMSDK_DEV GPIO2_CMSDK_DEV_S +#define GPIO3_CMSDK_S +#define GPIO3_CMSDK_DEV GPIO3_CMSDK_DEV_S + +/* System Watchdogs */ +#define SYSWDOG_ARMV8_M_S +#define SYSWDOG_ARMV8_M_DEV SYSWDOG_ARMV8_M_DEV_S + +/* ARM MPC SIE 300 driver structures */ +#define MPC_VM0_S +#define MPC_VM0_DEV MPC_VM0_DEV_S +#define MPC_VM1_S +#define MPC_VM1_DEV MPC_VM1_DEV_S +#define MPC_SSRAM2_S +#define MPC_SSRAM2_DEV MPC_SSRAM2_DEV_S +#define MPC_SSRAM3_S +#define MPC_SSRAM3_DEV MPC_SSRAM3_DEV_S + +/* ARM PPC driver structures */ +#define PPC_SSE300_MAIN0_S +#define PPC_SSE300_MAIN0_DEV PPC_SSE300_MAIN0_DEV_S +#define PPC_SSE300_MAIN_EXP0_S +#define PPC_SSE300_MAIN_EXP0_DEV PPC_SSE300_MAIN_EXP0_DEV_S +#define PPC_SSE300_MAIN_EXP1_S +#define PPC_SSE300_MAIN_EXP1_DEV PPC_SSE300_MAIN_EXP1_DEV_S +#define PPC_SSE300_MAIN_EXP2_S +#define PPC_SSE300_MAIN_EXP2_DEV PPC_SSE300_MAIN_EXP2_DEV_S +#define PPC_SSE300_MAIN_EXP3_S +#define PPC_SSE300_MAIN_EXP3_DEV PPC_SSE300_MAIN_EXP3_DEV_S +#define PPC_SSE300_PERIPH0_S +#define PPC_SSE300_PERIPH0_DEV PPC_SSE300_PERIPH0_DEV_S +#define PPC_SSE300_PERIPH1_S +#define PPC_SSE300_PERIPH1_DEV PPC_SSE300_PERIPH1_DEV_S +#define PPC_SSE300_PERIPH_EXP0_S +#define PPC_SSE300_PERIPH_EXP0_DEV PPC_SSE300_PERIPH_EXP0_DEV_S +#define PPC_SSE300_PERIPH_EXP1_S +#define PPC_SSE300_PERIPH_EXP1_DEV PPC_SSE300_PERIPH_EXP1_DEV_S +#define PPC_SSE300_PERIPH_EXP2_S +#define PPC_SSE300_PERIPH_EXP2_DEV PPC_SSE300_PERIPH_EXP2_DEV_S +#define PPC_SSE300_PERIPH_EXP3_S +#define 
PPC_SSE300_PERIPH_EXP3_DEV PPC_SSE300_PERIPH_EXP3_DEV_S + +/* ARM SPI PL022 */ +/* Invalid device stubs are not defined */ +#define DEFAULT_SPI_SPEED_HZ 4000000U /* 4MHz */ +#define SPI1_PL022_S +#define SPI1_PL022_DEV SPI1_PL022_DEV_S + + +#endif /* __DEVICE_CFG_H__ */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h.base@1.1.3 b/dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h.base@1.1.3 new file mode 100644 index 000000000..2ff3eaa77 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/device_cfg.h.base@1.1.3 @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. All rights reserved. + * + * Licensed under the Apache License Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_CFG_H__ +#define __DEVICE_CFG_H__ + +/** + * \file device_cfg.h + * \brief Configuration file native driver re-targeting + * + * \details This file can be used to add native driver specific macro + * definitions to select which peripherals are available in the build. + * + * This is a default device configuration file with all peripherals enabled. + */ + +/* Secure only peripheral configuration */ + +/* ARM MPS3 IO SCC */ +#define MPS3_IO_S +#define MPS3_IO_DEV MPS3_IO_DEV_S + +/* I2C_SBCon */ +#define I2C0_SBCON_S +#define I2C0_SBCON_DEV I2C0_SBCON_DEV_S + +/* I2S */ +#define MPS3_I2S_S +#define MPS3_I2S_DEV MPS3_I2S_DEV_S + +/* ARM UART Controller PL011 */ +#define UART0_CMSDK_S +#define UART0_CMSDK_DEV UART0_CMSDK_DEV_S +#define UART1_CMSDK_S +#define UART1_CMSDK_DEV UART1_CMSDK_DEV_S + +#define DEFAULT_UART_BAUDRATE 115200U + +/* To be used as CODE and DATA sram */ +#define MPC_ISRAM0_S +#define MPC_ISRAM0_DEV MPC_ISRAM0_DEV_S + +#define MPC_ISRAM1_S +#define MPC_ISRAM1_DEV MPC_ISRAM0_DEV_S + +#define MPC_SRAM_S +#define MPC_SRAM_DEV MPC_SRAM_DEV_S + +#define MPC_QSPI_S +#define MPC_QSPI_DEV MPC_QSPI_DEV_S + +/** System Counter Armv8-M */ +#define SYSCOUNTER_CNTRL_ARMV8_M_S +#define SYSCOUNTER_CNTRL_ARMV8_M_DEV SYSCOUNTER_CNTRL_ARMV8_M_DEV_S + +#define SYSCOUNTER_READ_ARMV8_M_S +#define SYSCOUNTER_READ_ARMV8_M_DEV SYSCOUNTER_READ_ARMV8_M_DEV_S +/** + * Arbitrary scaling values for test purposes + */ +#define SYSCOUNTER_ARMV8_M_DEFAULT_SCALE0_INT 1u +#define SYSCOUNTER_ARMV8_M_DEFAULT_SCALE0_FRACT 0u +#define SYSCOUNTER_ARMV8_M_DEFAULT_SCALE1_INT 1u +#define SYSCOUNTER_ARMV8_M_DEFAULT_SCALE1_FRACT 0u + +/* System timer */ +#define SYSTIMER0_ARMV8_M_S +#define SYSTIMER0_ARMV8_M_DEV SYSTIMER0_ARMV8_M_DEV_S +#define SYSTIMER1_ARMV8_M_S +#define SYSTIMER1_ARMV8_M_DEV SYSTIMER1_ARMV8_M_DEV_S +#define SYSTIMER2_ARMV8_M_S +#define SYSTIMER2_ARMV8_M_DEV SYSTIMER2_ARMV8_M_DEV_S +#define SYSTIMER3_ARMV8_M_S +#define SYSTIMER3_ARMV8_M_DEV SYSTIMER3_ARMV8_M_DEV_S + +#define SYSTIMER0_ARMV8M_DEFAULT_FREQ_HZ (25000000ul) +#define SYSTIMER1_ARMV8M_DEFAULT_FREQ_HZ (25000000ul) +#define SYSTIMER2_ARMV8M_DEFAULT_FREQ_HZ (25000000ul) +#define SYSTIMER3_ARMV8M_DEFAULT_FREQ_HZ (25000000ul) + +/* CMSDK GPIO driver structures */ +#define GPIO0_CMSDK_S +#define GPIO0_CMSDK_DEV GPIO0_CMSDK_DEV_S +#define GPIO1_CMSDK_S +#define GPIO1_CMSDK_DEV 
GPIO1_CMSDK_DEV_S +#define GPIO2_CMSDK_S +#define GPIO2_CMSDK_DEV GPIO2_CMSDK_DEV_S +#define GPIO3_CMSDK_S +#define GPIO3_CMSDK_DEV GPIO3_CMSDK_DEV_S + +/* System Watchdogs */ +#define SYSWDOG_ARMV8_M_S +#define SYSWDOG_ARMV8_M_DEV SYSWDOG_ARMV8_M_DEV_S + +/* ARM MPC SIE 300 driver structures */ +#define MPC_VM0_S +#define MPC_VM0_DEV MPC_VM0_DEV_S +#define MPC_VM1_S +#define MPC_VM1_DEV MPC_VM1_DEV_S +#define MPC_SSRAM2_S +#define MPC_SSRAM2_DEV MPC_SSRAM2_DEV_S +#define MPC_SSRAM3_S +#define MPC_SSRAM3_DEV MPC_SSRAM3_DEV_S + +/* ARM PPC driver structures */ +#define PPC_SSE300_MAIN0_S +#define PPC_SSE300_MAIN0_DEV PPC_SSE300_MAIN0_DEV_S +#define PPC_SSE300_MAIN_EXP0_S +#define PPC_SSE300_MAIN_EXP0_DEV PPC_SSE300_MAIN_EXP0_DEV_S +#define PPC_SSE300_MAIN_EXP1_S +#define PPC_SSE300_MAIN_EXP1_DEV PPC_SSE300_MAIN_EXP1_DEV_S +#define PPC_SSE300_MAIN_EXP2_S +#define PPC_SSE300_MAIN_EXP2_DEV PPC_SSE300_MAIN_EXP2_DEV_S +#define PPC_SSE300_MAIN_EXP3_S +#define PPC_SSE300_MAIN_EXP3_DEV PPC_SSE300_MAIN_EXP3_DEV_S +#define PPC_SSE300_PERIPH0_S +#define PPC_SSE300_PERIPH0_DEV PPC_SSE300_PERIPH0_DEV_S +#define PPC_SSE300_PERIPH1_S +#define PPC_SSE300_PERIPH1_DEV PPC_SSE300_PERIPH1_DEV_S +#define PPC_SSE300_PERIPH_EXP0_S +#define PPC_SSE300_PERIPH_EXP0_DEV PPC_SSE300_PERIPH_EXP0_DEV_S +#define PPC_SSE300_PERIPH_EXP1_S +#define PPC_SSE300_PERIPH_EXP1_DEV PPC_SSE300_PERIPH_EXP1_DEV_S +#define PPC_SSE300_PERIPH_EXP2_S +#define PPC_SSE300_PERIPH_EXP2_DEV PPC_SSE300_PERIPH_EXP2_DEV_S +#define PPC_SSE300_PERIPH_EXP3_S +#define PPC_SSE300_PERIPH_EXP3_DEV PPC_SSE300_PERIPH_EXP3_DEV_S + +/* ARM SPI PL022 */ +/* Invalid device stubs are not defined */ +#define DEFAULT_SPI_SPEED_HZ 4000000U /* 4MHz */ +#define SPI1_PL022_S +#define SPI1_PL022_DEV SPI1_PL022_DEV_S + + +#endif /* __DEVICE_CFG_H__ */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.ld b/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.ld new file mode 100644 index 000000000..5c64ad4f1 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.ld @@ -0,0 +1,242 @@ +;/* +; * Copyright (c) 2009-2023 Arm Limited +; * +; * Licensed under the Apache License, Version 2.0 (the "License"); +; * you may not use this file except in compliance with the License. +; * You may obtain a copy of the License at +; * +; * http://www.apache.org/licenses/LICENSE-2.0 +; * +; * Unless required by applicable law or agreed to in writing, software +; * distributed under the License is distributed on an "AS IS" BASIS, +; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; * See the License for the specific language governing permissions and +; * limitations under the License. +; */ + +/* Linker script to configure memory regions. */ +/* This file will be run trough the pre-processor. */ + +#include "region_defs.h" + +MEMORY +{ + FLASH (rx) : ORIGIN = S_CODE_START, LENGTH = S_CODE_SIZE + RAM (rw) : ORIGIN = S_DATA_START, LENGTH = S_DATA_SIZE +} + +__heap_size__ = HEAP_SIZE; +__stack_size__ = STACK_SIZE; + +/* Library configurations */ +GROUP(libgcc.a libc.a libm.a libnosys.a) + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. 
+ * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapBase + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + * __Vectors_End + * __Vectors_Size + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + __Vectors_End = .; + __Vectors_Size = __Vectors_End - __Vectors; + __end__ = .; + + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + /* + * Place the CMSE Veneers (containing the SG instruction) after the code, in a + * separate 32 bytes aligned region so that the SAU can programmed to just set + * this region as Non-Secure Callable. The maximum size of this executable + * region makes it only used the space left over by the ER_CODE region + * so that you can rely on code+veneer size combined will not exceed the + * S_CODE_SIZE value. We also substract from the available space the + * area used to align this section on 32 bytes boundary (for SAU conf). + */ + .gnu.sgstubs : ALIGN(32) + { + *(.gnu.sgstubs*) + } > FLASH + . = ALIGN(32); + Image$$ER_CODE_CMSE_VENEER$$Base = ADDR(.gnu.sgstubs); + Image$$ER_CODE_CMSE_VENEER$$Limit = .; + Image$$ER_CODE_CMSE_VENEER$$Length = Image$$ER_CODE_CMSE_VENEER$$Limit - Image$$ER_CODE_CMSE_VENEER$$Base; + + /* Make sure veneers fit into code memory */ + ASSERT(((S_CODE_START + S_CODE_SIZE) > Image$$ER_CODE_CMSE_VENEER$$Limit), "Veneer region does not fit into code memory") + + .ARM.extab : ALIGN(32) + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + /* To copy multiple ROM to RAM sections, + * define etext2/data2_start/data2_end and + * define __STARTUP_COPY_MULTIPLE in startup_cmsdk_mps2_sse_200.S */ + .copy.table : ALIGN(4) + { + __copy_table_start__ = .; + LONG (__etext) + LONG (__data_start__) + LONG ((__data_end__ - __data_start__) / 4) + LONG (DEFINED(__etext2) ? __etext2 : 0) + LONG (DEFINED(__data2_start__) ? __data2_start__ : 0) + LONG (DEFINED(__data2_start__) ? ((__data2_end__ - __data2_start__) / 4) : 0) + __copy_table_end__ = .; + } > FLASH + + /* To clear multiple BSS sections, + * uncomment .zero.table section and, + * define __STARTUP_CLEAR_BSS_MULTIPLE in startup_ARMCMx.S */ + .zero.table : ALIGN(4) + { + __zero_table_start__ = .; + LONG (__bss_start__) + LONG ((__bss_end__ - __bss_start__) / 4) + LONG (DEFINED(__bss2_start__) ? __bss2_start__ : 0) + LONG (DEFINED(__bss2_start__) ? ((__bss2_end__ - __bss2_start__) / 4) : 0) + __zero_table_end__ = .; + } > FLASH + + __etext = ALIGN(4); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data*) + + . 
= ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM AT> FLASH + + .bss : ALIGN(4) + { + __bss_start__ = .; + *(.bss*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM + + bss_size = __bss_end__ - __bss_start__; + + .stack : ALIGN(8) + { + __StackLimit = .; + KEEP(*(.stack*)) + . += __stack_size__ - 0x8; + __StackTop = .; + } > RAM + + .msp_stack_seal_res : + { + . += 0x8; + } > RAM + __StackSeal = ADDR(.msp_stack_seal_res); + + .heap : ALIGN(8) + { + __end__ = .; + PROVIDE(end = .); + __HeapBase = .; + . += __heap_size__; + __HeapLimit = .; + __heap_limit = .; /* Add for _sbrk */ + } > RAM + + /* Set stack top to end of the used RAM section, and stack limit move down by + * size of stack_dummy section */ + PROVIDE(__stack = __StackTop); + + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackTop <= (S_DATA_START + S_DATA_SIZE), "Secure RAM region overflowed") +} diff --git a/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.ld.base@1.0.0 b/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.ld.base@1.0.0 new file mode 100644 index 000000000..ff09e8e31 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.ld.base@1.0.0 @@ -0,0 +1,242 @@ +;/* +; * Copyright (c) 2009-2023 Arm Limited +; * +; * Licensed under the Apache License, Version 2.0 (the "License"); +; * you may not use this file except in compliance with the License. +; * You may obtain a copy of the License at +; * +; * http://www.apache.org/licenses/LICENSE-2.0 +; * +; * Unless required by applicable law or agreed to in writing, software +; * distributed under the License is distributed on an "AS IS" BASIS, +; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; * See the License for the specific language governing permissions and +; * limitations under the License. +; */ + +/* Linker script to configure memory regions. */ +/* This file will be run trough the pre-processor. */ + +#include "region_defs.h" + +MEMORY +{ + FLASH (rx) : ORIGIN = S_CODE_START, LENGTH = S_CODE_SIZE + RAM (rwx) : ORIGIN = S_DATA_START, LENGTH = S_DATA_SIZE +} + +__heap_size__ = HEAP_SIZE; +__stack_size__ = STACK_SIZE; + +/* Library configurations */ +GROUP(libgcc.a libc.a libm.a libnosys.a) + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. 
+ * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapBase + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + * __Vectors_End + * __Vectors_Size + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + __Vectors_End = .; + __Vectors_Size = __Vectors_End - __Vectors; + __end__ = .; + + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + /* + * Place the CMSE Veneers (containing the SG instruction) after the code, in a + * separate 32 bytes aligned region so that the SAU can programmed to just set + * this region as Non-Secure Callable. The maximum size of this executable + * region makes it only used the space left over by the ER_CODE region + * so that you can rely on code+veneer size combined will not exceed the + * S_CODE_SIZE value. We also substract from the available space the + * area used to align this section on 32 bytes boundary (for SAU conf). + */ + .gnu.sgstubs : ALIGN(32) + { + *(.gnu.sgstubs*) + } > FLASH + . = ALIGN(32); + Image$$ER_CODE_CMSE_VENEER$$Base = ADDR(.gnu.sgstubs); + Image$$ER_CODE_CMSE_VENEER$$Limit = .; + Image$$ER_CODE_CMSE_VENEER$$Length = Image$$ER_CODE_CMSE_VENEER$$Limit - Image$$ER_CODE_CMSE_VENEER$$Base; + + /* Make sure veneers fit into code memory */ + ASSERT(((S_CODE_START + S_CODE_SIZE) > Image$$ER_CODE_CMSE_VENEER$$Limit), "Veneer region does not fit into code memory") + + .ARM.extab : ALIGN(32) + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + /* To copy multiple ROM to RAM sections, + * define etext2/data2_start/data2_end and + * define __STARTUP_COPY_MULTIPLE in startup_cmsdk_mps2_sse_200.S */ + .copy.table : ALIGN(4) + { + __copy_table_start__ = .; + LONG (__etext) + LONG (__data_start__) + LONG ((__data_end__ - __data_start__) / 4) + LONG (DEFINED(__etext2) ? __etext2 : 0) + LONG (DEFINED(__data2_start__) ? __data2_start__ : 0) + LONG (DEFINED(__data2_start__) ? ((__data2_end__ - __data2_start__) / 4) : 0) + __copy_table_end__ = .; + } > FLASH + + /* To clear multiple BSS sections, + * uncomment .zero.table section and, + * define __STARTUP_CLEAR_BSS_MULTIPLE in startup_ARMCMx.S */ + .zero.table : ALIGN(4) + { + __zero_table_start__ = .; + LONG (__bss_start__) + LONG ((__bss_end__ - __bss_start__) / 4) + LONG (DEFINED(__bss2_start__) ? __bss2_start__ : 0) + LONG (DEFINED(__bss2_start__) ? ((__bss2_end__ - __bss2_start__) / 4) : 0) + __zero_table_end__ = .; + } > FLASH + + __etext = ALIGN(4); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data*) + + . 
= ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM AT> FLASH + + .bss : ALIGN(4) + { + __bss_start__ = .; + *(.bss*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM + + bss_size = __bss_end__ - __bss_start__; + + .stack : ALIGN(8) + { + __StackLimit = .; + KEEP(*(.stack*)) + . += __stack_size__ - 0x8; + __StackTop = .; + } > RAM + + .msp_stack_seal_res : + { + . += 0x8; + } > RAM + __StackSeal = ADDR(.msp_stack_seal_res); + + .heap : ALIGN(8) + { + __end__ = .; + PROVIDE(end = .); + __HeapBase = .; + . += __heap_size__; + __HeapLimit = .; + __heap_limit = .; /* Add for _sbrk */ + } > RAM + + /* Set stack top to end of the used RAM section, and stack limit move down by + * size of stack_dummy section */ + PROVIDE(__stack = __StackTop); + + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackTop <= (S_DATA_START + S_DATA_SIZE), "Secure RAM region overflowed") +} diff --git a/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.sct b/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.sct new file mode 100644 index 000000000..8b95c189d --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.sct @@ -0,0 +1,62 @@ +#! armclang --target=arm-arm-none-eabi -march=armv8.1-m.main -E -xc + +;/* +; * Copyright (c) 2018-2023 Arm Limited +; * +; * Licensed under the Apache License, Version 2.0 (the "License"); +; * you may not use this file except in compliance with the License. +; * You may obtain a copy of the License at +; * +; * http://www.apache.org/licenses/LICENSE-2.0 +; * +; * Unless required by applicable law or agreed to in writing, software +; * distributed under the License is distributed on an "AS IS" BASIS, +; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; * See the License for the specific language governing permissions and +; * limitations under the License. +; * +; */ + +#include "region_defs.h" + +LR_CODE S_CODE_START { + ER_CODE S_CODE_START { + *.o (RESET +First) + .ANY (+RO) + } + + /* This empty, zero long execution region is here to mark the limit address + * of the last execution region that is allocated in SRAM. + */ + CODE_WATERMARK +0 EMPTY 0x0 { + } + /* Make sure that the sections allocated in the SRAM does not exceed the + * size of the SRAM available. + */ + ScatterAssert(ImageLimit(CODE_WATERMARK) <= S_CODE_START + S_CODE_SIZE) + + ER_DATA S_DATA_START { + .ANY (+ZI +RW) + } + + #if HEAP_SIZE > 0 + ARM_LIB_HEAP +0 ALIGN 8 EMPTY HEAP_SIZE { ; Reserve empty region for heap + } + #endif + + ARM_LIB_STACK +0 ALIGN 32 EMPTY STACK_SIZE - 0x8 { ; Reserve empty region for stack + } + + STACKSEAL +0 EMPTY 0x8 { + } + + /* This empty, zero long execution region is here to mark the limit address + * of the last execution region that is allocated in SRAM. + */ + SRAM_WATERMARK +0 EMPTY 0x0 { + } + /* Make sure that the sections allocated in the SRAM does not exceed the + * size of the SRAM available. 
+ */ + ScatterAssert(ImageLimit(SRAM_WATERMARK) <= S_DATA_START + S_DATA_SIZE) +} diff --git a/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.sct.base@1.1.0 b/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.sct.base@1.1.0 new file mode 100644 index 000000000..8b95c189d --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/linker_SSE300MPS3_secure.sct.base@1.1.0 @@ -0,0 +1,62 @@ +#! armclang --target=arm-arm-none-eabi -march=armv8.1-m.main -E -xc + +;/* +; * Copyright (c) 2018-2023 Arm Limited +; * +; * Licensed under the Apache License, Version 2.0 (the "License"); +; * you may not use this file except in compliance with the License. +; * You may obtain a copy of the License at +; * +; * http://www.apache.org/licenses/LICENSE-2.0 +; * +; * Unless required by applicable law or agreed to in writing, software +; * distributed under the License is distributed on an "AS IS" BASIS, +; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; * See the License for the specific language governing permissions and +; * limitations under the License. +; * +; */ + +#include "region_defs.h" + +LR_CODE S_CODE_START { + ER_CODE S_CODE_START { + *.o (RESET +First) + .ANY (+RO) + } + + /* This empty, zero long execution region is here to mark the limit address + * of the last execution region that is allocated in SRAM. + */ + CODE_WATERMARK +0 EMPTY 0x0 { + } + /* Make sure that the sections allocated in the SRAM does not exceed the + * size of the SRAM available. + */ + ScatterAssert(ImageLimit(CODE_WATERMARK) <= S_CODE_START + S_CODE_SIZE) + + ER_DATA S_DATA_START { + .ANY (+ZI +RW) + } + + #if HEAP_SIZE > 0 + ARM_LIB_HEAP +0 ALIGN 8 EMPTY HEAP_SIZE { ; Reserve empty region for heap + } + #endif + + ARM_LIB_STACK +0 ALIGN 32 EMPTY STACK_SIZE - 0x8 { ; Reserve empty region for stack + } + + STACKSEAL +0 EMPTY 0x8 { + } + + /* This empty, zero long execution region is here to mark the limit address + * of the last execution region that is allocated in SRAM. + */ + SRAM_WATERMARK +0 EMPTY 0x0 { + } + /* Make sure that the sections allocated in the SRAM does not exceed the + * size of the SRAM available. + */ + ScatterAssert(ImageLimit(SRAM_WATERMARK) <= S_DATA_START + S_DATA_SIZE) +} diff --git a/dsppp/RTE/Device/SSE-300-MPS3/region_defs.h b/dsppp/RTE/Device/SSE-300-MPS3/region_defs.h new file mode 100644 index 000000000..32ac16b37 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/region_defs.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016-2022 Arm Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __REGION_DEFS_H__ +#define __REGION_DEFS_H__ + +#include "region_limits.h" + +/* ************************************************************** + * WARNING: this file is parsed both by the C/C++ compiler + * and the linker. As a result the syntax must be valid not only + * for C/C++ but for the linker scripts too. + * Beware of the following limitations: + * - LD (GCC linker) requires white space around operators. 
+ * - UL postfix for macros is not suported by the linker script + ****************************************************************/ + +/* Secure regions */ +#define S_CODE_START ( S_ROM_ALIAS ) +#define S_CODE_SIZE ( TOTAL_S_ROM_SIZE ) +#define S_CODE_LIMIT ( S_CODE_START + S_CODE_SIZE ) + +#define S_DATA_START ( S_RAM_ALIAS ) +#define S_DATA_SIZE ( TOTAL_S_RAM_SIZE ) +#define S_DATA_LIMIT ( S_DATA_START + S_DATA_SIZE ) + +#define S_DDR4_START ( S_DDR4_ALIAS ) +#define S_DDR4_SIZE ( TOTAL_S_DDR4_SIZE ) +#define S_DDR4_LIMIT ( S_DDR4_START + S_DDR4_SIZE ) + +#endif /* __REGION_DEFS_H__ */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/region_defs.h.base@1.0.0 b/dsppp/RTE/Device/SSE-300-MPS3/region_defs.h.base@1.0.0 new file mode 100644 index 000000000..32ac16b37 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/region_defs.h.base@1.0.0 @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016-2022 Arm Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __REGION_DEFS_H__ +#define __REGION_DEFS_H__ + +#include "region_limits.h" + +/* ************************************************************** + * WARNING: this file is parsed both by the C/C++ compiler + * and the linker. As a result the syntax must be valid not only + * for C/C++ but for the linker scripts too. + * Beware of the following limitations: + * - LD (GCC linker) requires white space around operators. + * - UL postfix for macros is not suported by the linker script + ****************************************************************/ + +/* Secure regions */ +#define S_CODE_START ( S_ROM_ALIAS ) +#define S_CODE_SIZE ( TOTAL_S_ROM_SIZE ) +#define S_CODE_LIMIT ( S_CODE_START + S_CODE_SIZE ) + +#define S_DATA_START ( S_RAM_ALIAS ) +#define S_DATA_SIZE ( TOTAL_S_RAM_SIZE ) +#define S_DATA_LIMIT ( S_DATA_START + S_DATA_SIZE ) + +#define S_DDR4_START ( S_DDR4_ALIAS ) +#define S_DDR4_SIZE ( TOTAL_S_DDR4_SIZE ) +#define S_DDR4_LIMIT ( S_DDR4_START + S_DDR4_SIZE ) + +#endif /* __REGION_DEFS_H__ */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/region_limits.h b/dsppp/RTE/Device/SSE-300-MPS3/region_limits.h new file mode 100644 index 000000000..0d600a363 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/region_limits.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018-2022 Arm Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __REGION_LIMITS_H__ +#define __REGION_LIMITS_H__ + +/* ************************************************************** + * WARNING: this file is parsed both by the C/C++ compiler + * and the linker. As a result the syntax must be valid not only + * for C/C++ but for the linker scripts too. + * Beware of the following limitations: + * - LD (GCC linker) requires white space around operators. + * - UL postfix for macros is not suported by the linker script + ****************************************************************/ + +/* Secure Code */ +#define S_ROM_ALIAS (0x10000000) /* ITCM_BASE_S */ +#define TOTAL_S_ROM_SIZE (0x00080000) /* 512 kB */ + +/* Secure Data */ +#define S_RAM_ALIAS (0x30000000) /* DTCM_BASE_S */ +#define TOTAL_S_RAM_SIZE (0x00080000) /* 512 kB */ + +/* Secure DDR4 */ +#define S_DDR4_ALIAS (0x70000000) /* DDR4_BLK1_BASE_S */ +#define TOTAL_S_DDR4_SIZE (0x10000000) /* 256 MB */ + +/* Heap and Stack sizes for secure and nonsecure applications */ +#define HEAP_SIZE (0x00038000) /* 224 KiB */ +#define STACK_SIZE (0x00002000) /* 8 KiB */ + +#endif /* __REGION_LIMITS_H__ */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/region_limits.h.base@1.0.0 b/dsppp/RTE/Device/SSE-300-MPS3/region_limits.h.base@1.0.0 new file mode 100644 index 000000000..e7897866a --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/region_limits.h.base@1.0.0 @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018-2022 Arm Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __REGION_LIMITS_H__ +#define __REGION_LIMITS_H__ + +/* ************************************************************** + * WARNING: this file is parsed both by the C/C++ compiler + * and the linker. As a result the syntax must be valid not only + * for C/C++ but for the linker scripts too. + * Beware of the following limitations: + * - LD (GCC linker) requires white space around operators. + * - UL postfix for macros is not suported by the linker script + ****************************************************************/ + +/* Secure Code */ +#define S_ROM_ALIAS (0x10000000) /* ITCM_BASE_S */ +#define TOTAL_S_ROM_SIZE (0x00080000) /* 512 kB */ + +/* Secure Data */ +#define S_RAM_ALIAS (0x30000000) /* DTCM_BASE_S */ +#define TOTAL_S_RAM_SIZE (0x00080000) /* 512 kB */ + +/* Secure DDR4 */ +#define S_DDR4_ALIAS (0x70000000) /* DDR4_BLK1_BASE_S */ +#define TOTAL_S_DDR4_SIZE (0x10000000) /* 256 MB */ + +/* Heap and Stack sizes for secure and nonsecure applications */ +#define HEAP_SIZE (0x00000400) /* 1 KiB */ +#define STACK_SIZE (0x00000400) /* 1 KiB */ + +#endif /* __REGION_LIMITS_H__ */ diff --git a/dsppp/RTE/Device/SSE-300-MPS3/startup_SSE300MPS3.c b/dsppp/RTE/Device/SSE-300-MPS3/startup_SSE300MPS3.c new file mode 100644 index 000000000..72b39ca55 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/startup_SSE300MPS3.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file is derivative of CMSIS V5.9.0 startup_ARMCM55.c + * Git SHA: 2b7495b8535bdcb306dac29b9ded4cfb679d7e5c + */ + +#include "SSE300MPS3.h" + +/*---------------------------------------------------------------------------- + External References + *----------------------------------------------------------------------------*/ +extern uint32_t __INITIAL_SP; +extern uint32_t __STACK_LIMIT; +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +extern uint64_t __STACK_SEAL; +#endif + +extern void __PROGRAM_START(void) __NO_RETURN; + +/*---------------------------------------------------------------------------- + Internal References + *----------------------------------------------------------------------------*/ +void Reset_Handler (void) __NO_RETURN; + +/*---------------------------------------------------------------------------- + Exception / Interrupt Handler + *----------------------------------------------------------------------------*/ +#define DEFAULT_IRQ_HANDLER(handler_name) \ +void __WEAK handler_name(void) __NO_RETURN; \ +void handler_name(void) { \ + while(1); \ +} + +/* Exceptions */ +DEFAULT_IRQ_HANDLER(NMI_Handler) +DEFAULT_IRQ_HANDLER(HardFault_Handler) +DEFAULT_IRQ_HANDLER(MemManage_Handler) +DEFAULT_IRQ_HANDLER(BusFault_Handler) +DEFAULT_IRQ_HANDLER(UsageFault_Handler) +DEFAULT_IRQ_HANDLER(SecureFault_Handler) +DEFAULT_IRQ_HANDLER(SVC_Handler) +DEFAULT_IRQ_HANDLER(DebugMon_Handler) +DEFAULT_IRQ_HANDLER(PendSV_Handler) +DEFAULT_IRQ_HANDLER(SysTick_Handler) + +DEFAULT_IRQ_HANDLER(NONSEC_WATCHDOG_RESET_REQ_Handler) +DEFAULT_IRQ_HANDLER(NONSEC_WATCHDOG_Handler) +DEFAULT_IRQ_HANDLER(SLOWCLK_Timer_Handler) +DEFAULT_IRQ_HANDLER(TFM_TIMER0_IRQ_Handler) +DEFAULT_IRQ_HANDLER(TIMER1_Handler) +DEFAULT_IRQ_HANDLER(TIMER2_Handler) +DEFAULT_IRQ_HANDLER(MPC_Handler) +DEFAULT_IRQ_HANDLER(PPC_Handler) +DEFAULT_IRQ_HANDLER(MSC_Handler) +DEFAULT_IRQ_HANDLER(BRIDGE_ERROR_Handler) +DEFAULT_IRQ_HANDLER(MGMT_PPU_Handler) +DEFAULT_IRQ_HANDLER(SYS_PPU_Handler) +DEFAULT_IRQ_HANDLER(CPU0_PPU_Handler) +DEFAULT_IRQ_HANDLER(DEBUG_PPU_Handler) +DEFAULT_IRQ_HANDLER(TIMER3_AON_Handler) +DEFAULT_IRQ_HANDLER(CPU0_CTI_0_Handler) +DEFAULT_IRQ_HANDLER(CPU0_CTI_1_Handler) + +DEFAULT_IRQ_HANDLER(System_Timestamp_Counter_Handler) +DEFAULT_IRQ_HANDLER(UARTRX0_Handler) +DEFAULT_IRQ_HANDLER(UARTTX0_Handler) +DEFAULT_IRQ_HANDLER(UARTRX1_Handler) +DEFAULT_IRQ_HANDLER(UARTTX1_Handler) +DEFAULT_IRQ_HANDLER(UARTRX2_Handler) +DEFAULT_IRQ_HANDLER(UARTTX2_Handler) +DEFAULT_IRQ_HANDLER(UARTRX3_Handler) +DEFAULT_IRQ_HANDLER(UARTTX3_Handler) +DEFAULT_IRQ_HANDLER(UARTRX4_Handler) +DEFAULT_IRQ_HANDLER(UARTTX4_Handler) +DEFAULT_IRQ_HANDLER(UART0_Combined_Handler) +DEFAULT_IRQ_HANDLER(UART1_Combined_Handler) +DEFAULT_IRQ_HANDLER(UART2_Combined_Handler) +DEFAULT_IRQ_HANDLER(UART3_Combined_Handler) +DEFAULT_IRQ_HANDLER(UART4_Combined_Handler) +DEFAULT_IRQ_HANDLER(UARTOVF_Handler) +DEFAULT_IRQ_HANDLER(ETHERNET_Handler) 
+DEFAULT_IRQ_HANDLER(I2S_Handler) +DEFAULT_IRQ_HANDLER(TOUCH_SCREEN_Handler) +DEFAULT_IRQ_HANDLER(USB_Handler) +DEFAULT_IRQ_HANDLER(SPI_ADC_Handler) +DEFAULT_IRQ_HANDLER(SPI_SHIELD0_Handler) +DEFAULT_IRQ_HANDLER(SPI_SHIELD1_Handler) +DEFAULT_IRQ_HANDLER(ETHOS_U55_Handler) +#ifdef CORSTONE300_AN547 +DEFAULT_IRQ_HANDLER(DMA_Ch_1_Error_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_1_Terminal_Count_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_1_Combined_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_2_Error_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_2_Terminal_Count_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_2_Combined_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_3_Error_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_3_Terminal_Count_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_3_Combined_Handler) +#endif +DEFAULT_IRQ_HANDLER(GPIO0_Combined_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_Combined_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_Combined_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_Combined_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_0_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_1_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_2_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_3_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_4_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_5_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_6_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_7_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_8_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_9_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_10_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_11_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_12_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_13_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_14_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_15_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_0_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_1_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_2_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_3_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_4_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_5_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_6_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_7_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_8_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_9_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_10_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_11_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_12_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_13_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_14_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_15_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_0_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_1_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_2_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_3_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_4_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_5_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_6_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_7_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_8_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_9_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_10_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_11_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_12_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_13_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_14_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_15_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_0_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_1_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_2_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_3_Handler) +DEFAULT_IRQ_HANDLER(UARTRX5_Handler) +DEFAULT_IRQ_HANDLER(UARTTX5_Handler) +DEFAULT_IRQ_HANDLER(UART5_Handler) + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +extern const VECTOR_TABLE_Type __VECTOR_TABLE[]; + const VECTOR_TABLE_Type __VECTOR_TABLE[] __VECTOR_TABLE_ATTRIBUTE = { + (VECTOR_TABLE_Type)(&__INITIAL_SP), /* Initial Stack Pointer */ + Reset_Handler, /* 
Reset Handler */ + NMI_Handler, /* -14: NMI Handler */ + HardFault_Handler, /* -13: Hard Fault Handler */ + MemManage_Handler, /* -12: MPU Fault Handler */ + BusFault_Handler, /* -11: Bus Fault Handler */ + UsageFault_Handler, /* -10: Usage Fault Handler */ + SecureFault_Handler, /* -9: Secure Fault Handler */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + SVC_Handler, /* -5: SVCall Handler */ + DebugMon_Handler, /* -4: Debug Monitor Handler */ + 0, /* Reserved */ + PendSV_Handler, /* -2: PendSV Handler */ + SysTick_Handler, /* -1: SysTick Handler */ + + NONSEC_WATCHDOG_RESET_REQ_Handler, /* 0: Non-Secure Watchdog Reset Request Handler */ + NONSEC_WATCHDOG_Handler, /* 1: Non-Secure Watchdog Handler */ + SLOWCLK_Timer_Handler, /* 2: SLOWCLK Timer Handler */ + TFM_TIMER0_IRQ_Handler, /* 3: TIMER 0 Handler */ + TIMER1_Handler, /* 4: TIMER 1 Handler */ + TIMER2_Handler, /* 5: TIMER 2 Handler */ + 0, /* 6: Reserved */ + 0, /* 7: Reserved */ + 0, /* 8: Reserved */ + MPC_Handler, /* 9: MPC Combined (Secure) Handler */ + PPC_Handler, /* 10: PPC Combined (Secure) Handler */ + MSC_Handler, /* 11: MSC Combined (Secure) Handler */ + BRIDGE_ERROR_Handler, /* 12: Bridge Error (Secure) Handler */ + 0, /* 13: Reserved */ + MGMT_PPU_Handler, /* 14: MGMT PPU Handler */ + SYS_PPU_Handler, /* 15: SYS PPU Handler */ + CPU0_PPU_Handler, /* 16: CPU0 PPU Handler */ + 0, /* 17: Reserved */ + 0, /* 18: Reserved */ + 0, /* 19: Reserved */ + 0, /* 20: Reserved */ + 0, /* 21: Reserved */ + 0, /* 22: Reserved */ + 0, /* 23: Reserved */ + 0, /* 24: Reserved */ + 0, /* 25: Reserved */ + DEBUG_PPU_Handler, /* 26: DEBUG PPU Handler */ + TIMER3_AON_Handler, /* 27: TIMER 3 AON Handler */ + CPU0_CTI_0_Handler, /* 28: CPU0 CTI IRQ 0 Handler */ + CPU0_CTI_1_Handler, /* 29: CPU0 CTI IRQ 1 Handler */ + 0, /* 30: Reserved */ + 0, /* 31: Reserved */ + + /* External interrupts */ + System_Timestamp_Counter_Handler, /* 32: System timestamp counter Handler */ + UARTRX0_Handler, /* 33: UART 0 RX Handler */ + UARTTX0_Handler, /* 34: UART 0 TX Handler */ + UARTRX1_Handler, /* 35: UART 1 RX Handler */ + UARTTX1_Handler, /* 36: UART 1 TX Handler */ + UARTRX2_Handler, /* 37: UART 2 RX Handler */ + UARTTX2_Handler, /* 38: UART 2 TX Handler */ + UARTRX3_Handler, /* 39: UART 3 RX Handler */ + UARTTX3_Handler, /* 40: UART 3 TX Handler */ + UARTRX4_Handler, /* 41: UART 4 RX Handler */ + UARTTX4_Handler, /* 42: UART 4 TX Handler */ + UART0_Combined_Handler, /* 43: UART 0 Combined Handler */ + UART1_Combined_Handler, /* 44: UART 1 Combined Handler */ + UART2_Combined_Handler, /* 45: UART 2 Combined Handler */ + UART3_Combined_Handler, /* 46: UART 3 Combined Handler */ + UART4_Combined_Handler, /* 47: UART 4 Combined Handler */ + UARTOVF_Handler, /* 48: UART 0, 1, 2, 3, 4 & 5 Overflow Handler */ + ETHERNET_Handler, /* 49: Ethernet Handler */ + I2S_Handler, /* 50: Audio I2S Handler */ + TOUCH_SCREEN_Handler, /* 51: Touch Screen Handler */ + USB_Handler, /* 52: USB Handler */ + SPI_ADC_Handler, /* 53: SPI ADC Handler */ + SPI_SHIELD0_Handler, /* 54: SPI (Shield 0) Handler */ + SPI_SHIELD1_Handler, /* 55: SPI (Shield 0) Handler */ + ETHOS_U55_Handler, /* 56: Ethos-U55 Handler */ +#ifdef CORSTONE300_AN547 + 0, /* 57: Reserved */ + 0, /* 58: Reserved */ + 0, /* 59: Reserved */ + DMA_Ch_1_Error_Handler, /* 60: DMA Ch1 Error Handler */ + DMA_Ch_1_Terminal_Count_Handler, /* 61: DMA Ch1 Terminal Count Handler */ + DMA_Ch_1_Combined_Handler, /* 62: DMA Ch1 Combined Handler */ + DMA_Ch_2_Error_Handler, /* 63: DMA Ch2 Error Handler */ + 
DMA_Ch_2_Terminal_Count_Handler, /* 64: DMA Ch2 Terminal Count Handler */ + DMA_Ch_2_Combined_Handler, /* 65: DMA Ch2 Combined Handler */ + DMA_Ch_3_Error_Handler, /* 66: DMA Ch3 Error Handler */ + DMA_Ch_3_Terminal_Count_Handler, /* 67: DMA Ch3 Terminal Count Handler */ + DMA_Ch_3_Combined_Handler, /* 68: DMA Ch3 Combined Handler */ +#else + 0, /* 57: Reserved */ + 0, /* 58: Reserved */ + 0, /* 59: Reserved */ + 0, /* 60: Reserved */ + 0, /* 61: Reserved */ + 0, /* 62: Reserved */ + 0, /* 63: Reserved */ + 0, /* 64: Reserved */ + 0, /* 65: Reserved */ + 0, /* 66: Reserved */ + 0, /* 67: Reserved */ + 0, /* 68: Reserved */ +#endif + GPIO0_Combined_Handler, /* 69: GPIO 0 Combined Handler */ + GPIO1_Combined_Handler, /* 70: GPIO 1 Combined Handler */ + GPIO2_Combined_Handler, /* 71: GPIO 2 Combined Handler */ + GPIO3_Combined_Handler, /* 72: GPIO 3 Combined Handler */ + GPIO0_0_Handler, /* 73: GPIO0 Pin 0 Handler */ + GPIO0_1_Handler, /* 74: GPIO0 Pin 1 Handler */ + GPIO0_2_Handler, /* 75: GPIO0 Pin 2 Handler */ + GPIO0_3_Handler, /* 76: GPIO0 Pin 3 Handler */ + GPIO0_4_Handler, /* 77: GPIO0 Pin 4 Handler */ + GPIO0_5_Handler, /* 78: GPIO0 Pin 5 Handler */ + GPIO0_6_Handler, /* 79: GPIO0 Pin 6 Handler */ + GPIO0_7_Handler, /* 80: GPIO0 Pin 7 Handler */ + GPIO0_8_Handler, /* 81: GPIO0 Pin 8 Handler */ + GPIO0_9_Handler, /* 82: GPIO0 Pin 9 Handler */ + GPIO0_10_Handler, /* 83: GPIO0 Pin 10 Handler */ + GPIO0_11_Handler, /* 84: GPIO0 Pin 11 Handler */ + GPIO0_12_Handler, /* 85: GPIO0 Pin 12 Handler */ + GPIO0_13_Handler, /* 86: GPIO0 Pin 13 Handler */ + GPIO0_14_Handler, /* 87: GPIO0 Pin 14 Handler */ + GPIO0_15_Handler, /* 88: GPIO0 Pin 15 Handler */ + GPIO1_0_Handler, /* 89: GPIO1 Pin 0 Handler */ + GPIO1_1_Handler, /* 90: GPIO1 Pin 1 Handler */ + GPIO1_2_Handler, /* 91: GPIO1 Pin 2 Handler */ + GPIO1_3_Handler, /* 92: GPIO1 Pin 3 Handler */ + GPIO1_4_Handler, /* 93: GPIO1 Pin 4 Handler */ + GPIO1_5_Handler, /* 94: GPIO1 Pin 5 Handler */ + GPIO1_6_Handler, /* 95: GPIO1 Pin 6 Handler */ + GPIO1_7_Handler, /* 96: GPIO1 Pin 7 Handler */ + GPIO1_8_Handler, /* 97: GPIO1 Pin 8 Handler */ + GPIO1_9_Handler, /* 98: GPIO1 Pin 9 Handler */ + GPIO1_10_Handler, /* 99: GPIO1 Pin 10 Handler */ + GPIO1_11_Handler, /* 100: GPIO1 Pin 11 Handler */ + GPIO1_12_Handler, /* 101: GPIO1 Pin 12 Handler */ + GPIO1_13_Handler, /* 102: GPIO1 Pin 13 Handler */ + GPIO1_14_Handler, /* 103: GPIO1 Pin 14 Handler */ + GPIO1_15_Handler, /* 104: GPIO1 Pin 15 Handler */ + GPIO2_0_Handler, /* 105: GPIO2 Pin 0 Handler */ + GPIO2_1_Handler, /* 106: GPIO2 Pin 1 Handler */ + GPIO2_2_Handler, /* 107: GPIO2 Pin 2 Handler */ + GPIO2_3_Handler, /* 108: GPIO2 Pin 3 Handler */ + GPIO2_4_Handler, /* 109: GPIO2 Pin 4 Handler */ + GPIO2_5_Handler, /* 110: GPIO2 Pin 5 Handler */ + GPIO2_6_Handler, /* 111: GPIO2 Pin 6 Handler */ + GPIO2_7_Handler, /* 112: GPIO2 Pin 7 Handler */ + GPIO2_8_Handler, /* 113: GPIO2 Pin 8 Handler */ + GPIO2_9_Handler, /* 114: GPIO2 Pin 9 Handler */ + GPIO2_10_Handler, /* 115: GPIO2 Pin 10 Handler */ + GPIO2_11_Handler, /* 116: GPIO2 Pin 11 Handler */ + GPIO2_12_Handler, /* 117: GPIO2 Pin 12 Handler */ + GPIO2_13_Handler, /* 118: GPIO2 Pin 13 Handler */ + GPIO2_14_Handler, /* 119: GPIO2 Pin 14 Handler */ + GPIO2_15_Handler, /* 120: GPIO2 Pin 15 Handler */ + GPIO3_0_Handler, /* 121: GPIO3 Pin 0 Handler */ + GPIO3_1_Handler, /* 122: GPIO3 Pin 1 Handler */ + GPIO3_2_Handler, /* 123: GPIO3 Pin 2 Handler */ + GPIO3_3_Handler, /* 124: GPIO3 Pin 3 Handler */ + UARTRX5_Handler, /* 125: UART 5 RX Interrupt */ + 
UARTTX5_Handler, /* 126: UART 5 TX Interrupt */ + UART5_Handler, /* 127: UART 5 combined Interrupt */ + 0, /* 128: Reserved */ + 0, /* 129: Reserved */ + 0, /* 130: Reserved */ +}; + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic pop +#endif + +/*---------------------------------------------------------------------------- + Reset Handler called on controller reset + *----------------------------------------------------------------------------*/ +void Reset_Handler(void) +{ + __set_PSP((uint32_t)(&__INITIAL_SP)); + + __set_MSPLIM((uint32_t)(&__STACK_LIMIT)); + __set_PSPLIM((uint32_t)(&__STACK_LIMIT)); + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + __TZ_set_STACKSEAL_S((uint32_t *)(&__STACK_SEAL)); +#endif + + SystemInit(); /* CMSIS System Initialization */ + __PROGRAM_START(); /* Enter PreMain (C library entry point) */ +} diff --git a/dsppp/RTE/Device/SSE-300-MPS3/startup_SSE300MPS3.c.base@1.1.1 b/dsppp/RTE/Device/SSE-300-MPS3/startup_SSE300MPS3.c.base@1.1.1 new file mode 100644 index 000000000..72b39ca55 --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/startup_SSE300MPS3.c.base@1.1.1 @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This file is derivative of CMSIS V5.9.0 startup_ARMCM55.c + * Git SHA: 2b7495b8535bdcb306dac29b9ded4cfb679d7e5c + */ + +#include "SSE300MPS3.h" + +/*---------------------------------------------------------------------------- + External References + *----------------------------------------------------------------------------*/ +extern uint32_t __INITIAL_SP; +extern uint32_t __STACK_LIMIT; +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +extern uint64_t __STACK_SEAL; +#endif + +extern void __PROGRAM_START(void) __NO_RETURN; + +/*---------------------------------------------------------------------------- + Internal References + *----------------------------------------------------------------------------*/ +void Reset_Handler (void) __NO_RETURN; + +/*---------------------------------------------------------------------------- + Exception / Interrupt Handler + *----------------------------------------------------------------------------*/ +#define DEFAULT_IRQ_HANDLER(handler_name) \ +void __WEAK handler_name(void) __NO_RETURN; \ +void handler_name(void) { \ + while(1); \ +} + +/* Exceptions */ +DEFAULT_IRQ_HANDLER(NMI_Handler) +DEFAULT_IRQ_HANDLER(HardFault_Handler) +DEFAULT_IRQ_HANDLER(MemManage_Handler) +DEFAULT_IRQ_HANDLER(BusFault_Handler) +DEFAULT_IRQ_HANDLER(UsageFault_Handler) +DEFAULT_IRQ_HANDLER(SecureFault_Handler) +DEFAULT_IRQ_HANDLER(SVC_Handler) +DEFAULT_IRQ_HANDLER(DebugMon_Handler) +DEFAULT_IRQ_HANDLER(PendSV_Handler) +DEFAULT_IRQ_HANDLER(SysTick_Handler) + +DEFAULT_IRQ_HANDLER(NONSEC_WATCHDOG_RESET_REQ_Handler) +DEFAULT_IRQ_HANDLER(NONSEC_WATCHDOG_Handler) +DEFAULT_IRQ_HANDLER(SLOWCLK_Timer_Handler) +DEFAULT_IRQ_HANDLER(TFM_TIMER0_IRQ_Handler) +DEFAULT_IRQ_HANDLER(TIMER1_Handler) +DEFAULT_IRQ_HANDLER(TIMER2_Handler) +DEFAULT_IRQ_HANDLER(MPC_Handler) +DEFAULT_IRQ_HANDLER(PPC_Handler) +DEFAULT_IRQ_HANDLER(MSC_Handler) +DEFAULT_IRQ_HANDLER(BRIDGE_ERROR_Handler) +DEFAULT_IRQ_HANDLER(MGMT_PPU_Handler) +DEFAULT_IRQ_HANDLER(SYS_PPU_Handler) +DEFAULT_IRQ_HANDLER(CPU0_PPU_Handler) +DEFAULT_IRQ_HANDLER(DEBUG_PPU_Handler) +DEFAULT_IRQ_HANDLER(TIMER3_AON_Handler) +DEFAULT_IRQ_HANDLER(CPU0_CTI_0_Handler) +DEFAULT_IRQ_HANDLER(CPU0_CTI_1_Handler) + +DEFAULT_IRQ_HANDLER(System_Timestamp_Counter_Handler) +DEFAULT_IRQ_HANDLER(UARTRX0_Handler) +DEFAULT_IRQ_HANDLER(UARTTX0_Handler) +DEFAULT_IRQ_HANDLER(UARTRX1_Handler) +DEFAULT_IRQ_HANDLER(UARTTX1_Handler) +DEFAULT_IRQ_HANDLER(UARTRX2_Handler) +DEFAULT_IRQ_HANDLER(UARTTX2_Handler) +DEFAULT_IRQ_HANDLER(UARTRX3_Handler) +DEFAULT_IRQ_HANDLER(UARTTX3_Handler) +DEFAULT_IRQ_HANDLER(UARTRX4_Handler) +DEFAULT_IRQ_HANDLER(UARTTX4_Handler) +DEFAULT_IRQ_HANDLER(UART0_Combined_Handler) +DEFAULT_IRQ_HANDLER(UART1_Combined_Handler) +DEFAULT_IRQ_HANDLER(UART2_Combined_Handler) +DEFAULT_IRQ_HANDLER(UART3_Combined_Handler) +DEFAULT_IRQ_HANDLER(UART4_Combined_Handler) +DEFAULT_IRQ_HANDLER(UARTOVF_Handler) +DEFAULT_IRQ_HANDLER(ETHERNET_Handler) +DEFAULT_IRQ_HANDLER(I2S_Handler) +DEFAULT_IRQ_HANDLER(TOUCH_SCREEN_Handler) +DEFAULT_IRQ_HANDLER(USB_Handler) +DEFAULT_IRQ_HANDLER(SPI_ADC_Handler) +DEFAULT_IRQ_HANDLER(SPI_SHIELD0_Handler) +DEFAULT_IRQ_HANDLER(SPI_SHIELD1_Handler) +DEFAULT_IRQ_HANDLER(ETHOS_U55_Handler) +#ifdef CORSTONE300_AN547 +DEFAULT_IRQ_HANDLER(DMA_Ch_1_Error_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_1_Terminal_Count_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_1_Combined_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_2_Error_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_2_Terminal_Count_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_2_Combined_Handler) 
+DEFAULT_IRQ_HANDLER(DMA_Ch_3_Error_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_3_Terminal_Count_Handler) +DEFAULT_IRQ_HANDLER(DMA_Ch_3_Combined_Handler) +#endif +DEFAULT_IRQ_HANDLER(GPIO0_Combined_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_Combined_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_Combined_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_Combined_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_0_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_1_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_2_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_3_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_4_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_5_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_6_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_7_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_8_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_9_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_10_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_11_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_12_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_13_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_14_Handler) +DEFAULT_IRQ_HANDLER(GPIO0_15_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_0_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_1_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_2_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_3_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_4_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_5_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_6_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_7_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_8_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_9_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_10_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_11_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_12_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_13_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_14_Handler) +DEFAULT_IRQ_HANDLER(GPIO1_15_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_0_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_1_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_2_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_3_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_4_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_5_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_6_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_7_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_8_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_9_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_10_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_11_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_12_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_13_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_14_Handler) +DEFAULT_IRQ_HANDLER(GPIO2_15_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_0_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_1_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_2_Handler) +DEFAULT_IRQ_HANDLER(GPIO3_3_Handler) +DEFAULT_IRQ_HANDLER(UARTRX5_Handler) +DEFAULT_IRQ_HANDLER(UARTTX5_Handler) +DEFAULT_IRQ_HANDLER(UART5_Handler) + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +extern const VECTOR_TABLE_Type __VECTOR_TABLE[]; + const VECTOR_TABLE_Type __VECTOR_TABLE[] __VECTOR_TABLE_ATTRIBUTE = { + (VECTOR_TABLE_Type)(&__INITIAL_SP), /* Initial Stack Pointer */ + Reset_Handler, /* Reset Handler */ + NMI_Handler, /* -14: NMI Handler */ + HardFault_Handler, /* -13: Hard Fault Handler */ + MemManage_Handler, /* -12: MPU Fault Handler */ + BusFault_Handler, /* -11: Bus Fault Handler */ + UsageFault_Handler, /* -10: Usage Fault Handler */ + SecureFault_Handler, /* -9: Secure Fault Handler */ + 0, /* Reserved */ + 0, /* Reserved */ + 0, /* Reserved */ + SVC_Handler, /* -5: SVCall Handler */ + DebugMon_Handler, /* -4: Debug Monitor Handler */ + 0, /* Reserved */ + PendSV_Handler, /* -2: PendSV Handler */ + SysTick_Handler, /* -1: SysTick Handler */ + + 
NONSEC_WATCHDOG_RESET_REQ_Handler, /* 0: Non-Secure Watchdog Reset Request Handler */ + NONSEC_WATCHDOG_Handler, /* 1: Non-Secure Watchdog Handler */ + SLOWCLK_Timer_Handler, /* 2: SLOWCLK Timer Handler */ + TFM_TIMER0_IRQ_Handler, /* 3: TIMER 0 Handler */ + TIMER1_Handler, /* 4: TIMER 1 Handler */ + TIMER2_Handler, /* 5: TIMER 2 Handler */ + 0, /* 6: Reserved */ + 0, /* 7: Reserved */ + 0, /* 8: Reserved */ + MPC_Handler, /* 9: MPC Combined (Secure) Handler */ + PPC_Handler, /* 10: PPC Combined (Secure) Handler */ + MSC_Handler, /* 11: MSC Combined (Secure) Handler */ + BRIDGE_ERROR_Handler, /* 12: Bridge Error (Secure) Handler */ + 0, /* 13: Reserved */ + MGMT_PPU_Handler, /* 14: MGMT PPU Handler */ + SYS_PPU_Handler, /* 15: SYS PPU Handler */ + CPU0_PPU_Handler, /* 16: CPU0 PPU Handler */ + 0, /* 17: Reserved */ + 0, /* 18: Reserved */ + 0, /* 19: Reserved */ + 0, /* 20: Reserved */ + 0, /* 21: Reserved */ + 0, /* 22: Reserved */ + 0, /* 23: Reserved */ + 0, /* 24: Reserved */ + 0, /* 25: Reserved */ + DEBUG_PPU_Handler, /* 26: DEBUG PPU Handler */ + TIMER3_AON_Handler, /* 27: TIMER 3 AON Handler */ + CPU0_CTI_0_Handler, /* 28: CPU0 CTI IRQ 0 Handler */ + CPU0_CTI_1_Handler, /* 29: CPU0 CTI IRQ 1 Handler */ + 0, /* 30: Reserved */ + 0, /* 31: Reserved */ + + /* External interrupts */ + System_Timestamp_Counter_Handler, /* 32: System timestamp counter Handler */ + UARTRX0_Handler, /* 33: UART 0 RX Handler */ + UARTTX0_Handler, /* 34: UART 0 TX Handler */ + UARTRX1_Handler, /* 35: UART 1 RX Handler */ + UARTTX1_Handler, /* 36: UART 1 TX Handler */ + UARTRX2_Handler, /* 37: UART 2 RX Handler */ + UARTTX2_Handler, /* 38: UART 2 TX Handler */ + UARTRX3_Handler, /* 39: UART 3 RX Handler */ + UARTTX3_Handler, /* 40: UART 3 TX Handler */ + UARTRX4_Handler, /* 41: UART 4 RX Handler */ + UARTTX4_Handler, /* 42: UART 4 TX Handler */ + UART0_Combined_Handler, /* 43: UART 0 Combined Handler */ + UART1_Combined_Handler, /* 44: UART 1 Combined Handler */ + UART2_Combined_Handler, /* 45: UART 2 Combined Handler */ + UART3_Combined_Handler, /* 46: UART 3 Combined Handler */ + UART4_Combined_Handler, /* 47: UART 4 Combined Handler */ + UARTOVF_Handler, /* 48: UART 0, 1, 2, 3, 4 & 5 Overflow Handler */ + ETHERNET_Handler, /* 49: Ethernet Handler */ + I2S_Handler, /* 50: Audio I2S Handler */ + TOUCH_SCREEN_Handler, /* 51: Touch Screen Handler */ + USB_Handler, /* 52: USB Handler */ + SPI_ADC_Handler, /* 53: SPI ADC Handler */ + SPI_SHIELD0_Handler, /* 54: SPI (Shield 0) Handler */ + SPI_SHIELD1_Handler, /* 55: SPI (Shield 0) Handler */ + ETHOS_U55_Handler, /* 56: Ethos-U55 Handler */ +#ifdef CORSTONE300_AN547 + 0, /* 57: Reserved */ + 0, /* 58: Reserved */ + 0, /* 59: Reserved */ + DMA_Ch_1_Error_Handler, /* 60: DMA Ch1 Error Handler */ + DMA_Ch_1_Terminal_Count_Handler, /* 61: DMA Ch1 Terminal Count Handler */ + DMA_Ch_1_Combined_Handler, /* 62: DMA Ch1 Combined Handler */ + DMA_Ch_2_Error_Handler, /* 63: DMA Ch2 Error Handler */ + DMA_Ch_2_Terminal_Count_Handler, /* 64: DMA Ch2 Terminal Count Handler */ + DMA_Ch_2_Combined_Handler, /* 65: DMA Ch2 Combined Handler */ + DMA_Ch_3_Error_Handler, /* 66: DMA Ch3 Error Handler */ + DMA_Ch_3_Terminal_Count_Handler, /* 67: DMA Ch3 Terminal Count Handler */ + DMA_Ch_3_Combined_Handler, /* 68: DMA Ch3 Combined Handler */ +#else + 0, /* 57: Reserved */ + 0, /* 58: Reserved */ + 0, /* 59: Reserved */ + 0, /* 60: Reserved */ + 0, /* 61: Reserved */ + 0, /* 62: Reserved */ + 0, /* 63: Reserved */ + 0, /* 64: Reserved */ + 0, /* 65: Reserved */ + 0, /* 66: Reserved */ + 
0, /* 67: Reserved */ + 0, /* 68: Reserved */ +#endif + GPIO0_Combined_Handler, /* 69: GPIO 0 Combined Handler */ + GPIO1_Combined_Handler, /* 70: GPIO 1 Combined Handler */ + GPIO2_Combined_Handler, /* 71: GPIO 2 Combined Handler */ + GPIO3_Combined_Handler, /* 72: GPIO 3 Combined Handler */ + GPIO0_0_Handler, /* 73: GPIO0 Pin 0 Handler */ + GPIO0_1_Handler, /* 74: GPIO0 Pin 1 Handler */ + GPIO0_2_Handler, /* 75: GPIO0 Pin 2 Handler */ + GPIO0_3_Handler, /* 76: GPIO0 Pin 3 Handler */ + GPIO0_4_Handler, /* 77: GPIO0 Pin 4 Handler */ + GPIO0_5_Handler, /* 78: GPIO0 Pin 5 Handler */ + GPIO0_6_Handler, /* 79: GPIO0 Pin 6 Handler */ + GPIO0_7_Handler, /* 80: GPIO0 Pin 7 Handler */ + GPIO0_8_Handler, /* 81: GPIO0 Pin 8 Handler */ + GPIO0_9_Handler, /* 82: GPIO0 Pin 9 Handler */ + GPIO0_10_Handler, /* 83: GPIO0 Pin 10 Handler */ + GPIO0_11_Handler, /* 84: GPIO0 Pin 11 Handler */ + GPIO0_12_Handler, /* 85: GPIO0 Pin 12 Handler */ + GPIO0_13_Handler, /* 86: GPIO0 Pin 13 Handler */ + GPIO0_14_Handler, /* 87: GPIO0 Pin 14 Handler */ + GPIO0_15_Handler, /* 88: GPIO0 Pin 15 Handler */ + GPIO1_0_Handler, /* 89: GPIO1 Pin 0 Handler */ + GPIO1_1_Handler, /* 90: GPIO1 Pin 1 Handler */ + GPIO1_2_Handler, /* 91: GPIO1 Pin 2 Handler */ + GPIO1_3_Handler, /* 92: GPIO1 Pin 3 Handler */ + GPIO1_4_Handler, /* 93: GPIO1 Pin 4 Handler */ + GPIO1_5_Handler, /* 94: GPIO1 Pin 5 Handler */ + GPIO1_6_Handler, /* 95: GPIO1 Pin 6 Handler */ + GPIO1_7_Handler, /* 96: GPIO1 Pin 7 Handler */ + GPIO1_8_Handler, /* 97: GPIO1 Pin 8 Handler */ + GPIO1_9_Handler, /* 98: GPIO1 Pin 9 Handler */ + GPIO1_10_Handler, /* 99: GPIO1 Pin 10 Handler */ + GPIO1_11_Handler, /* 100: GPIO1 Pin 11 Handler */ + GPIO1_12_Handler, /* 101: GPIO1 Pin 12 Handler */ + GPIO1_13_Handler, /* 102: GPIO1 Pin 13 Handler */ + GPIO1_14_Handler, /* 103: GPIO1 Pin 14 Handler */ + GPIO1_15_Handler, /* 104: GPIO1 Pin 15 Handler */ + GPIO2_0_Handler, /* 105: GPIO2 Pin 0 Handler */ + GPIO2_1_Handler, /* 106: GPIO2 Pin 1 Handler */ + GPIO2_2_Handler, /* 107: GPIO2 Pin 2 Handler */ + GPIO2_3_Handler, /* 108: GPIO2 Pin 3 Handler */ + GPIO2_4_Handler, /* 109: GPIO2 Pin 4 Handler */ + GPIO2_5_Handler, /* 110: GPIO2 Pin 5 Handler */ + GPIO2_6_Handler, /* 111: GPIO2 Pin 6 Handler */ + GPIO2_7_Handler, /* 112: GPIO2 Pin 7 Handler */ + GPIO2_8_Handler, /* 113: GPIO2 Pin 8 Handler */ + GPIO2_9_Handler, /* 114: GPIO2 Pin 9 Handler */ + GPIO2_10_Handler, /* 115: GPIO2 Pin 10 Handler */ + GPIO2_11_Handler, /* 116: GPIO2 Pin 11 Handler */ + GPIO2_12_Handler, /* 117: GPIO2 Pin 12 Handler */ + GPIO2_13_Handler, /* 118: GPIO2 Pin 13 Handler */ + GPIO2_14_Handler, /* 119: GPIO2 Pin 14 Handler */ + GPIO2_15_Handler, /* 120: GPIO2 Pin 15 Handler */ + GPIO3_0_Handler, /* 121: GPIO3 Pin 0 Handler */ + GPIO3_1_Handler, /* 122: GPIO3 Pin 1 Handler */ + GPIO3_2_Handler, /* 123: GPIO3 Pin 2 Handler */ + GPIO3_3_Handler, /* 124: GPIO3 Pin 3 Handler */ + UARTRX5_Handler, /* 125: UART 5 RX Interrupt */ + UARTTX5_Handler, /* 126: UART 5 TX Interrupt */ + UART5_Handler, /* 127: UART 5 combined Interrupt */ + 0, /* 128: Reserved */ + 0, /* 129: Reserved */ + 0, /* 130: Reserved */ +}; + +#if defined ( __GNUC__ ) +#pragma GCC diagnostic pop +#endif + +/*---------------------------------------------------------------------------- + Reset Handler called on controller reset + *----------------------------------------------------------------------------*/ +void Reset_Handler(void) +{ + __set_PSP((uint32_t)(&__INITIAL_SP)); + + __set_MSPLIM((uint32_t)(&__STACK_LIMIT)); + 
__set_PSPLIM((uint32_t)(&__STACK_LIMIT)); + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + __TZ_set_STACKSEAL_S((uint32_t *)(&__STACK_SEAL)); +#endif + + SystemInit(); /* CMSIS System Initialization */ + __PROGRAM_START(); /* Enter PreMain (C library entry point) */ +} diff --git a/dsppp/RTE/Device/SSE-300-MPS3/system_SSE300MPS3.c b/dsppp/RTE/Device/SSE-300-MPS3/system_SSE300MPS3.c new file mode 100644 index 000000000..4e67d536d --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/system_SSE300MPS3.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2009-2022 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file is derivative of CMSIS V5.9.0 system_ARMCM55.c + * Git SHA: 2b7495b8535bdcb306dac29b9ded4cfb679d7e5c + */ + +#include "SSE300MPS3.h" + +/*---------------------------------------------------------------------------- + Define clocks + *----------------------------------------------------------------------------*/ + #define XTAL (32000000UL) + #define SYSTEM_CLOCK (XTAL) + #define PERIPHERAL_CLOCK (25000000UL) + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ +extern const VECTOR_TABLE_Type __VECTOR_TABLE[496]; + +/*---------------------------------------------------------------------------- + System Core Clock Variable + *----------------------------------------------------------------------------*/ +uint32_t SystemCoreClock = SYSTEM_CLOCK; +uint32_t PeripheralClock = PERIPHERAL_CLOCK; + +/*---------------------------------------------------------------------------- + System Core Clock update function + *----------------------------------------------------------------------------*/ +void SystemCoreClockUpdate (void) +{ + SystemCoreClock = SYSTEM_CLOCK; + PeripheralClock = PERIPHERAL_CLOCK; +} + +/*---------------------------------------------------------------------------- + System initialization function + *----------------------------------------------------------------------------*/ +void SystemInit (void) +{ +#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1U) + SCB->VTOR = (uint32_t)(&__VECTOR_TABLE[0]); +#endif + +#if (defined (__FPU_USED) && (__FPU_USED == 1U)) || \ + (defined (__ARM_FEATURE_MVE) && (__ARM_FEATURE_MVE > 0U)) + SCB->CPACR |= ((3U << 10U*2U) | /* enable CP10 Full Access */ + (3U << 11U*2U) ); /* enable CP11 Full Access */ + + /* Set low-power state for PDEPU */ + /* 0b00 | ON, PDEPU is not in low-power state */ + /* 0b01 | ON, but the clock is off */ + /* 0b10 | RET(ention) */ + /* 0b11 | OFF */ + + /* Clear ELPSTATE, value is 0b11 on Cold reset */ + PWRMODCTL->CPDLPSTATE &= ~(PWRMODCTL_CPDLPSTATE_ELPSTATE_Msk); + + /* Favor best FP/MVE performance by default, avoid EPU switch-ON delays */ + /* PDEPU ON, Clock OFF */ + PWRMODCTL->CPDLPSTATE |= 0x1 << PWRMODCTL_CPDLPSTATE_ELPSTATE_Pos; +#endif + +#ifdef 
UNALIGNED_SUPPORT_DISABLE + SCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk; +#endif + + /* Enable Loop and branch info cache */ + SCB->CCR |= SCB_CCR_LOB_Msk; + __DSB(); + __ISB(); + + + SystemCoreClock = SYSTEM_CLOCK; + PeripheralClock = PERIPHERAL_CLOCK; +} diff --git a/dsppp/RTE/Device/SSE-300-MPS3/system_SSE300MPS3.c.base@1.1.1 b/dsppp/RTE/Device/SSE-300-MPS3/system_SSE300MPS3.c.base@1.1.1 new file mode 100644 index 000000000..4e67d536d --- /dev/null +++ b/dsppp/RTE/Device/SSE-300-MPS3/system_SSE300MPS3.c.base@1.1.1 @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2009-2022 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file is derivative of CMSIS V5.9.0 system_ARMCM55.c + * Git SHA: 2b7495b8535bdcb306dac29b9ded4cfb679d7e5c + */ + +#include "SSE300MPS3.h" + +/*---------------------------------------------------------------------------- + Define clocks + *----------------------------------------------------------------------------*/ + #define XTAL (32000000UL) + #define SYSTEM_CLOCK (XTAL) + #define PERIPHERAL_CLOCK (25000000UL) + +/*---------------------------------------------------------------------------- + Exception / Interrupt Vector table + *----------------------------------------------------------------------------*/ +extern const VECTOR_TABLE_Type __VECTOR_TABLE[496]; + +/*---------------------------------------------------------------------------- + System Core Clock Variable + *----------------------------------------------------------------------------*/ +uint32_t SystemCoreClock = SYSTEM_CLOCK; +uint32_t PeripheralClock = PERIPHERAL_CLOCK; + +/*---------------------------------------------------------------------------- + System Core Clock update function + *----------------------------------------------------------------------------*/ +void SystemCoreClockUpdate (void) +{ + SystemCoreClock = SYSTEM_CLOCK; + PeripheralClock = PERIPHERAL_CLOCK; +} + +/*---------------------------------------------------------------------------- + System initialization function + *----------------------------------------------------------------------------*/ +void SystemInit (void) +{ +#if defined (__VTOR_PRESENT) && (__VTOR_PRESENT == 1U) + SCB->VTOR = (uint32_t)(&__VECTOR_TABLE[0]); +#endif + +#if (defined (__FPU_USED) && (__FPU_USED == 1U)) || \ + (defined (__ARM_FEATURE_MVE) && (__ARM_FEATURE_MVE > 0U)) + SCB->CPACR |= ((3U << 10U*2U) | /* enable CP10 Full Access */ + (3U << 11U*2U) ); /* enable CP11 Full Access */ + + /* Set low-power state for PDEPU */ + /* 0b00 | ON, PDEPU is not in low-power state */ + /* 0b01 | ON, but the clock is off */ + /* 0b10 | RET(ention) */ + /* 0b11 | OFF */ + + /* Clear ELPSTATE, value is 0b11 on Cold reset */ + PWRMODCTL->CPDLPSTATE &= ~(PWRMODCTL_CPDLPSTATE_ELPSTATE_Msk); + + /* Favor best FP/MVE performance by default, avoid EPU switch-ON delays */ + /* PDEPU ON, Clock OFF */ + PWRMODCTL->CPDLPSTATE |= 0x1 << PWRMODCTL_CPDLPSTATE_ELPSTATE_Pos; +#endif + +#ifdef 
UNALIGNED_SUPPORT_DISABLE + SCB->CCR |= SCB_CCR_UNALIGN_TRP_Msk; +#endif + + /* Enable Loop and branch info cache */ + SCB->CCR |= SCB_CCR_LOB_Msk; + __DSB(); + __ISB(); + + + SystemCoreClock = SYSTEM_CLOCK; + PeripheralClock = PERIPHERAL_CLOCK; +} diff --git a/dsppp/RTE/Device/SSE_300_MPS3/ac6_linker_script.sct b/dsppp/RTE/Device/SSE_300_MPS3/ac6_linker_script.sct new file mode 100644 index 000000000..4d6e579d0 --- /dev/null +++ b/dsppp/RTE/Device/SSE_300_MPS3/ac6_linker_script.sct @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/*---------------------------------------------------------------------------- + Scatter File Definitions definition + *----------------------------------------------------------------------------*/ + +LR_ROM0 __ROM0_BASE __ROM0_SIZE { + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + ER_CMSE_VENEER __ROM0_BASE+__ROM0_SIZE -__ROM0_SIZE { + *(Veneer$$CMSE) + } + #define ER_CMSE_VENEER_SIZE AlignExpr(ImageLength(ER_CMSE_VENEER), 8) +#else + #define ER_CMSE_VENEER_SIZE 0 +#endif + + ER_ROM0 __ROM0_BASE (__ROM0_SIZE - ER_CMSE_VENEER_SIZE) { + *.o (RESET, +First) + *(InRoot$$Sections) + *(+RO +XO) + } + + RW_NOINIT __RAM0_BASE UNINIT (__RAM0_SIZE - __HEAP_SIZE - __STACK_SIZE) { + *(.bss.noinit) + } + + RW_RAM0 AlignExpr(+0, 8) (__RAM0_SIZE - __HEAP_SIZE - __STACK_SIZE - AlignExpr(ImageLength(RW_NOINIT), 8)) { + *(+RW +ZI) + } + +#if __HEAP_SIZE > 0 + ARM_LIB_HEAP (AlignExpr(+0, 8)) EMPTY __HEAP_SIZE { ; Reserve empty region for heap + } +#endif + + ARM_LIB_STACK (__RAM0_BASE + __RAM0_SIZE - __STACKSEAL_SIZE) EMPTY -__STACK_SIZE { ; Reserve empty region for stack + } + +#if __STACKSEAL_SIZE > 0 + STACKSEAL +0 EMPTY 8 { ; Reserve empty region for stack seal immediately after stack + } +#endif + +#if __RAM1_SIZE > 0 + RW_RAM1 __RAM1_BASE __RAM1_SIZE { + .ANY (+RW +ZI) + } +#endif + +#if __RAM2_SIZE > 0 + RW_RAM2 __RAM2_BASE __RAM2_SIZE { + .ANY (+RW +ZI) + } +#endif + +#if __RAM3_SIZE > 0 + RW_RAM3 __RAM3_BASE __RAM3_SIZE { + .ANY (+RW +ZI) + } +#endif +} + +#if __ROM1_SIZE > 0 +LR_ROM1 __ROM1_BASE __ROM1_SIZE { + ER_ROM1 +0 __ROM1_SIZE { + .ANY (+RO +XO) + } +} +#endif + +#if __ROM2_SIZE > 0 +LR_ROM2 __ROM2_BASE __ROM2_SIZE { + ER_ROM2 +0 __ROM2_SIZE { + .ANY (+RO +XO) + } +} +#endif + +#if __ROM3_SIZE > 0 +LR_ROM3 __ROM3_BASE __ROM3_SIZE { + ER_ROM3 +0 __ROM3_SIZE { + .ANY (+RO +XO) + } +} +#endif diff --git a/dsppp/RTE/Device/SSE_300_MPS3/clang_linker_script.ld b/dsppp/RTE/Device/SSE_300_MPS3/clang_linker_script.ld new file mode 100644 index 000000000..40f955c16 --- /dev/null +++ 
b/dsppp/RTE/Device/SSE_300_MPS3/clang_linker_script.ld @@ -0,0 +1,353 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright © 2019 Keith Packard + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx!w) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx!w) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx!w) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx!w) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (w!rx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (w!rx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (w!rx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (w!rx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +ENTRY(Reset_Handler) + +PHDRS +{ + text PT_LOAD; + ram PT_LOAD; + ram_init PT_LOAD; + tls PT_TLS; +} + +SECTIONS +{ + .init : { + KEEP (*(.vectors)) + KEEP (*(.text.init.enter)) + KEEP (*(.data.init.enter)) + KEEP (*(SORT_BY_NAME(.init) SORT_BY_NAME(.init.*))) + } >ROM0 AT>ROM0 :text + + .text : { + + /* code */ + *(.text.unlikely .text.unlikely.*) + *(.text.startup .text.startup.*) + *(.text .text.* .opd .opd.*) + *(.gnu.linkonce.t.*) + KEEP (*(.fini .fini.*)) + __text_end = .; + + PROVIDE (__etext = __text_end); + PROVIDE (_etext = __text_end); + PROVIDE (etext = __text_end); + + /* read-only data */ + *(.rdata) + *(.rodata .rodata.*) + 
*(.gnu.linkonce.r.*) + + *(.srodata.cst16) + *(.srodata.cst8) + *(.srodata.cst4) + *(.srodata.cst2) + *(.srodata .srodata.*) + *(.data.rel.ro .data.rel.ro.*) + *(.got .got.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + /* lists of constructors and destructors */ + PROVIDE_HIDDEN ( __preinit_array_start = . ); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN ( __preinit_array_end = . ); + + PROVIDE_HIDDEN ( __init_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array .ctors)) + PROVIDE_HIDDEN ( __init_array_end = . ); + + PROVIDE_HIDDEN ( __fini_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array .dtors)) + PROVIDE_HIDDEN ( __fini_array_end = . ); + + } >ROM0 AT>ROM0 :text + + .toc : { + *(.toc .toc.*) + } >ROM0 AT>ROM0 :text + + /* additional sections when compiling with C++ exception support */ + + .except_ordered : { + *(.gcc_except_table *.gcc_except_table.*) + KEEP (*(.eh_frame .eh_frame.*)) + *(.ARM.extab* .gnu.linkonce.armextab.*) + } >ROM0 AT>ROM0 :text + + .except_unordered : { + . = ALIGN(8); + + PROVIDE(__exidx_start = .); + *(.ARM.exidx*) + PROVIDE(__exidx_end = .); + } >ROM0 AT>ROM0 :text + + + /* + * Data values which are preserved across reset + */ + .preserve (NOLOAD) : { + PROVIDE(__preserve_start__ = .); + KEEP(*(SORT_BY_NAME(.preserve.*))) + KEEP(*(.preserve)) + PROVIDE(__preserve_end__ = .); + } >RAM0 AT>RAM0 :ram + + .data : { + *(.data .data.*) + *(.gnu.linkonce.d.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + PROVIDE( __global_pointer$ = . + 0x800 ); + *(.sdata .sdata.* .sdata2.*) + *(.gnu.linkonce.s.*) + } >RAM0 AT>ROM0 :ram_init + PROVIDE(__data_start = ADDR(.data)); + PROVIDE(__data_source = LOADADDR(.data)); + + /* Thread local initialized data. This gets + * space allocated as it is expected to be placed + * in ram to be used as a template for TLS data blocks + * allocated at runtime. We're slightly abusing that + * by placing the data in flash where it will be copied + * into the allocate ram addresses by the existing + * data initialization code in crt0 + */ + .tdata : { + *(.tdata .tdata.* .gnu.linkonce.td.*) + PROVIDE(__data_end = .); + PROVIDE(__tdata_end = .); + } >RAM0 AT>ROM0 :tls :ram_init + PROVIDE( __tls_base = ADDR(.tdata)); + PROVIDE( __tdata_start = ADDR(.tdata)); + PROVIDE( __tdata_source = LOADADDR(.tdata) ); + PROVIDE( __tdata_source_end = LOADADDR(.tdata) + SIZEOF(.tdata) ); + PROVIDE( __data_source_end = __tdata_source_end ); + PROVIDE( __tdata_size = SIZEOF(.tdata) ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata),ALIGNOF(.tbss)) ); + + PROVIDE( __edata = __data_end ); + PROVIDE( _edata = __data_end ); + PROVIDE( edata = __data_end ); + PROVIDE( __data_size = __data_end - __data_start ); + PROVIDE( __data_source_size = __data_source_end - __data_source ); + + .tbss (NOLOAD) : { + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + PROVIDE( __tls_end = . ); + PROVIDE( __tbss_end = . 
); + } >RAM0 AT>RAM0 :tls :ram + PROVIDE( __bss_start = ADDR(.tbss)); + PROVIDE( __tbss_start = ADDR(.tbss)); + PROVIDE( __tbss_offset = ADDR(.tbss) - ADDR(.tdata) ); + PROVIDE( __tbss_size = SIZEOF(.tbss) ); + PROVIDE( __tls_size = __tls_end - __tls_base ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + PROVIDE( __arm64_tls_tcb_offset = MAX(16, __tls_align) ); + + /* + * The linker special cases .tbss segments which are + * identified as segments which are not loaded and are + * thread_local. + * + * For these segments, the linker does not advance 'dot' + * across them. We actually need memory allocated for tbss, + * so we create a special segment here just to make room + */ + /* + .tbss_space (NOLOAD) : { + . = ADDR(.tbss); + . = . + SIZEOF(.tbss); + } >RAM0 AT>RAM0 :ram + */ + + .bss (NOLOAD) : { + *(.sbss*) + *(.gnu.linkonce.sb.*) + *(.bss .bss.*) + *(.gnu.linkonce.b.*) + *(COMMON) + + /* Align the heap */ + . = ALIGN(8); + __bss_end = .; + } >RAM0 AT>RAM0 :ram + PROVIDE( __non_tls_bss_start = ADDR(.bss) ); + PROVIDE( __end = __bss_end ); + PROVIDE( _end = __bss_end ); + PROVIDE( end = __bss_end ); + PROVIDE( __bss_size = __bss_end - __bss_start ); + + /* Make the rest of memory available for heap storage */ + PROVIDE (__heap_start = __end); +#ifdef __HEAP_SIZE + PROVIDE (__heap_end = __heap_start + __HEAP_SIZE); + PROVIDE (__heap_size = __HEAP_SIZE); +#else + PROVIDE (__heap_end = __stack - __STACK_SIZE); + PROVIDE (__heap_size = __heap_end - __heap_start); +#endif + .heap (NOLOAD) : { + . += __heap_size; + } >RAM0 :ram + + /* Define a stack region to make sure it fits in memory */ + PROVIDE(__stack = ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE); + PROVIDE(__stack_limit = __stack - __STACK_SIZE); + .stack (__stack_limit) (NOLOAD) : { + . += __STACK_SIZE; + } >RAM0 :ram + +#if __STACKSEAL_SIZE > 0 + PROVIDE(__stack_seal = __stack) + .stackseal (__stack) (NOLOAD) : + { + . += __STACKSEAL_SIZE; + } >RAM0 :ram +#endif + + /* Throw away C++ exception handling information */ + + /* + + /DISCARD/ : { + *(.note .note.*) + *(.eh_frame .eh_frame.*) + *(.ARM.extab* .gnu.linkonce.armextab.*) + *(.ARM.exidx*) + } + + */ + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1. */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions. */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2. */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2. */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions. 
*/ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3. */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF 5. */ + .debug_addr 0 : { *(.debug_addr) } + .debug_line_str 0 : { *(.debug_line_str) } + .debug_loclists 0 : { *(.debug_loclists) } + .debug_macro 0 : { *(.debug_macro) } + .debug_names 0 : { *(.debug_names) } + .debug_rnglists 0 : { *(.debug_rnglists) } + .debug_str_offsets 0 : { *(.debug_str_offsets) } + .debug_sup 0 : { *(.debug_sup) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } +} +/* + * Check that sections that are copied from flash to RAM have matching + * padding, so that a single memcpy() of __data_size copies the correct bytes. + */ +ASSERT( __data_size == __data_source_size, + "ERROR: .data/.tdata flash size does not match RAM size"); diff --git a/dsppp/RTE/Device/SSE_300_MPS3/gcc_linker_script.ld b/dsppp/RTE/Device/SSE_300_MPS3/gcc_linker_script.ld new file mode 100644 index 000000000..a018e5d4e --- /dev/null +++ b/dsppp/RTE/Device/SSE_300_MPS3/gcc_linker_script.ld @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (rwx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (rwx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (rwx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (rwx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. 
+ * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > ROM0 + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + .gnu.sgstubs : + { + . = ALIGN(32); + } > ROM0 +#endif + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > ROM0 + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > ROM0 + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > ROM0 + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > ROM0 + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. + */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM0 AT > ROM0 + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. + */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM1 AT > ROM0 +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM0 AT > RAM0 + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. 
+ */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM1 AT > RAM1 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM0 + + .stack (ORIGIN(RAM0) + LENGTH(RAM0) - __STACK_SIZE - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM0 + PROVIDE(__stack = __StackTop); + +#if __STACKSEAL_SIZE > 0 + .stackseal (ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackSeal = .; + . = . + 8; + . = ALIGN(8); + } > RAM0 +#endif + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/RTE/Device/SSE_300_MPS3/regions_V2M_MPS3_SSE_300_FVP.h b/dsppp/RTE/Device/SSE_300_MPS3/regions_V2M_MPS3_SSE_300_FVP.h new file mode 100644 index 000000000..cf6b28cc9 --- /dev/null +++ b/dsppp/RTE/Device/SSE_300_MPS3/regions_V2M_MPS3_SSE_300_FVP.h @@ -0,0 +1,400 @@ +#ifndef REGIONS_V2M_MPS3_SSE_300_FVP_H +#define REGIONS_V2M_MPS3_SSE_300_FVP_H + + +//-------- <<< Use Configuration Wizard in Context Menu >>> -------------------- + +// Device pack: ARM::V2M_MPS3_SSE_300_BSP@1.4.0 +// Device pack used to generate this file + +// ROM Configuration +// ======================= +// + +// RAM Configuration +// ======================= +// IROM1=<__RAM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x10000000 +#define __RAM0_BASE 0x10000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00200000 +#define __RAM0_SIZE 0x00200000 +// Default region +// Enables memory region globally for the application. +#define __RAM0_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM0_NOINIT 0 +// + +// IROM2=<__RAM1> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x00000000 +#define __RAM1_BASE 0x00000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00200000 +#define __RAM1_SIZE 0x00200000 +// Default region +// Enables memory region globally for the application. +#define __RAM1_DEFAULT 0 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM1_NOINIT 0 +// + +// IRAM1=<__RAM2> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x30000000 +#define __RAM2_BASE 0x30000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM2_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM2_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM2_NOINIT 0 +// + +// IRAM2=<__RAM3> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x20000000 +#define __RAM3_BASE 0x20000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM3_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM3_DEFAULT 0 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM3_NOINIT 0 +// + +// ITCM_NS=<__RAM4> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. 
+// Default: 0x00000000 +#define __RAM4_BASE 0x00000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00080000 +#define __RAM4_SIZE 0x00080000 +// Default region +// Enables memory region globally for the application. +#define __RAM4_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM4_NOINIT 0 +// + +// SRAM_NS=<__RAM5> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x01000000 +#define __RAM5_BASE 0x01000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00100000 +#define __RAM5_SIZE 0x00100000 +// Default region +// Enables memory region globally for the application. +#define __RAM5_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM5_NOINIT 0 +// + +// DTCM0_NS=<__RAM6> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x20000000 +#define __RAM6_BASE 0x20000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM6_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM6_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM6_NOINIT 0 +// + +// DTCM1_NS=<__RAM7> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x20020000 +#define __RAM7_BASE 0x20020000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM7_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM7_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM7_NOINIT 0 +// + +// DTCM2_NS=<__RAM8> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x20040000 +#define __RAM8_BASE 0x20040000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM8_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM8_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM8_NOINIT 0 +// + +// DTCM3_NS=<__RAM9> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x20060000 +#define __RAM9_BASE 0x20060000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM9_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM9_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM9_NOINIT 0 +// + +// ISRAM0_NS=<__RAM10> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x21000000 +#define __RAM10_BASE 0x21000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00100000 +#define __RAM10_SIZE 0x00100000 +// Default region +// Enables memory region globally for the application. +#define __RAM10_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM10_NOINIT 0 +// + +// ISRAM1_NS=<__RAM11> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. 
+// Default: 0x21100000 +#define __RAM11_BASE 0x21100000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00100000 +#define __RAM11_SIZE 0x00100000 +// Default region +// Enables memory region globally for the application. +#define __RAM11_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM11_NOINIT 0 +// + +// QSPI_SRAM_NS=<__RAM12> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x28000000 +#define __RAM12_BASE 0x28000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00800000 +#define __RAM12_SIZE 0x00800000 +// Default region +// Enables memory region globally for the application. +#define __RAM12_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM12_NOINIT 0 +// + +// ITCM_S=<__RAM13> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x10000000 +#define __RAM13_BASE 0x10000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00080000 +#define __RAM13_SIZE 0x00080000 +// Default region +// Enables memory region globally for the application. +#define __RAM13_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM13_NOINIT 0 +// + +// SRAM_S=<__RAM14> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x11000000 +#define __RAM14_BASE 0x11000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00100000 +#define __RAM14_SIZE 0x00100000 +// Default region +// Enables memory region globally for the application. +#define __RAM14_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM14_NOINIT 0 +// + +// DTCM0_S=<__RAM15> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x30000000 +#define __RAM15_BASE 0x30000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM15_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM15_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM15_NOINIT 0 +// + +// DTCM1_S=<__RAM16> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x30020000 +#define __RAM16_BASE 0x30020000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM16_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM16_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM16_NOINIT 0 +// + +// DTCM2_S=<__RAM17> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x30040000 +#define __RAM17_BASE 0x30040000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM17_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM17_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM17_NOINIT 0 +// + +// DTCM3_S=<__RAM18> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. 
+// Default: 0x30060000 +#define __RAM18_BASE 0x30060000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM18_SIZE 0x00020000 +// Default region +// Enables memory region globally for the application. +#define __RAM18_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM18_NOINIT 0 +// + +// ISRAM0_S=<__RAM19> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x31000000 +#define __RAM19_BASE 0x31000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00100000 +#define __RAM19_SIZE 0x00100000 +// Default region +// Enables memory region globally for the application. +#define __RAM19_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM19_NOINIT 0 +// + +// ISRAM1_S=<__RAM20> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x31100000 +#define __RAM20_BASE 0x31100000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00100000 +#define __RAM20_SIZE 0x00100000 +// Default region +// Enables memory region globally for the application. +#define __RAM20_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM20_NOINIT 0 +// + +// QSPI_SRAM_S=<__RAM21> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x38000000 +#define __RAM21_BASE 0x38000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00800000 +#define __RAM21_SIZE 0x00800000 +// Default region +// Enables memory region globally for the application. +#define __RAM21_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM21_NOINIT 0 +// + +// + +// Stack / Heap Configuration +// Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +// Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +#define __STACK_SIZE 0x00000200 +#define __HEAP_SIZE 0x00038000 +// + + +#endif /* REGIONS_V2M_MPS3_SSE_300_FVP_H */ diff --git a/dsppp/RTE/_Release_IPSS_M0P/RTE_Components.h b/dsppp/RTE/_Release_IPSS_M0P/RTE_Components.h new file mode 100644 index 000000000..332d6e45f --- /dev/null +++ b/dsppp/RTE/_Release_IPSS_M0P/RTE_Components.h @@ -0,0 +1,20 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! + * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+IPSS_M0P' + * Target: 'Release+IPSS_M0P' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "ARMCM0plus.h" + + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/RTE/_Release_IPSS_M4/RTE_Components.h b/dsppp/RTE/_Release_IPSS_M4/RTE_Components.h new file mode 100644 index 000000000..747232d25 --- /dev/null +++ b/dsppp/RTE/_Release_IPSS_M4/RTE_Components.h @@ -0,0 +1,20 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! 
+ * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+IPSS_M4' + * Target: 'Release+IPSS_M4' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "ARMCM4.h" + + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/RTE/_Release_LLVM-Corstone-300/RTE_Components.h b/dsppp/RTE/_Release_LLVM-Corstone-300/RTE_Components.h new file mode 100644 index 000000000..cd99d204e --- /dev/null +++ b/dsppp/RTE/_Release_LLVM-Corstone-300/RTE_Components.h @@ -0,0 +1,23 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! + * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+LLVM-Corstone-300' + * Target: 'Release+LLVM-Corstone-300' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "SSE300MPS3.h" + +/* ARM::CMSIS-Compiler:STDOUT:Custom@1.0.0 */ +#define RTE_CMSIS_Compiler_STDOUT /* CMSIS-Compiler STDOUT */ + #define RTE_CMSIS_Compiler_STDOUT_Custom /* CMSIS-Compiler STDOUT: Custom */ + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/RTE/_Release_MPS3-Corstone-300/RTE_Components.h b/dsppp/RTE/_Release_MPS3-Corstone-300/RTE_Components.h new file mode 100644 index 000000000..d7da60e1c --- /dev/null +++ b/dsppp/RTE/_Release_MPS3-Corstone-300/RTE_Components.h @@ -0,0 +1,25 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! + * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+MPS3-Corstone-300' + * Target: 'Release+MPS3-Corstone-300' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "SSE300MPS3.h" + +/* ARM::CMSIS Driver:USART@1.0.0 */ +#define RTE_Drivers_USART +/* ARM::CMSIS-Compiler:STDOUT:Custom@1.0.0 */ +#define RTE_CMSIS_Compiler_STDOUT /* CMSIS-Compiler STDOUT */ + #define RTE_CMSIS_Compiler_STDOUT_Custom /* CMSIS-Compiler STDOUT: Custom */ + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/RTE/_Release_VHT-Corstone-300/RTE_Components.h b/dsppp/RTE/_Release_VHT-Corstone-300/RTE_Components.h new file mode 100644 index 000000000..c326941f9 --- /dev/null +++ b/dsppp/RTE/_Release_VHT-Corstone-300/RTE_Components.h @@ -0,0 +1,20 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! + * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+VHT-Corstone-300' + * Target: 'Release+VHT-Corstone-300' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "SSE300MPS3.h" + + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/RTE/_Release_VHT-M0P/RTE_Components.h b/dsppp/RTE/_Release_VHT-M0P/RTE_Components.h new file mode 100644 index 000000000..8a0db96bc --- /dev/null +++ b/dsppp/RTE/_Release_VHT-M0P/RTE_Components.h @@ -0,0 +1,20 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! + * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+VHT-M0P' + * Target: 'Release+VHT-M0P' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "ARMCM0plus.h" + + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/RTE/_Release_VHT-M4/RTE_Components.h b/dsppp/RTE/_Release_VHT-M4/RTE_Components.h new file mode 100644 index 000000000..4c34863c1 --- /dev/null +++ b/dsppp/RTE/_Release_VHT-M4/RTE_Components.h @@ -0,0 +1,20 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! 
+ * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+VHT-M4' + * Target: 'Release+VHT-M4' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "ARMCM4.h" + + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/RTE/_Release_VHT_M0P/RTE_Components.h b/dsppp/RTE/_Release_VHT_M0P/RTE_Components.h new file mode 100644 index 000000000..768bae446 --- /dev/null +++ b/dsppp/RTE/_Release_VHT_M0P/RTE_Components.h @@ -0,0 +1,20 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! + * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+VHT_M0P' + * Target: 'Release+VHT_M0P' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "ARMCM0plus.h" + + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/RTE/_Release_VHT_M4/RTE_Components.h b/dsppp/RTE/_Release_VHT_M4/RTE_Components.h new file mode 100644 index 000000000..44e1e938d --- /dev/null +++ b/dsppp/RTE/_Release_VHT_M4/RTE_Components.h @@ -0,0 +1,20 @@ +/* + * CSOLUTION generated file: DO NOT EDIT! + * Generated by: csolution version 2.2.1 + * + * Project: 'test.Release+VHT_M4' + * Target: 'Release+VHT_M4' + */ + +#ifndef RTE_COMPONENTS_H +#define RTE_COMPONENTS_H + + +/* + * Define the Device Header File: + */ +#define CMSIS_device_header "ARMCM4.h" + + + +#endif /* RTE_COMPONENTS_H */ diff --git a/dsppp/allocator.cpp b/dsppp/allocator.cpp new file mode 100644 index 000000000..aaf61cc50 --- /dev/null +++ b/dsppp/allocator.cpp @@ -0,0 +1,98 @@ +#include "allocator.h" + +#define ALLOC_POOL(BYTES,NB) \ +MemoryPool vecPool_##BYTES(NB); + +#if defined(POOL_ALLOCATOR) +#include "allocation/all.cpp" +#endif + +std::map current_stats; +std::map max_stats; +std::map current_dyn_stats; + +void print_map(std::string comment) +{ + + std::cout << comment << "\r\n"; +#if !defined(POOL_ALLOCATOR) + std::size_t total_static=0; + std::size_t total_dynamic=0; + + for (const auto v : max_stats) + { + // Only count allocations with size known at build time + if (v.first > 0) + { + std::cout << "ALLOC_POOL(" << v.first << "," << v.second << "); \r\n"; + total_static += v.first * v.second; + } + } + + for (const auto v : max_stats) + { + // Only count allocations with size known at build time + if (v.first > 0) + { + std::cout << "POOL(" << v.first << "); \r\n"; + } + } + + std::cout << "\r\n"; + + std::cout << "Total static bytes: " << total_static << std::hex << " (0x" << total_static << ")\r\n"; + + total_dynamic = 0; + std::cout << "\r\nDynamic allocations\r\n"; + for (const auto v : max_stats) + { + // Only count dynamic allocations (size not known at build time) + if (v.first < 0) + { + // The count is meaningless for dynamic allocation: + // destroy has no length argument (contrary to allocate) + // and so can only get the length from the static value.
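+ // Here v.first holds the negative of the requested size: stat_allocator::allocate(sz)
+ // records dynamic allocations under the key -sz, while fixed-size allocations use their
+ // positive compile-time length L, so -v.first recovers the size in bytes printed below.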
+ std::cout << std::dec << -v.first << " : " << v.second << "\r\n"; + total_dynamic += (-v.first) * v.second; + } + } + std::cout << "Total dynamic bytes: " << total_dynamic << std::hex << " (0x" << total_dynamic << ")\r\n"; + std::cout << "Total bytes: " << (total_static+total_dynamic) << std::hex << " (0x" << (total_static+total_dynamic) << ")\r\n"; + + +#endif +} + +void reset_current_stats() +{ +#if !defined(POOL_ALLOCATOR) + for (auto v : current_stats) + { + v.second = 0; + } +#endif +} + +void check_current_stats() +{ +#if !defined(POOL_ALLOCATOR) + for (const auto v : current_stats) + { + if (v.second > 0) + { + if (v.first>0) + { + std::cout << "Error memory pool " << v.first << " not empty = " << v.second << "\r\n"; + } + else + { + std::cout << "Error dynamic alloc " << -v.first << " not empty = " << v.second << "\r\n"; + } + } + } + + reset_current_stats(); +#endif +} + diff --git a/dsppp/allocator.h b/dsppp/allocator.h new file mode 100644 index 000000000..61e95006e --- /dev/null +++ b/dsppp/allocator.h @@ -0,0 +1,124 @@ +#pragma once + +#include +#include +#include +#include +#include "test_config.h" + + +// Allocator for temporaries +#if defined(POOL_ALLOCATOR) +#define TMP_ALLOC pool_allocator +#else +#define TMP_ALLOC stat_allocator +#endif + +#include + + +using namespace arm_cmsis_dsp; + + +constexpr int NBVEC_2 = 2; +constexpr int NBVEC_3 = 3; +constexpr int NBVEC_4 = 4; +constexpr int NBVEC_8 = 8; +constexpr int NBVEC_9 = 9; +constexpr int NBVEC_16 = 16; +constexpr int NBVEC_32 = 32; +constexpr int NBVEC_44 = 44; +constexpr int NBVEC_47 = 47; +constexpr int NBVEC_64 = 64; +constexpr int NBVEC_128 = 128; +constexpr int NBVEC_256 = 256; +constexpr int NBVEC_258 = 258; +constexpr int NBVEC_512 = 512; +constexpr int NBVEC_1024 = 1024; +constexpr int NBVEC_2048 = 2048; + + +template +struct pool_allocator; + +#define POOL(BYTES) \ +constexpr int POOL_BLOCK_##BYTES = BYTES; \ +extern MemoryPool vecPool_##BYTES;\ +template<> \ +struct pool_allocator { \ + static char* allocate () noexcept{ \ + return(vecPool_##BYTES.get_new_buffer()); \ + } \ + \ + static void destroy ( char* ptr ) noexcept { \ + vecPool_##BYTES.recycle_buffer(ptr); \ + } \ + \ +}; + + +#if defined(POOL_ALLOCATOR) +#include "allocation/all.h" +#endif + +template<> +struct pool_allocator { + /* Dynamic size allocations */ + static char* allocate ( std::size_t sz) noexcept{ + return(reinterpret_cast(std::malloc(sz))); + } + + static void destroy ( char* ptr ) noexcept { + std::free(ptr); + } + +}; + +extern std::map current_stats; +extern std::map max_stats; +extern std::map current_dyn_stats; + + +template +struct stat_allocator { + + /* Dynamic allocations */ + static char* allocate ( std::size_t sz) noexcept{ + current_stats[-sz]++; + if (current_stats[-sz]>max_stats[-sz]) + { + max_stats[-sz] = current_stats[-sz]; + } + void *ptr = std::malloc(sz); + current_dyn_stats[ptr]=sz; + return(reinterpret_cast(ptr)); + } + + /* Size known at build time */ + static char* allocate () noexcept{ + current_stats[L]++; + if (current_stats[L]>max_stats[L]) + { + max_stats[L] = current_stats[L]; + } + return(reinterpret_cast(std::malloc(L))); + } + + static void destroy ( char* ptr ) noexcept { + if (L<0) + { + std::size_t sz = current_dyn_stats[ptr]; + current_stats[-sz]--; + } + else + { + current_stats[L]--; + } + std::free(ptr); + } + +}; + +extern void print_map(std::string comment); +extern void check_current_stats(); +extern void reset_current_stats(); diff --git a/dsppp/cdefault.yml b/dsppp/cdefault.yml new file 
mode 100644 index 000000000..0ede69afd --- /dev/null +++ b/dsppp/cdefault.yml @@ -0,0 +1,142 @@ +default: + + compiler: AC6 + + misc: + - for-compiler: AC6 + C: + - -Wsign-compare + - -Wdouble-promotion + - -DNDEBUG + - -Wall + - -Wextra + - -Werror + - -std=c11 + - -Ofast + - -ffast-math + - -Wno-packed + - -Wno-missing-variable-declarations + - -Wno-missing-prototypes + - -Wno-missing-noreturn + - -Wno-sign-conversion + - -Wno-nonportable-include-path + - -Wno-reserved-id-macro + - -Wno-unused-macros + - -Wno-documentation-unknown-command + - -Wno-documentation + - -Wno-license-management + - -Wno-parentheses-equality + - -Wno-reserved-identifier + - -ffunction-sections + - -Wno-nan-infinity-disabled + - -DARM_MATH_LOOPUNROLL + CPP: + - -fno-rtti + - -fno-exceptions + - -DNDEBUG + - -Wall + - -Wextra + - -std=c++17 + - -Ofast + - -ffast-math + - -Wno-unused-function + - -ffunction-sections + - -mllvm -disable-vector-combine + ASM: + - -masm=auto + Link: + - --entry=Reset_Handler + - --info=summarysizes + - --info=sizes + - --info=totals + - --info=unused + - --info=veneers + + - for-compiler: GCC + C: + - -Wsign-compare + - -Wdouble-promotion + - -DNDEBUG + - -Wall + - -Wextra + - -Werror + - -std=c11 + - -Ofast + - -ffast-math + - -Wno-packed + - -Wno-missing-prototypes + - -Wno-missing-noreturn + - -Wno-sign-conversion + - -Wno-unused-macros + - -ffunction-sections + - -DARM_MATH_LOOPUNROLL + - -flax-vector-conversions + - -Wno-maybe-uninitialized + - -fdata-sections + - -fno-unroll-loops + CPP: + - -fno-rtti + - -fno-exceptions + - -DNDEBUG + - -Wall + - -Wextra + - -std=c++17 + - -Ofast + - -ffast-math + - -Wno-unused-function + - -ffunction-sections + - -fdata-sections + - -Wno-psabi + - -fno-unroll-loops + ASM: + - -masm=auto + Link: + - --specs=nano.specs + - -Wl,-Map=$elf()$.map + - -lm + - -Wl,--wrap=SysTick_Handler + - -Wl,--gc-sections + Library: + - -lm + + - for-compiler: CLANG + C: + - -Wsign-compare + - -Wdouble-promotion + - -DNDEBUG + - -Wall + - -Wextra + - -Werror + - -std=c11 + - -Ofast + - -ffast-math + - -Wno-packed + - -Wno-missing-variable-declarations + - -Wno-missing-prototypes + - -Wno-missing-noreturn + - -Wno-sign-conversion + - -Wno-nonportable-include-path + - -Wno-reserved-id-macro + - -Wno-unused-macros + - -Wno-documentation-unknown-command + - -Wno-documentation + - -Wno-parentheses-equality + - -Wno-reserved-identifier + - -ffunction-sections + - -DARM_MATH_LOOPUNROLL + CPP: + - -fno-rtti + - -fno-exceptions + - -DNDEBUG + - -Wall + - -Wextra + - -std=c++17 + - -Ofast + - -ffast-math + - -Wno-unused-function + - -ffunction-sections + ASM: + - -masm=auto + Link: + - -Wl,-Map=$elf()$.map + - -Wl,--gc-sections diff --git a/dsppp/clang_sse300.c b/dsppp/clang_sse300.c new file mode 100644 index 000000000..c6470905d --- /dev/null +++ b/dsppp/clang_sse300.c @@ -0,0 +1,65 @@ +#include "RTE_Components.h" +#include + +#include "Driver_USART.h" +#include "stdout_USART.h" + + + + +static int stdin_getc(FILE *file) { + (void)file; + return(0); +} + + +// iostream has references to stdin and stderr and there is a link +// error if not defined. 
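+// FDEV_SETUP_STREAM (picolibc-style stdio) builds a static FILE from put/get/flush callbacks
+// plus a mode flag; the stubs wired in here only exist to satisfy the stdin/stderr link-time
+// references mentioned above, so they ignore their arguments and return 0.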
+static FILE __stdin = FDEV_SETUP_STREAM(NULL, + stdin_getc, + NULL, + _FDEV_SETUP_READ); +FILE *const stdin = &__stdin; + +static int stderr_putc(char c, FILE *file) { + (void)file; + return(0); +} + +static FILE __stderr = FDEV_SETUP_STREAM(stderr_putc, + NULL, + NULL, + _FDEV_SETUP_WRITE); +FILE *const stderr = &__stderr; + +//-------- <<< Use Configuration Wizard in Context Menu >>> -------------------- + +// STDOUT USART Interface + +// Connect to hardware via Driver_USART# <0-255> +// Select driver control block for USART interface +#define USART_DRV_NUM 0 + +// Baudrate +#define USART_BAUDRATE 115200 + +// + + +#define _USART_Driver_(n) Driver_USART##n +#define USART_Driver_(n) _USART_Driver_(n) + +extern ARM_DRIVER_USART USART_Driver_(USART_DRV_NUM); +#define ptrUSART (&USART_Driver_(USART_DRV_NUM)) + +int stdout_putchar(const unsigned char ch) { + uint8_t buf[1]; + + buf[0] = ch; + if (ptrUSART->Send(buf, 1) != ARM_DRIVER_OK) { + return (-1); + } + while (ptrUSART->GetTxCount() != 1); + return (ch); +} + diff --git a/dsppp/example.cproject.yml b/dsppp/example.cproject.yml new file mode 100644 index 000000000..0e41fef75 --- /dev/null +++ b/dsppp/example.cproject.yml @@ -0,0 +1,120 @@ +project: + groups: + - group: Examples + files: + #- file: Examples/dot_product.cpp + #- file: Examples/vector_op.cpp + - file: Examples/matrix_op.cpp + - file: clang_sse300.c + for-context: + - +MPS3-Corstone-300 + for-compiler: + - CLANG + add-path: + - Include + - Examples + + components: + - component: ARM::CMSIS:CORE + - component: ARM::CMSIS:DSP@1.15.0 + - component: ARM::Device:Startup&C Startup + for-context: + - +VHT-Corstone-300 + - +VHT-M0P + - +VHT-M4 + - +MPS3-Corstone-300 + - component: ARM::Device:Definition + for-context: + - +VHT-Corstone-300 + - +MPS3-Corstone-300 + - component: CMSIS-Compiler:CORE + for-context: + - +MPS3-Corstone-300 + - component: CMSIS-Compiler:STDOUT:Custom@1.0.0 + for-context: + - +MPS3-Corstone-300 + - component: ARM::Device:USART STDOUT + for-context: + - +MPS3-Corstone-300 + - component: ARM::CMSIS Driver:USART + for-context: + - +MPS3-Corstone-300 + - component: ARM::Device:Native Driver:SysCounter + for-context: + - +VHT-Corstone-300 + - +MPS3-Corstone-300 + - component: ARM::Device:Native Driver:SysTimer + for-context: + - +VHT-Corstone-300 + - +MPS3-Corstone-300 + - component: ARM::Device:Native Driver:Timeout + for-context: + - +VHT-Corstone-300 + - +MPS3-Corstone-300 + - component: ARM::Device:Native Driver:UART + for-context: + - +MPS3-Corstone-300 + + linker: + - script: linker_scripts/gcc_sse300_mps3.ld + for-context: + - +MPS3-Corstone-300 + - +VHT-Corstone-300 + for-compiler: GCC + + - script: linker_scripts/clang_sse300_mps3.sct + for-context: + - +MPS3-Corstone-300 + - +VHT-Corstone-300 + for-compiler: CLANG + + - script: linker_scripts/ac6_sse300_mps3_s.sct + for-context: + - +MPS3-Corstone-300 + - +VHT-Corstone-300 + for-compiler: AC6 + + - regions: linker_scripts/SSE-300-MPS3/region_defs.h + for-context: + - +MPS3-Corstone-300 + - +VHT-Corstone-300 + + - script: linker_scripts/gcc_m0p_mps3.ld + for-context: + - +VHT-M0P + for-compiler: GCC + + - script: linker_scripts/clang_m0p_mps3.ld + for-context: + - +VHT-M0P + for-compiler: CLANG + + - script: linker_scripts/ac6_m0p_mps3_s.sct + for-context: + - +VHT-M0P + for-compiler: AC6 + + - regions: linker_scripts/ARMCM0P/region_defs.h + for-context: + - +VHT-M0P + + - script: linker_scripts/gcc_m4_mps3.ld + for-context: + - +VHT-M4 + for-compiler: GCC + + - script: linker_scripts/clang_m4_mps3.ld 
+ for-context: + - +VHT-M4 + for-compiler: CLANG + + - script: linker_scripts/ac6_m4_mps3_s.sct + for-context: + - +VHT-M4 + for-compiler: AC6 + + - regions: linker_scripts/ARMCM4/region_defs.h + for-context: + - +VHT-M4 + + diff --git a/dsppp/fvp_configs/VHT-Corstone-300.txt b/dsppp/fvp_configs/VHT-Corstone-300.txt new file mode 100644 index 000000000..e352bec1e --- /dev/null +++ b/dsppp/fvp_configs/VHT-Corstone-300.txt @@ -0,0 +1,9 @@ +core_clk.mul=100000000 +cpu0.semihosting-enable=1 +cpu0.semihosting-heap_base=0x0 +cpu0.semihosting-heap_limit=0x0 +cpu0.semihosting-stack_base=0x0 +cpu0.semihosting-stack_limit=0x0 +cpu0.FPU=1 +cpu0.MVE=2 +mps3_board.visualisation.disable-visualisation=1 diff --git a/dsppp/fvp_configs/VHT-M0P.txt b/dsppp/fvp_configs/VHT-M0P.txt new file mode 100644 index 000000000..4892c1e4e --- /dev/null +++ b/dsppp/fvp_configs/VHT-M0P.txt @@ -0,0 +1,3 @@ +fvp_mps2.mps2_visualisation.disable-visualisation=1 +armcortexm0plusct.semihosting-enable=1 +armcortexm0plusct.NUM_MPU_REGION=0x8 diff --git a/dsppp/fvp_configs/VHT-M4.txt b/dsppp/fvp_configs/VHT-M4.txt new file mode 100644 index 000000000..fda8c0249 --- /dev/null +++ b/dsppp/fvp_configs/VHT-M4.txt @@ -0,0 +1,3 @@ +fvp_mps2.mps2_visualisation.disable-visualisation=1 +armcortexm4ct.semihosting-enable=1 +armcortexm4ct.vfp-present=1 diff --git a/dsppp/getserial.py b/dsppp/getserial.py new file mode 100644 index 000000000..d3f7e6781 --- /dev/null +++ b/dsppp/getserial.py @@ -0,0 +1,28 @@ +import serial +import re +import io +from pyocd.core.target import Target + +lines = [] + +def read_stdout(target): + print("Waiting for serial") + lines = [] + + with serial.Serial('COM6', 115200, timeout=1,parity=serial.PARITY_NONE) as ser: + sio = io.TextIOWrapper(ser) + DONE = False + target.reset() + while not DONE: + line = sio.readline() + if len(line)==0: + raise Exception('Timeout error') + if re.match(r'Stats',line): + DONE=True + else: + #print(line) + lines.append(line) + + + return(lines) + \ No newline at end of file diff --git a/dsppp/linker_scripts/ARMCM0P/region_defs.h b/dsppp/linker_scripts/ARMCM0P/region_defs.h new file mode 100644 index 000000000..b66150bbb --- /dev/null +++ b/dsppp/linker_scripts/ARMCM0P/region_defs.h @@ -0,0 +1,60 @@ +#ifndef REGIONS_ARMCM0P_H +#define REGIONS_ARMCM0P_H + + +//-------- <<< Use Configuration Wizard in Context Menu >>> -------------------- + +// Device pack: ARM::Cortex_DFP@1.0.0 +// Device pack used to generate this file + +// ROM Configuration +// ======================= +// ROM=<__ROM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x00000000 +#define __ROM0_BASE 0x00000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00040000 +#define __ROM0_SIZE 0x00040000 +// Default region +// Enables memory region globally for the application. +#define __ROM0_DEFAULT 1 +// Startup +// Selects region to be used for startup code. +#define __ROM0_STARTUP 1 +// + +// + +// RAM Configuration +// ======================= +// RAM=<__RAM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x20000000 +#define __RAM0_BASE 0x20000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM0_SIZE 0x00040000 +// Default region +// Enables memory region globally for the application. +#define __RAM0_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. 
+#define __RAM0_NOINIT 0 +// + +// + +// Stack / Heap Configuration +// Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +// Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +#define __STACK_SIZE 0x00002000 +#define __HEAP_SIZE 0x00038000 +// + + +#endif /* REGIONS_ARMCM0P_H */ diff --git a/dsppp/linker_scripts/ARMCM4/region_defs.h b/dsppp/linker_scripts/ARMCM4/region_defs.h new file mode 100644 index 000000000..dc63f5bb6 --- /dev/null +++ b/dsppp/linker_scripts/ARMCM4/region_defs.h @@ -0,0 +1,60 @@ +#ifndef REGIONS_ARMCM4_H +#define REGIONS_ARMCM4_H + + +//-------- <<< Use Configuration Wizard in Context Menu >>> -------------------- + +// Device pack: ARM::Cortex_DFP@1.0.0 +// Device pack used to generate this file + +// ROM Configuration +// ======================= +// ROM=<__ROM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x00000000 +#define __ROM0_BASE 0x00000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00040000 +#define __ROM0_SIZE 0x00040000 +// Default region +// Enables memory region globally for the application. +#define __ROM0_DEFAULT 1 +// Startup +// Selects region to be used for startup code. +#define __ROM0_STARTUP 1 +// + +// + +// RAM Configuration +// ======================= +// RAM=<__RAM0> +// Base address <0x0-0xFFFFFFFF:8> +// Defines base address of memory region. +// Default: 0x20000000 +#define __RAM0_BASE 0x20000000 +// Region size [bytes] <0x0-0xFFFFFFFF:8> +// Defines size of memory region. +// Default: 0x00020000 +#define __RAM0_SIZE 0x00040000 +// Default region +// Enables memory region globally for the application. +#define __RAM0_DEFAULT 1 +// No zero initialize +// Excludes region from zero initialization. +#define __RAM0_NOINIT 0 +// + +// + +// Stack / Heap Configuration +// Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> +// Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> +#define __STACK_SIZE 0x00002000 +#define __HEAP_SIZE 0x00038000 +// + + +#endif /* REGIONS_ARMCM4_H */ diff --git a/dsppp/linker_scripts/SSE-300-MPS3/region_defs.h b/dsppp/linker_scripts/SSE-300-MPS3/region_defs.h new file mode 100644 index 000000000..32ac16b37 --- /dev/null +++ b/dsppp/linker_scripts/SSE-300-MPS3/region_defs.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016-2022 Arm Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __REGION_DEFS_H__ +#define __REGION_DEFS_H__ + +#include "region_limits.h" + +/* ************************************************************** + * WARNING: this file is parsed both by the C/C++ compiler + * and the linker. As a result the syntax must be valid not only + * for C/C++ but for the linker scripts too. + * Beware of the following limitations: + * - LD (GCC linker) requires white space around operators. 
+ * - UL postfix for macros is not suported by the linker script + ****************************************************************/ + +/* Secure regions */ +#define S_CODE_START ( S_ROM_ALIAS ) +#define S_CODE_SIZE ( TOTAL_S_ROM_SIZE ) +#define S_CODE_LIMIT ( S_CODE_START + S_CODE_SIZE ) + +#define S_DATA_START ( S_RAM_ALIAS ) +#define S_DATA_SIZE ( TOTAL_S_RAM_SIZE ) +#define S_DATA_LIMIT ( S_DATA_START + S_DATA_SIZE ) + +#define S_DDR4_START ( S_DDR4_ALIAS ) +#define S_DDR4_SIZE ( TOTAL_S_DDR4_SIZE ) +#define S_DDR4_LIMIT ( S_DDR4_START + S_DDR4_SIZE ) + +#endif /* __REGION_DEFS_H__ */ diff --git a/dsppp/linker_scripts/SSE-300-MPS3/region_limits.h b/dsppp/linker_scripts/SSE-300-MPS3/region_limits.h new file mode 100644 index 000000000..0d600a363 --- /dev/null +++ b/dsppp/linker_scripts/SSE-300-MPS3/region_limits.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018-2022 Arm Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __REGION_LIMITS_H__ +#define __REGION_LIMITS_H__ + +/* ************************************************************** + * WARNING: this file is parsed both by the C/C++ compiler + * and the linker. As a result the syntax must be valid not only + * for C/C++ but for the linker scripts too. + * Beware of the following limitations: + * - LD (GCC linker) requires white space around operators. + * - UL postfix for macros is not suported by the linker script + ****************************************************************/ + +/* Secure Code */ +#define S_ROM_ALIAS (0x10000000) /* ITCM_BASE_S */ +#define TOTAL_S_ROM_SIZE (0x00080000) /* 512 kB */ + +/* Secure Data */ +#define S_RAM_ALIAS (0x30000000) /* DTCM_BASE_S */ +#define TOTAL_S_RAM_SIZE (0x00080000) /* 512 kB */ + +/* Secure DDR4 */ +#define S_DDR4_ALIAS (0x70000000) /* DDR4_BLK1_BASE_S */ +#define TOTAL_S_DDR4_SIZE (0x10000000) /* 256 MB */ + +/* Heap and Stack sizes for secure and nonsecure applications */ +#define HEAP_SIZE (0x00038000) /* 1 KiB */ +#define STACK_SIZE (0x00002000) /* 1 KiB */ + +#endif /* __REGION_LIMITS_H__ */ diff --git a/dsppp/linker_scripts/ac6_m0p_mps3_s.sct b/dsppp/linker_scripts/ac6_m0p_mps3_s.sct new file mode 100644 index 000000000..4d6e579d0 --- /dev/null +++ b/dsppp/linker_scripts/ac6_m0p_mps3_s.sct @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/*---------------------------------------------------------------------------- + Scatter File Definitions definition + *----------------------------------------------------------------------------*/ + +LR_ROM0 __ROM0_BASE __ROM0_SIZE { + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + ER_CMSE_VENEER __ROM0_BASE+__ROM0_SIZE -__ROM0_SIZE { + *(Veneer$$CMSE) + } + #define ER_CMSE_VENEER_SIZE AlignExpr(ImageLength(ER_CMSE_VENEER), 8) +#else + #define ER_CMSE_VENEER_SIZE 0 +#endif + + ER_ROM0 __ROM0_BASE (__ROM0_SIZE - ER_CMSE_VENEER_SIZE) { + *.o (RESET, +First) + *(InRoot$$Sections) + *(+RO +XO) + } + + RW_NOINIT __RAM0_BASE UNINIT (__RAM0_SIZE - __HEAP_SIZE - __STACK_SIZE) { + *(.bss.noinit) + } + + RW_RAM0 AlignExpr(+0, 8) (__RAM0_SIZE - __HEAP_SIZE - __STACK_SIZE - AlignExpr(ImageLength(RW_NOINIT), 8)) { + *(+RW +ZI) + } + +#if __HEAP_SIZE > 0 + ARM_LIB_HEAP (AlignExpr(+0, 8)) EMPTY __HEAP_SIZE { ; Reserve empty region for heap + } +#endif + + ARM_LIB_STACK (__RAM0_BASE + __RAM0_SIZE - __STACKSEAL_SIZE) EMPTY -__STACK_SIZE { ; Reserve empty region for stack + } + +#if __STACKSEAL_SIZE > 0 + STACKSEAL +0 EMPTY 8 { ; Reserve empty region for stack seal immediately after stack + } +#endif + +#if __RAM1_SIZE > 0 + RW_RAM1 __RAM1_BASE __RAM1_SIZE { + .ANY (+RW +ZI) + } +#endif + +#if __RAM2_SIZE > 0 + RW_RAM2 __RAM2_BASE __RAM2_SIZE { + .ANY (+RW +ZI) + } +#endif + +#if __RAM3_SIZE > 0 + RW_RAM3 __RAM3_BASE __RAM3_SIZE { + .ANY (+RW +ZI) + } +#endif +} + +#if __ROM1_SIZE > 0 +LR_ROM1 __ROM1_BASE __ROM1_SIZE { + ER_ROM1 +0 __ROM1_SIZE { + .ANY (+RO +XO) + } +} +#endif + +#if __ROM2_SIZE > 0 +LR_ROM2 __ROM2_BASE __ROM2_SIZE { + ER_ROM2 +0 __ROM2_SIZE { + .ANY (+RO +XO) + } +} +#endif + +#if __ROM3_SIZE > 0 +LR_ROM3 __ROM3_BASE __ROM3_SIZE { + ER_ROM3 +0 __ROM3_SIZE { + .ANY (+RO +XO) + } +} +#endif diff --git a/dsppp/linker_scripts/ac6_m4_mps3_s.sct b/dsppp/linker_scripts/ac6_m4_mps3_s.sct new file mode 100644 index 000000000..4d6e579d0 --- /dev/null +++ b/dsppp/linker_scripts/ac6_m4_mps3_s.sct @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/*---------------------------------------------------------------------------- + Scatter File Definitions definition + *----------------------------------------------------------------------------*/ + +LR_ROM0 __ROM0_BASE __ROM0_SIZE { + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + ER_CMSE_VENEER __ROM0_BASE+__ROM0_SIZE -__ROM0_SIZE { + *(Veneer$$CMSE) + } + #define ER_CMSE_VENEER_SIZE AlignExpr(ImageLength(ER_CMSE_VENEER), 8) +#else + #define ER_CMSE_VENEER_SIZE 0 +#endif + + ER_ROM0 __ROM0_BASE (__ROM0_SIZE - ER_CMSE_VENEER_SIZE) { + *.o (RESET, +First) + *(InRoot$$Sections) + *(+RO +XO) + } + + RW_NOINIT __RAM0_BASE UNINIT (__RAM0_SIZE - __HEAP_SIZE - __STACK_SIZE) { + *(.bss.noinit) + } + + RW_RAM0 AlignExpr(+0, 8) (__RAM0_SIZE - __HEAP_SIZE - __STACK_SIZE - AlignExpr(ImageLength(RW_NOINIT), 8)) { + *(+RW +ZI) + } + +#if __HEAP_SIZE > 0 + ARM_LIB_HEAP (AlignExpr(+0, 8)) EMPTY __HEAP_SIZE { ; Reserve empty region for heap + } +#endif + + ARM_LIB_STACK (__RAM0_BASE + __RAM0_SIZE - __STACKSEAL_SIZE) EMPTY -__STACK_SIZE { ; Reserve empty region for stack + } + +#if __STACKSEAL_SIZE > 0 + STACKSEAL +0 EMPTY 8 { ; Reserve empty region for stack seal immediately after stack + } +#endif + +#if __RAM1_SIZE > 0 + RW_RAM1 __RAM1_BASE __RAM1_SIZE { + .ANY (+RW +ZI) + } +#endif + +#if __RAM2_SIZE > 0 + RW_RAM2 __RAM2_BASE __RAM2_SIZE { + .ANY (+RW +ZI) + } +#endif + +#if __RAM3_SIZE > 0 + RW_RAM3 __RAM3_BASE __RAM3_SIZE { + .ANY (+RW +ZI) + } +#endif +} + +#if __ROM1_SIZE > 0 +LR_ROM1 __ROM1_BASE __ROM1_SIZE { + ER_ROM1 +0 __ROM1_SIZE { + .ANY (+RO +XO) + } +} +#endif + +#if __ROM2_SIZE > 0 +LR_ROM2 __ROM2_BASE __ROM2_SIZE { + ER_ROM2 +0 __ROM2_SIZE { + .ANY (+RO +XO) + } +} +#endif + +#if __ROM3_SIZE > 0 +LR_ROM3 __ROM3_BASE __ROM3_SIZE { + ER_ROM3 +0 __ROM3_SIZE { + .ANY (+RO +XO) + } +} +#endif diff --git a/dsppp/linker_scripts/ac6_sse300_mps3_s.sct b/dsppp/linker_scripts/ac6_sse300_mps3_s.sct new file mode 100644 index 000000000..6712e5cc0 --- /dev/null +++ b/dsppp/linker_scripts/ac6_sse300_mps3_s.sct @@ -0,0 +1,79 @@ + +;/* +; * Copyright (c) 2018-2021 Arm Limited. All rights reserved. +; * +; * Licensed under the Apache License, Version 2.0 (the "License"); +; * you may not use this file except in compliance with the License. +; * You may obtain a copy of the License at +; * +; * http://www.apache.org/licenses/LICENSE-2.0 +; * +; * Unless required by applicable law or agreed to in writing, software +; * distributed under the License is distributed on an "AS IS" BASIS, +; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; * See the License for the specific language governing permissions and +; * limitations under the License. +; * +; */ + +;#include "region_defs.h" + +LR_CODE S_CODE_START { + ER_CODE S_CODE_START { + *.o (RESET +First) + .ANY (+RO) + /* different test vectors */ + * (InRoot$$Sections) + } + + /* + * Place the CMSE Veneers (containing the SG instruction) after the code, in + * a separate 32 bytes aligned region so that the SAU can programmed to just + * set this region as Non-Secure Callable. 
The maximum size of this + * executable region makes it only used the space left over by the ER_CODE + * region so that you can rely on code+veneer size combined will not exceed + * the S_CODE_SIZE value. We also substract from the available space the + * area used to align this section on 32 bytes boundary (for SAU conf). + */ + ER_CODE_CMSE_VENEER +0 ALIGN 32 { + *(Veneer$$CMSE) + } + /* + * This dummy region ensures that the next one will be aligned on a 32 bytes + * boundary, so that the following region will not be mistakenly configured + * as Non-Secure Callable by the SAU. + */ + ER_CODE_CMSE_VENEER_DUMMY +0 ALIGN 32 EMPTY 0 {} + + /* This empty, zero long execution region is here to mark the limit address + * of the last execution region that is allocated in SRAM. + */ + CODE_WATERMARK +0 EMPTY 0x0 { + } + /* Make sure that the sections allocated in the SRAM does not exceed the + * size of the SRAM available. + */ + ScatterAssert(ImageLimit(CODE_WATERMARK) <= S_CODE_START + S_CODE_SIZE) + + ER_DATA S_DATA_START { + .ANY (+ZI +RW +RO-DATA) + } + + #if HEAP_SIZE > 0 + ARM_LIB_HEAP +0 ALIGN 8 EMPTY HEAP_SIZE { ; Reserve empty region for heap + } + #endif + + ARM_LIB_STACK +0 ALIGN 32 EMPTY STACK_SIZE { ; Reserve empty region for stack + } + + /* This empty, zero long execution region is here to mark the limit address + * of the last execution region that is allocated in SRAM. + */ + SRAM_WATERMARK +0 EMPTY 0x0 { + } + /* Make sure that the sections allocated in the SRAM does not exceed the + * size of the SRAM available. + */ + ScatterAssert(ImageLimit(SRAM_WATERMARK) <= S_DATA_START + S_DATA_SIZE) +} diff --git a/dsppp/linker_scripts/ac6_sse310_mps3_s.sct b/dsppp/linker_scripts/ac6_sse310_mps3_s.sct new file mode 100644 index 000000000..0650639f8 --- /dev/null +++ b/dsppp/linker_scripts/ac6_sse310_mps3_s.sct @@ -0,0 +1,60 @@ + +;/* +; * Copyright (c) 2018-2021 Arm Limited +; * +; * Licensed under the Apache License, Version 2.0 (the "License"); +; * you may not use this file except in compliance with the License. +; * You may obtain a copy of the License at +; * +; * http://www.apache.org/licenses/LICENSE-2.0 +; * +; * Unless required by applicable law or agreed to in writing, software +; * distributed under the License is distributed on an "AS IS" BASIS, +; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; * See the License for the specific language governing permissions and +; * limitations under the License. +; * +; */ + + + +LR_CODE S_CODE_START { + ER_CODE S_CODE_START { + *.o (RESET +First) + .ANY (+RO) + /* different test vectors */ + * (InRoot$$Sections) + } + + /* This empty, zero long execution region is here to mark the limit address + * of the last execution region that is allocated in SRAM. + */ + CODE_WATERMARK +0 EMPTY 0x0 { + } + /* Make sure that the sections allocated in the SRAM does not exceed the + * size of the SRAM available. + */ + ScatterAssert(ImageLimit(CODE_WATERMARK) <= S_CODE_START + S_CODE_SIZE) + + ER_DATA S_DATA_START { + .ANY (+ZI +RW +RO-DATA) + } + + #if HEAP_SIZE > 0 + ARM_LIB_HEAP +0 ALIGN 8 EMPTY HEAP_SIZE { ; Reserve empty region for heap + } + #endif + + ARM_LIB_STACK +0 ALIGN 32 EMPTY STACK_SIZE { ; Reserve empty region for stack + } + + /* This empty, zero long execution region is here to mark the limit address + * of the last execution region that is allocated in SRAM. 
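+ * (The ScatterAssert just below compares ImageLimit(SRAM_WATERMARK) against
+ * S_DATA_START + S_DATA_SIZE, so the link fails if the data sections
+ * overflow the available SRAM.)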
+ */ + SRAM_WATERMARK +0 EMPTY 0x0 { + } + /* Make sure that the sections allocated in the SRAM does not exceed the + * size of the SRAM available. + */ + ScatterAssert(ImageLimit(SRAM_WATERMARK) <= S_DATA_START + S_DATA_SIZE) +} diff --git a/dsppp/linker_scripts/clang_m0p_mps3.ld b/dsppp/linker_scripts/clang_m0p_mps3.ld new file mode 100644 index 000000000..40f955c16 --- /dev/null +++ b/dsppp/linker_scripts/clang_m0p_mps3.ld @@ -0,0 +1,353 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright © 2019 Keith Packard + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx!w) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx!w) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx!w) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx!w) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (w!rx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (w!rx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (w!rx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (w!rx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +ENTRY(Reset_Handler) + +PHDRS +{ + text PT_LOAD; + ram PT_LOAD; + ram_init PT_LOAD; + tls PT_TLS; +} + +SECTIONS +{ + .init : { + KEEP (*(.vectors)) + KEEP (*(.text.init.enter)) + KEEP (*(.data.init.enter)) + KEEP (*(SORT_BY_NAME(.init) SORT_BY_NAME(.init.*))) + } >ROM0 AT>ROM0 :text + + .text : { + + /* code */ + *(.text.unlikely .text.unlikely.*) + *(.text.startup .text.startup.*) + *(.text .text.* .opd .opd.*) + *(.gnu.linkonce.t.*) + KEEP (*(.fini .fini.*)) + __text_end = .; + + PROVIDE (__etext = __text_end); + PROVIDE (_etext = __text_end); + PROVIDE (etext = __text_end); + + /* read-only data */ + *(.rdata) + *(.rodata .rodata.*) + *(.gnu.linkonce.r.*) + + *(.srodata.cst16) + *(.srodata.cst8) + *(.srodata.cst4) + *(.srodata.cst2) + *(.srodata .srodata.*) + *(.data.rel.ro .data.rel.ro.*) + *(.got .got.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + /* lists of constructors and destructors */ + PROVIDE_HIDDEN ( __preinit_array_start = . ); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN ( __preinit_array_end = . ); + + PROVIDE_HIDDEN ( __init_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array .ctors)) + PROVIDE_HIDDEN ( __init_array_end = . ); + + PROVIDE_HIDDEN ( __fini_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array .dtors)) + PROVIDE_HIDDEN ( __fini_array_end = . ); + + } >ROM0 AT>ROM0 :text + + .toc : { + *(.toc .toc.*) + } >ROM0 AT>ROM0 :text + + /* additional sections when compiling with C++ exception support */ + + .except_ordered : { + *(.gcc_except_table *.gcc_except_table.*) + KEEP (*(.eh_frame .eh_frame.*)) + *(.ARM.extab* .gnu.linkonce.armextab.*) + } >ROM0 AT>ROM0 :text + + .except_unordered : { + . = ALIGN(8); + + PROVIDE(__exidx_start = .); + *(.ARM.exidx*) + PROVIDE(__exidx_end = .); + } >ROM0 AT>ROM0 :text + + + /* + * Data values which are preserved across reset + */ + .preserve (NOLOAD) : { + PROVIDE(__preserve_start__ = .); + KEEP(*(SORT_BY_NAME(.preserve.*))) + KEEP(*(.preserve)) + PROVIDE(__preserve_end__ = .); + } >RAM0 AT>RAM0 :ram + + .data : { + *(.data .data.*) + *(.gnu.linkonce.d.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + PROVIDE( __global_pointer$ = . 
+ 0x800 ); + *(.sdata .sdata.* .sdata2.*) + *(.gnu.linkonce.s.*) + } >RAM0 AT>ROM0 :ram_init + PROVIDE(__data_start = ADDR(.data)); + PROVIDE(__data_source = LOADADDR(.data)); + + /* Thread local initialized data. This gets + * space allocated as it is expected to be placed + * in ram to be used as a template for TLS data blocks + * allocated at runtime. We're slightly abusing that + * by placing the data in flash where it will be copied + * into the allocate ram addresses by the existing + * data initialization code in crt0 + */ + .tdata : { + *(.tdata .tdata.* .gnu.linkonce.td.*) + PROVIDE(__data_end = .); + PROVIDE(__tdata_end = .); + } >RAM0 AT>ROM0 :tls :ram_init + PROVIDE( __tls_base = ADDR(.tdata)); + PROVIDE( __tdata_start = ADDR(.tdata)); + PROVIDE( __tdata_source = LOADADDR(.tdata) ); + PROVIDE( __tdata_source_end = LOADADDR(.tdata) + SIZEOF(.tdata) ); + PROVIDE( __data_source_end = __tdata_source_end ); + PROVIDE( __tdata_size = SIZEOF(.tdata) ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata),ALIGNOF(.tbss)) ); + + PROVIDE( __edata = __data_end ); + PROVIDE( _edata = __data_end ); + PROVIDE( edata = __data_end ); + PROVIDE( __data_size = __data_end - __data_start ); + PROVIDE( __data_source_size = __data_source_end - __data_source ); + + .tbss (NOLOAD) : { + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + PROVIDE( __tls_end = . ); + PROVIDE( __tbss_end = . ); + } >RAM0 AT>RAM0 :tls :ram + PROVIDE( __bss_start = ADDR(.tbss)); + PROVIDE( __tbss_start = ADDR(.tbss)); + PROVIDE( __tbss_offset = ADDR(.tbss) - ADDR(.tdata) ); + PROVIDE( __tbss_size = SIZEOF(.tbss) ); + PROVIDE( __tls_size = __tls_end - __tls_base ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + PROVIDE( __arm64_tls_tcb_offset = MAX(16, __tls_align) ); + + /* + * The linker special cases .tbss segments which are + * identified as segments which are not loaded and are + * thread_local. + * + * For these segments, the linker does not advance 'dot' + * across them. We actually need memory allocated for tbss, + * so we create a special segment here just to make room + */ + /* + .tbss_space (NOLOAD) : { + . = ADDR(.tbss); + . = . + SIZEOF(.tbss); + } >RAM0 AT>RAM0 :ram + */ + + .bss (NOLOAD) : { + *(.sbss*) + *(.gnu.linkonce.sb.*) + *(.bss .bss.*) + *(.gnu.linkonce.b.*) + *(COMMON) + + /* Align the heap */ + . = ALIGN(8); + __bss_end = .; + } >RAM0 AT>RAM0 :ram + PROVIDE( __non_tls_bss_start = ADDR(.bss) ); + PROVIDE( __end = __bss_end ); + PROVIDE( _end = __bss_end ); + PROVIDE( end = __bss_end ); + PROVIDE( __bss_size = __bss_end - __bss_start ); + + /* Make the rest of memory available for heap storage */ + PROVIDE (__heap_start = __end); +#ifdef __HEAP_SIZE + PROVIDE (__heap_end = __heap_start + __HEAP_SIZE); + PROVIDE (__heap_size = __HEAP_SIZE); +#else + PROVIDE (__heap_end = __stack - __STACK_SIZE); + PROVIDE (__heap_size = __heap_end - __heap_start); +#endif + .heap (NOLOAD) : { + . += __heap_size; + } >RAM0 :ram + + /* Define a stack region to make sure it fits in memory */ + PROVIDE(__stack = ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE); + PROVIDE(__stack_limit = __stack - __STACK_SIZE); + .stack (__stack_limit) (NOLOAD) : { + . += __STACK_SIZE; + } >RAM0 :ram + +#if __STACKSEAL_SIZE > 0 + PROVIDE(__stack_seal = __stack) + .stackseal (__stack) (NOLOAD) : + { + . 
+= __STACKSEAL_SIZE; + } >RAM0 :ram +#endif + + /* Throw away C++ exception handling information */ + + /* + + /DISCARD/ : { + *(.note .note.*) + *(.eh_frame .eh_frame.*) + *(.ARM.extab* .gnu.linkonce.armextab.*) + *(.ARM.exidx*) + } + + */ + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1. */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions. */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2. */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2. */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions. */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3. */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF 5. */ + .debug_addr 0 : { *(.debug_addr) } + .debug_line_str 0 : { *(.debug_line_str) } + .debug_loclists 0 : { *(.debug_loclists) } + .debug_macro 0 : { *(.debug_macro) } + .debug_names 0 : { *(.debug_names) } + .debug_rnglists 0 : { *(.debug_rnglists) } + .debug_str_offsets 0 : { *(.debug_str_offsets) } + .debug_sup 0 : { *(.debug_sup) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } +} +/* + * Check that sections that are copied from flash to RAM have matching + * padding, so that a single memcpy() of __data_size copies the correct bytes. + */ +ASSERT( __data_size == __data_source_size, + "ERROR: .data/.tdata flash size does not match RAM size"); diff --git a/dsppp/linker_scripts/clang_m4_mps3.ld b/dsppp/linker_scripts/clang_m4_mps3.ld new file mode 100644 index 000000000..40f955c16 --- /dev/null +++ b/dsppp/linker_scripts/clang_m4_mps3.ld @@ -0,0 +1,353 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright © 2019 Keith Packard + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx!w) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx!w) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx!w) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx!w) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (w!rx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (w!rx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (w!rx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (w!rx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +ENTRY(Reset_Handler) + +PHDRS +{ + text PT_LOAD; + ram PT_LOAD; + ram_init PT_LOAD; + tls PT_TLS; +} + +SECTIONS +{ + .init : { + KEEP (*(.vectors)) + KEEP (*(.text.init.enter)) + KEEP (*(.data.init.enter)) + KEEP (*(SORT_BY_NAME(.init) SORT_BY_NAME(.init.*))) + } >ROM0 AT>ROM0 :text + + .text : { + + /* code */ + *(.text.unlikely .text.unlikely.*) + *(.text.startup .text.startup.*) + *(.text .text.* .opd .opd.*) + *(.gnu.linkonce.t.*) + KEEP (*(.fini .fini.*)) + __text_end = .; + + PROVIDE (__etext = __text_end); + PROVIDE (_etext = __text_end); + PROVIDE (etext = __text_end); + + /* read-only data */ + *(.rdata) + *(.rodata .rodata.*) + *(.gnu.linkonce.r.*) + + *(.srodata.cst16) + *(.srodata.cst8) + *(.srodata.cst4) + *(.srodata.cst2) + *(.srodata .srodata.*) + *(.data.rel.ro .data.rel.ro.*) + *(.got .got.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + /* lists of constructors and destructors */ + PROVIDE_HIDDEN ( __preinit_array_start = . ); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN ( __preinit_array_end = . ); + + PROVIDE_HIDDEN ( __init_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array .ctors)) + PROVIDE_HIDDEN ( __init_array_end = . ); + + PROVIDE_HIDDEN ( __fini_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array .dtors)) + PROVIDE_HIDDEN ( __fini_array_end = . 
); + + } >ROM0 AT>ROM0 :text + + .toc : { + *(.toc .toc.*) + } >ROM0 AT>ROM0 :text + + /* additional sections when compiling with C++ exception support */ + + .except_ordered : { + *(.gcc_except_table *.gcc_except_table.*) + KEEP (*(.eh_frame .eh_frame.*)) + *(.ARM.extab* .gnu.linkonce.armextab.*) + } >ROM0 AT>ROM0 :text + + .except_unordered : { + . = ALIGN(8); + + PROVIDE(__exidx_start = .); + *(.ARM.exidx*) + PROVIDE(__exidx_end = .); + } >ROM0 AT>ROM0 :text + + + /* + * Data values which are preserved across reset + */ + .preserve (NOLOAD) : { + PROVIDE(__preserve_start__ = .); + KEEP(*(SORT_BY_NAME(.preserve.*))) + KEEP(*(.preserve)) + PROVIDE(__preserve_end__ = .); + } >RAM0 AT>RAM0 :ram + + .data : { + *(.data .data.*) + *(.gnu.linkonce.d.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + PROVIDE( __global_pointer$ = . + 0x800 ); + *(.sdata .sdata.* .sdata2.*) + *(.gnu.linkonce.s.*) + } >RAM0 AT>ROM0 :ram_init + PROVIDE(__data_start = ADDR(.data)); + PROVIDE(__data_source = LOADADDR(.data)); + + /* Thread local initialized data. This gets + * space allocated as it is expected to be placed + * in ram to be used as a template for TLS data blocks + * allocated at runtime. We're slightly abusing that + * by placing the data in flash where it will be copied + * into the allocate ram addresses by the existing + * data initialization code in crt0 + */ + .tdata : { + *(.tdata .tdata.* .gnu.linkonce.td.*) + PROVIDE(__data_end = .); + PROVIDE(__tdata_end = .); + } >RAM0 AT>ROM0 :tls :ram_init + PROVIDE( __tls_base = ADDR(.tdata)); + PROVIDE( __tdata_start = ADDR(.tdata)); + PROVIDE( __tdata_source = LOADADDR(.tdata) ); + PROVIDE( __tdata_source_end = LOADADDR(.tdata) + SIZEOF(.tdata) ); + PROVIDE( __data_source_end = __tdata_source_end ); + PROVIDE( __tdata_size = SIZEOF(.tdata) ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata),ALIGNOF(.tbss)) ); + + PROVIDE( __edata = __data_end ); + PROVIDE( _edata = __data_end ); + PROVIDE( edata = __data_end ); + PROVIDE( __data_size = __data_end - __data_start ); + PROVIDE( __data_source_size = __data_source_end - __data_source ); + + .tbss (NOLOAD) : { + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + PROVIDE( __tls_end = . ); + PROVIDE( __tbss_end = . ); + } >RAM0 AT>RAM0 :tls :ram + PROVIDE( __bss_start = ADDR(.tbss)); + PROVIDE( __tbss_start = ADDR(.tbss)); + PROVIDE( __tbss_offset = ADDR(.tbss) - ADDR(.tdata) ); + PROVIDE( __tbss_size = SIZEOF(.tbss) ); + PROVIDE( __tls_size = __tls_end - __tls_base ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + PROVIDE( __arm64_tls_tcb_offset = MAX(16, __tls_align) ); + + /* + * The linker special cases .tbss segments which are + * identified as segments which are not loaded and are + * thread_local. + * + * For these segments, the linker does not advance 'dot' + * across them. We actually need memory allocated for tbss, + * so we create a special segment here just to make room + */ + /* + .tbss_space (NOLOAD) : { + . = ADDR(.tbss); + . = . + SIZEOF(.tbss); + } >RAM0 AT>RAM0 :ram + */ + + .bss (NOLOAD) : { + *(.sbss*) + *(.gnu.linkonce.sb.*) + *(.bss .bss.*) + *(.gnu.linkonce.b.*) + *(COMMON) + + /* Align the heap */ + . 
= ALIGN(8); + __bss_end = .; + } >RAM0 AT>RAM0 :ram + PROVIDE( __non_tls_bss_start = ADDR(.bss) ); + PROVIDE( __end = __bss_end ); + PROVIDE( _end = __bss_end ); + PROVIDE( end = __bss_end ); + PROVIDE( __bss_size = __bss_end - __bss_start ); + + /* Make the rest of memory available for heap storage */ + PROVIDE (__heap_start = __end); +#ifdef __HEAP_SIZE + PROVIDE (__heap_end = __heap_start + __HEAP_SIZE); + PROVIDE (__heap_size = __HEAP_SIZE); +#else + PROVIDE (__heap_end = __stack - __STACK_SIZE); + PROVIDE (__heap_size = __heap_end - __heap_start); +#endif + .heap (NOLOAD) : { + . += __heap_size; + } >RAM0 :ram + + /* Define a stack region to make sure it fits in memory */ + PROVIDE(__stack = ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE); + PROVIDE(__stack_limit = __stack - __STACK_SIZE); + .stack (__stack_limit) (NOLOAD) : { + . += __STACK_SIZE; + } >RAM0 :ram + +#if __STACKSEAL_SIZE > 0 + PROVIDE(__stack_seal = __stack) + .stackseal (__stack) (NOLOAD) : + { + . += __STACKSEAL_SIZE; + } >RAM0 :ram +#endif + + /* Throw away C++ exception handling information */ + + /* + + /DISCARD/ : { + *(.note .note.*) + *(.eh_frame .eh_frame.*) + *(.ARM.extab* .gnu.linkonce.armextab.*) + *(.ARM.exidx*) + } + + */ + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1. */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions. */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2. */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2. */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions. */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3. */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF 5. */ + .debug_addr 0 : { *(.debug_addr) } + .debug_line_str 0 : { *(.debug_line_str) } + .debug_loclists 0 : { *(.debug_loclists) } + .debug_macro 0 : { *(.debug_macro) } + .debug_names 0 : { *(.debug_names) } + .debug_rnglists 0 : { *(.debug_rnglists) } + .debug_str_offsets 0 : { *(.debug_str_offsets) } + .debug_sup 0 : { *(.debug_sup) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } +} +/* + * Check that sections that are copied from flash to RAM have matching + * padding, so that a single memcpy() of __data_size copies the correct bytes. 
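+ * (__data_size is the run-time VMA extent of .data/.tdata and
+ * __data_source_size the matching load-time LMA extent in ROM0; the ASSERT
+ * below makes the link fail if the two ever differ.)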
+ */ +ASSERT( __data_size == __data_source_size, + "ERROR: .data/.tdata flash size does not match RAM size"); diff --git a/dsppp/linker_scripts/clang_sse300_mps3.sct b/dsppp/linker_scripts/clang_sse300_mps3.sct new file mode 100644 index 000000000..62352193b --- /dev/null +++ b/dsppp/linker_scripts/clang_sse300_mps3.sct @@ -0,0 +1,364 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright © 2019 Keith Packard + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx!w) : ORIGIN = S_CODE_START, LENGTH = S_CODE_SIZE + 0x000000 +#if __ROM1_SIZE > 0 + ROM1 (rx!w) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx!w) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx!w) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (w!rx) : ORIGIN = S_DATA_START, LENGTH = S_DATA_SIZE + 0x000000 +#if __RAM1_SIZE > 0 + RAM1 (w!rx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (w!rx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (w!rx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +ENTRY(Reset_Handler) + +PHDRS +{ + text PT_LOAD; + ram PT_LOAD; + ram_init PT_LOAD; + tls PT_TLS; +} + +SECTIONS +{ + .init : { + KEEP (*(.vectors)) + KEEP (*(.text.init.enter)) + KEEP (*(.data.init.enter)) + KEEP (*(SORT_BY_NAME(.init) SORT_BY_NAME(.init.*))) + } >ROM0 AT>ROM0 :text + + .text : { + + /* code */ + *(.text.unlikely .text.unlikely.*) + *(.text.startup .text.startup.*) + *(.text .text.* .opd .opd.*) + *(.gnu.linkonce.t.*) + KEEP (*(.fini .fini.*)) + __text_end = .; + + PROVIDE (__etext = __text_end); + PROVIDE (_etext = __text_end); + PROVIDE (etext = __text_end); + + *(.gnu.linkonce.r.*) + + + + *(.srodata.cst16) + *(.srodata.cst8) + *(.srodata.cst4) + *(.srodata.cst2) + *(.srodata .srodata.*) + *(.data.rel.ro .data.rel.ro.*) + *(.got .got.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + /* lists of constructors and destructors */ + PROVIDE_HIDDEN ( __preinit_array_start = . ); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN ( __preinit_array_end = . ); + + PROVIDE_HIDDEN ( __init_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array .ctors)) + PROVIDE_HIDDEN ( __init_array_end = . ); + + PROVIDE_HIDDEN ( __fini_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array .dtors)) + PROVIDE_HIDDEN ( __fini_array_end = . ); + + } >ROM0 AT>ROM0 :text + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + .veneers : + { + . = ALIGN(32); + KEEP(*(.gnu.sgstubs)) + } > ROM0 AT>ROM0 :text +#endif + + .toc : { + *(.toc .toc.*) + } >ROM0 AT>ROM0 :text + + /* additional sections when compiling with C++ exception support */ + + .except_ordered : { + *(.gcc_except_table *.gcc_except_table.*) + KEEP (*(.eh_frame .eh_frame.*)) + *(.ARM.extab* .gnu.linkonce.armextab.*) + } >ROM0 AT>ROM0 :text + + .except_unordered : { + . 
= ALIGN(8); + + PROVIDE(__exidx_start = .); + *(.ARM.exidx*) + PROVIDE(__exidx_end = .); + } >ROM0 AT>ROM0 :text + + + /* + * Data values which are preserved across reset + */ + .preserve (NOLOAD) : { + PROVIDE(__preserve_start__ = .); + KEEP(*(SORT_BY_NAME(.preserve.*))) + KEEP(*(.preserve)) + PROVIDE(__preserve_end__ = .); + } >RAM0 AT>RAM0 :ram + + .data : { + *(.data .data.*) + *(.gnu.linkonce.d.*) + + /* read-only data */ + *(.rdata) + *(.rodata .rodata.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + PROVIDE( __global_pointer$ = . + 0x800 ); + *(.sdata .sdata.* .sdata2.*) + *(.gnu.linkonce.s.*) + } >RAM0 AT>ROM0 :ram_init + PROVIDE(__data_start = ADDR(.data)); + PROVIDE(__data_source = LOADADDR(.data)); + + /* Thread local initialized data. This gets + * space allocated as it is expected to be placed + * in ram to be used as a template for TLS data blocks + * allocated at runtime. We're slightly abusing that + * by placing the data in flash where it will be copied + * into the allocate ram addresses by the existing + * data initialization code in crt0 + */ + .tdata : { + *(.tdata .tdata.* .gnu.linkonce.td.*) + PROVIDE(__data_end = .); + PROVIDE(__tdata_end = .); + } >RAM0 AT>ROM0 :tls :ram_init + PROVIDE( __tls_base = ADDR(.tdata)); + PROVIDE( __tdata_start = ADDR(.tdata)); + PROVIDE( __tdata_source = LOADADDR(.tdata) ); + PROVIDE( __tdata_source_end = LOADADDR(.tdata) + SIZEOF(.tdata) ); + PROVIDE( __data_source_end = __tdata_source_end ); + PROVIDE( __tdata_size = SIZEOF(.tdata) ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata),ALIGNOF(.tbss)) ); + + PROVIDE( __edata = __data_end ); + PROVIDE( _edata = __data_end ); + PROVIDE( edata = __data_end ); + PROVIDE( __data_size = __data_end - __data_start ); + PROVIDE( __data_source_size = __data_source_end - __data_source ); + + .tbss (NOLOAD) : { + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + PROVIDE( __tls_end = . ); + PROVIDE( __tbss_end = . ); + } >RAM0 AT>RAM0 :tls :ram + PROVIDE( __bss_start = ADDR(.tbss)); + PROVIDE( __tbss_start = ADDR(.tbss)); + PROVIDE( __tbss_offset = ADDR(.tbss) - ADDR(.tdata) ); + PROVIDE( __tbss_size = SIZEOF(.tbss) ); + PROVIDE( __tls_size = __tls_end - __tls_base ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + PROVIDE( __arm64_tls_tcb_offset = MAX(16, __tls_align) ); + + /* + * The linker special cases .tbss segments which are + * identified as segments which are not loaded and are + * thread_local. + * + * For these segments, the linker does not advance 'dot' + * across them. We actually need memory allocated for tbss, + * so we create a special segment here just to make room + */ + /* + .tbss_space (NOLOAD) : { + . = ADDR(.tbss); + . = . + SIZEOF(.tbss); + } >RAM0 AT>RAM0 :ram + */ + + .bss (NOLOAD) : { + *(.sbss*) + *(.gnu.linkonce.sb.*) + *(.bss .bss.*) + *(.gnu.linkonce.b.*) + *(COMMON) + + /* Align the heap */ + . 
= ALIGN(8); + __bss_end = .; + } >RAM0 AT>RAM0 :ram + PROVIDE( __non_tls_bss_start = ADDR(.bss) ); + PROVIDE( __end = __bss_end ); + PROVIDE( _end = __bss_end ); + PROVIDE( end = __bss_end ); + PROVIDE( __bss_size = __bss_end - __bss_start ); + + /* Make the rest of memory available for heap storage */ + PROVIDE (__heap_start = __end); +#ifdef HEAP_SIZE + PROVIDE (__heap_end = __heap_start + HEAP_SIZE); + PROVIDE (__heap_size = HEAP_SIZE); +#else + PROVIDE (__heap_end = __stack - STACK_SIZE); + PROVIDE (__heap_size = __heap_end - __heap_start); +#endif + .heap (NOLOAD) : { + . += __heap_size; + } >RAM0 :ram + + /* Define a stack region to make sure it fits in memory */ + PROVIDE(__stack = ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE); + PROVIDE(__stack_limit = __stack - STACK_SIZE); + .stack (__stack_limit) (NOLOAD) : { + . += STACK_SIZE; + } >RAM0 :ram + +#if __STACKSEAL_SIZE > 0 + PROVIDE(__stack_seal = __stack); + .stackseal (__stack) (NOLOAD) : + { + . += __STACKSEAL_SIZE; + } >RAM0 :ram +#endif + + /* Throw away C++ exception handling information */ + + /* + + /DISCARD/ : { + *(.note .note.*) + *(.eh_frame .eh_frame.*) + *(.ARM.extab* .gnu.linkonce.armextab.*) + *(.ARM.exidx*) + } + + */ + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1. */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions. */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2. */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2. */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions. */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3. */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF 5. */ + .debug_addr 0 : { *(.debug_addr) } + .debug_line_str 0 : { *(.debug_line_str) } + .debug_loclists 0 : { *(.debug_loclists) } + .debug_macro 0 : { *(.debug_macro) } + .debug_names 0 : { *(.debug_names) } + .debug_rnglists 0 : { *(.debug_rnglists) } + .debug_str_offsets 0 : { *(.debug_str_offsets) } + .debug_sup 0 : { *(.debug_sup) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } +} +/* + * Check that sections that are copied from flash to RAM have matching + * padding, so that a single memcpy() of __data_size copies the correct bytes. 
+ */ +ASSERT( __data_size == __data_source_size, + "ERROR: .data/.tdata flash size does not match RAM size"); diff --git a/dsppp/linker_scripts/clang_sse310_mps3.sct b/dsppp/linker_scripts/clang_sse310_mps3.sct new file mode 100644 index 000000000..3f4877162 --- /dev/null +++ b/dsppp/linker_scripts/clang_sse310_mps3.sct @@ -0,0 +1,363 @@ +/* + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright © 2019 Keith Packard + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx!w) : ORIGIN = S_CODE_START, LENGTH = S_CODE_SIZE + 0x000000 +#if __ROM1_SIZE > 0 + ROM1 (rx!w) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx!w) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx!w) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (w!rx) : ORIGIN = S_DATA_START, LENGTH = S_DATA_SIZE + 0x000000 +#if __RAM1_SIZE > 0 + RAM1 (w!rx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (w!rx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (w!rx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +ENTRY(Reset_Handler) + +PHDRS +{ + text PT_LOAD; + ram PT_LOAD; + ram_init PT_LOAD; + tls PT_TLS; +} + +SECTIONS +{ + .init : { + KEEP (*(.vectors)) + KEEP (*(.text.init.enter)) + KEEP (*(.data.init.enter)) + KEEP (*(SORT_BY_NAME(.init) SORT_BY_NAME(.init.*))) + } >ROM0 AT>ROM0 :text + + .text : { + + /* code */ + *(.text.unlikely .text.unlikely.*) + *(.text.startup .text.startup.*) + *(.text .text.* .opd .opd.*) + *(.gnu.linkonce.t.*) + KEEP (*(.fini .fini.*)) + __text_end = .; + + PROVIDE (__etext = __text_end); + PROVIDE (_etext = __text_end); + PROVIDE (etext = __text_end); + + *(.gnu.linkonce.r.*) + + + *(.srodata.cst16) + *(.srodata.cst8) + *(.srodata.cst4) + *(.srodata.cst2) + *(.srodata .srodata.*) + *(.data.rel.ro .data.rel.ro.*) + *(.got .got.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + /* lists of constructors and destructors */ + PROVIDE_HIDDEN ( __preinit_array_start = . ); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN ( __preinit_array_end = . ); + + PROVIDE_HIDDEN ( __init_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array .ctors)) + PROVIDE_HIDDEN ( __init_array_end = . ); + + PROVIDE_HIDDEN ( __fini_array_start = . ); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array .dtors)) + PROVIDE_HIDDEN ( __fini_array_end = . ); + + } >ROM0 AT>ROM0 :text + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + .veneers : + { + . = ALIGN(32); + KEEP(*(.gnu.sgstubs)) + } > ROM0 AT>ROM0 :text +#endif + + .toc : { + *(.toc .toc.*) + } >ROM0 AT>ROM0 :text + + /* additional sections when compiling with C++ exception support */ + + .except_ordered : { + *(.gcc_except_table *.gcc_except_table.*) + KEEP (*(.eh_frame .eh_frame.*)) + *(.ARM.extab* .gnu.linkonce.armextab.*) + } >ROM0 AT>ROM0 :text + + .except_unordered : { + . 
= ALIGN(8); + + PROVIDE(__exidx_start = .); + *(.ARM.exidx*) + PROVIDE(__exidx_end = .); + } >ROM0 AT>ROM0 :text + + + /* + * Data values which are preserved across reset + */ + .preserve (NOLOAD) : { + PROVIDE(__preserve_start__ = .); + KEEP(*(SORT_BY_NAME(.preserve.*))) + KEEP(*(.preserve)) + PROVIDE(__preserve_end__ = .); + } >RAM0 AT>RAM0 :ram + + .data : { + *(.data .data.*) + *(.gnu.linkonce.d.*) + + /* read-only data */ + *(.rdata) + *(.rodata .rodata.*) + + /* Need to pre-align so that the symbols come after padding */ + . = ALIGN(8); + + PROVIDE( __global_pointer$ = . + 0x800 ); + *(.sdata .sdata.* .sdata2.*) + *(.gnu.linkonce.s.*) + } >RAM0 AT>ROM0 :ram_init + PROVIDE(__data_start = ADDR(.data)); + PROVIDE(__data_source = LOADADDR(.data)); + + /* Thread local initialized data. This gets + * space allocated as it is expected to be placed + * in ram to be used as a template for TLS data blocks + * allocated at runtime. We're slightly abusing that + * by placing the data in flash where it will be copied + * into the allocate ram addresses by the existing + * data initialization code in crt0 + */ + .tdata : { + *(.tdata .tdata.* .gnu.linkonce.td.*) + PROVIDE(__data_end = .); + PROVIDE(__tdata_end = .); + } >RAM0 AT>ROM0 :tls :ram_init + PROVIDE( __tls_base = ADDR(.tdata)); + PROVIDE( __tdata_start = ADDR(.tdata)); + PROVIDE( __tdata_source = LOADADDR(.tdata) ); + PROVIDE( __tdata_source_end = LOADADDR(.tdata) + SIZEOF(.tdata) ); + PROVIDE( __data_source_end = __tdata_source_end ); + PROVIDE( __tdata_size = SIZEOF(.tdata) ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata),ALIGNOF(.tbss)) ); + + PROVIDE( __edata = __data_end ); + PROVIDE( _edata = __data_end ); + PROVIDE( edata = __data_end ); + PROVIDE( __data_size = __data_end - __data_start ); + PROVIDE( __data_source_size = __data_source_end - __data_source ); + + .tbss (NOLOAD) : { + *(.tbss .tbss.* .gnu.linkonce.tb.*) + *(.tcommon) + PROVIDE( __tls_end = . ); + PROVIDE( __tbss_end = . ); + } >RAM0 AT>RAM0 :tls :ram + PROVIDE( __bss_start = ADDR(.tbss)); + PROVIDE( __tbss_start = ADDR(.tbss)); + PROVIDE( __tbss_offset = ADDR(.tbss) - ADDR(.tdata) ); + PROVIDE( __tbss_size = SIZEOF(.tbss) ); + PROVIDE( __tls_size = __tls_end - __tls_base ); + PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) ); + PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) ); + PROVIDE( __arm64_tls_tcb_offset = MAX(16, __tls_align) ); + + /* + * The linker special cases .tbss segments which are + * identified as segments which are not loaded and are + * thread_local. + * + * For these segments, the linker does not advance 'dot' + * across them. We actually need memory allocated for tbss, + * so we create a special segment here just to make room + */ + /* + .tbss_space (NOLOAD) : { + . = ADDR(.tbss); + . = . + SIZEOF(.tbss); + } >RAM0 AT>RAM0 :ram + */ + + .bss (NOLOAD) : { + *(.sbss*) + *(.gnu.linkonce.sb.*) + *(.bss .bss.*) + *(.gnu.linkonce.b.*) + *(COMMON) + + /* Align the heap */ + . 
= ALIGN(8); + __bss_end = .; + } >RAM0 AT>RAM0 :ram + PROVIDE( __non_tls_bss_start = ADDR(.bss) ); + PROVIDE( __end = __bss_end ); + PROVIDE( _end = __bss_end ); + PROVIDE( end = __bss_end ); + PROVIDE( __bss_size = __bss_end - __bss_start ); + + /* Make the rest of memory available for heap storage */ + PROVIDE (__heap_start = __end); +#ifdef HEAP_SIZE + PROVIDE (__heap_end = __heap_start + HEAP_SIZE); + PROVIDE (__heap_size = HEAP_SIZE); +#else + PROVIDE (__heap_end = __stack - STACK_SIZE); + PROVIDE (__heap_size = __heap_end - __heap_start); +#endif + .heap (NOLOAD) : { + . += __heap_size; + } >RAM0 :ram + + /* Define a stack region to make sure it fits in memory */ + PROVIDE(__stack = ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE); + PROVIDE(__stack_limit = __stack - STACK_SIZE); + .stack (__stack_limit) (NOLOAD) : { + . += STACK_SIZE; + } >RAM0 :ram + +#if __STACKSEAL_SIZE > 0 + PROVIDE(__stack_seal = __stack); + .stackseal (__stack) (NOLOAD) : + { + . += __STACKSEAL_SIZE; + } >RAM0 :ram +#endif + + /* Throw away C++ exception handling information */ + + /* + + /DISCARD/ : { + *(.note .note.*) + *(.eh_frame .eh_frame.*) + *(.ARM.extab* .gnu.linkonce.armextab.*) + *(.ARM.exidx*) + } + + */ + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1. */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions. */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2. */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2. */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions. */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3. */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF 5. */ + .debug_addr 0 : { *(.debug_addr) } + .debug_line_str 0 : { *(.debug_line_str) } + .debug_loclists 0 : { *(.debug_loclists) } + .debug_macro 0 : { *(.debug_macro) } + .debug_names 0 : { *(.debug_names) } + .debug_rnglists 0 : { *(.debug_rnglists) } + .debug_str_offsets 0 : { *(.debug_str_offsets) } + .debug_sup 0 : { *(.debug_sup) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } +} +/* + * Check that sections that are copied from flash to RAM have matching + * padding, so that a single memcpy() of __data_size copies the correct bytes. 
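For reference, a minimal sketch of the picolibc-style startup copy that the assertion just below protects; the __data_* / __bss_* symbols are the ones PROVIDEd by this script, and the helper name is purely illustrative:

    #include <string.h>
    #include <stdint.h>

    /* Linker-defined symbols: their addresses carry the values computed above. */
    extern char __data_start[];    /* ADDR(.data)                */
    extern char __data_source[];   /* LOADADDR(.data)            */
    extern char __data_size[];     /* __data_end - __data_start  */
    extern char __bss_start[];     /* ADDR(.tbss)                */
    extern char __bss_size[];      /* __bss_end - __bss_start    */

    static void sketch_data_init(void)  /* illustrative, not the real crt0 entry */
    {
        /* One copy moves .data and the .tdata template together, which is why the
           flash-side and RAM-side padding of those sections must match. */
        memcpy(__data_start, __data_source, (uintptr_t)__data_size);

        /* .tbss/.bss are cleared rather than copied. */
        memset(__bss_start, 0, (uintptr_t)__bss_size);
    }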
+ */ +ASSERT( __data_size == __data_source_size, + "ERROR: .data/.tdata flash size does not match RAM size"); diff --git a/dsppp/linker_scripts/gcc_m0p_mps3.ld b/dsppp/linker_scripts/gcc_m0p_mps3.ld new file mode 100644 index 000000000..a018e5d4e --- /dev/null +++ b/dsppp/linker_scripts/gcc_m0p_mps3.ld @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (rwx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (rwx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (rwx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (rwx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > ROM0 + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + .gnu.sgstubs : + { + . 
= ALIGN(32); + } > ROM0 +#endif + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > ROM0 + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > ROM0 + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > ROM0 + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > ROM0 + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. + */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM0 AT > ROM0 + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. + */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM1 AT > ROM0 +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM0 AT > RAM0 + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM1 AT > RAM1 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM0 + + .stack (ORIGIN(RAM0) + LENGTH(RAM0) - __STACK_SIZE - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM0 + PROVIDE(__stack = __StackTop); + +#if __STACKSEAL_SIZE > 0 + .stackseal (ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackSeal = .; + . = . + 8; + . = ALIGN(8); + } > RAM0 +#endif + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/linker_scripts/gcc_m4_mps3.ld b/dsppp/linker_scripts/gcc_m4_mps3.ld new file mode 100644 index 000000000..a018e5d4e --- /dev/null +++ b/dsppp/linker_scripts/gcc_m4_mps3.ld @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx) : ORIGIN = __ROM0_BASE, LENGTH = __ROM0_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (rwx) : ORIGIN = __RAM0_BASE, LENGTH = __RAM0_SIZE +#if __RAM1_SIZE > 0 + RAM1 (rwx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (rwx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (rwx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > ROM0 + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + .gnu.sgstubs : + { + . = ALIGN(32); + } > ROM0 +#endif + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > ROM0 + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > ROM0 + __exidx_end = .; + + .copy.table : + { + . 
= ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > ROM0 + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > ROM0 + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. + */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM0 AT > ROM0 + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. + */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM1 AT > ROM0 +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM0 AT > RAM0 + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM1 AT > RAM1 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM0 + + .stack (ORIGIN(RAM0) + LENGTH(RAM0) - __STACK_SIZE - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM0 + PROVIDE(__stack = __StackTop); + +#if __STACKSEAL_SIZE > 0 + .stackseal (ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackSeal = .; + . = . + 8; + . = ALIGN(8); + } > RAM0 +#endif + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/linker_scripts/gcc_sse300_mps3.ld b/dsppp/linker_scripts/gcc_sse300_mps3.ld new file mode 100644 index 000000000..e00625ea6 --- /dev/null +++ b/dsppp/linker_scripts/gcc_sse300_mps3.ld @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx) : ORIGIN = S_CODE_START, LENGTH = S_CODE_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (rw) : ORIGIN = S_DATA_START, LENGTH = S_DATA_SIZE +#if __RAM1_SIZE > 0 + RAM1 (rw) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (rw) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (rw) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + /* *(.rodata*) */ + + KEEP(*(.eh_frame*)) + } > ROM0 + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + .gnu.sgstubs : + { + . = ALIGN(32); + } > ROM0 +#endif + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > ROM0 + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > ROM0 + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > ROM0 + + .zero.table : + { + . 
= ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > ROM0 + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. + */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + *(.rodata*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM0 AT > ROM0 + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. + */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM1 AT > ROM0 +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM0 AT > RAM0 + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM1 AT > RAM1 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM0 + + .stack (ORIGIN(RAM0) + LENGTH(RAM0) - STACK_SIZE - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM0 + PROVIDE(__stack = __StackTop); + +#if __STACKSEAL_SIZE > 0 + .stackseal (ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackSeal = .; + . = . + 8; + . = ALIGN(8); + } > RAM0 +#endif + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/linker_scripts/gcc_sse310_mps3_s.ld b/dsppp/linker_scripts/gcc_sse310_mps3_s.ld new file mode 100644 index 000000000..7bea37e1a --- /dev/null +++ b/dsppp/linker_scripts/gcc_sse310_mps3_s.ld @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2023 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
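Each entry in the .copy.table / .zero.table sections of the CMSIS-style scripts above is a (LOADADDR, ADDR, SIZEOF/4) triplet or an (ADDR, SIZEOF/4) pair of words; the CMSIS C startup (__cmsis_start in cmsis_gcc.h) walks whatever entries are present roughly as in this sketch:

    #include <stdint.h>

    typedef struct { uint32_t const *src; uint32_t *dest; uint32_t wlen; } copy_table_t;
    typedef struct { uint32_t *dest; uint32_t wlen; } zero_table_t;

    extern const copy_table_t __copy_table_start__, __copy_table_end__;
    extern const zero_table_t __zero_table_start__, __zero_table_end__;

    static void sketch_cmsis_start(void)   /* simplified stand-in for __cmsis_start */
    {
        for (const copy_table_t *t = &__copy_table_start__; t < &__copy_table_end__; ++t)
            for (uint32_t i = 0U; i < t->wlen; ++i)
                t->dest[i] = t->src[i];        /* copy .data (and any .data2, ...)  */

        for (const zero_table_t *t = &__zero_table_start__; t < &__zero_table_end__; ++t)
            for (uint32_t i = 0U; i < t->wlen; ++i)
                t->dest[i] = 0U;               /* clear any additional bss sections */
    }

The SIZEOF(...)/4 in the scripts is what makes wlen a word count rather than a byte count.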
+ */ + +/* ---------------------------------------------------------------------------- + Stack seal size definition + *----------------------------------------------------------------------------*/ +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) +#define __STACKSEAL_SIZE ( 8 ) +#else +#define __STACKSEAL_SIZE ( 0 ) +#endif + +/* ---------------------------------------------------------------------------- + Memory definition + *----------------------------------------------------------------------------*/ +MEMORY +{ + ROM0 (rx) : ORIGIN = S_CODE_START, LENGTH = S_CODE_SIZE +#if __ROM1_SIZE > 0 + ROM1 (rx) : ORIGIN = __ROM1_BASE, LENGTH = __ROM1_SIZE +#endif +#if __ROM2_SIZE > 0 + ROM2 (rx) : ORIGIN = __ROM2_BASE, LENGTH = __ROM2_SIZE +#endif +#if __ROM3_SIZE > 0 + ROM3 (rx) : ORIGIN = __ROM3_BASE, LENGTH = __ROM3_SIZE +#endif + + RAM0 (rwx) : ORIGIN = S_DATA_START, LENGTH = S_DATA_SIZE +#if __RAM1_SIZE > 0 + RAM1 (rwx) : ORIGIN = __RAM1_BASE, LENGTH = __RAM1_SIZE +#endif +#if __RAM2_SIZE > 0 + RAM2 (rwx) : ORIGIN = __RAM2_BASE, LENGTH = __RAM2_SIZE +#endif +#if __RAM3_SIZE > 0 + RAM3 (rwx) : ORIGIN = __RAM3_BASE, LENGTH = __RAM3_SIZE +#endif +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext (deprecated) + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + /* *(.rodata*) */ + + KEEP(*(.eh_frame*)) + } > ROM0 + +#if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U) + .gnu.sgstubs : + { + . = ALIGN(32); + } > ROM0 +#endif + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > ROM0 + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > ROM0 + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + + LONG (LOADADDR(.data)) + LONG (ADDR(.data)) + LONG (SIZEOF(.data) / 4) + + /* Add each additional data section here */ +/* + LONG (LOADADDR(.data2)) + LONG (ADDR(.data2)) + LONG (SIZEOF(.data2) / 4) +*/ + __copy_table_end__ = .; + } > ROM0 + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + +/* .bss initialization to zero is already done during C Run-Time Startup. + LONG (ADDR(.bss)) + LONG (SIZEOF(.bss) / 4) +*/ + + /* Add each additional bss section here */ +/* + LONG (ADDR(.bss2)) + LONG (SIZEOF(.bss2) / 4) +*/ + __zero_table_end__ = .; + } > ROM0 + + /* + * This __etext variable is kept for backward compatibility with older, + * ASM based startup files. 
+ */ + PROVIDE(__etext = LOADADDR(.data)); + + .data : ALIGN(4) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + *(.rodata*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM0 AT > ROM0 + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to assure proper + * initialization during startup. + */ +/* + .data2 : ALIGN(4) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM1 AT > ROM0 +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM0 AT > RAM0 + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to assure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM1 AT > RAM1 +*/ + + .heap (NOLOAD) : + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + . = . + HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM0 + + .stack (ORIGIN(RAM0) + LENGTH(RAM0) - STACK_SIZE - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM0 + PROVIDE(__stack = __StackTop); + +#if __STACKSEAL_SIZE > 0 + .stackseal (ORIGIN(RAM0) + LENGTH(RAM0) - __STACKSEAL_SIZE) (NOLOAD) : + { + . = ALIGN(8); + __StackSeal = .; + . = . + 8; + . 
= ALIGN(8); + } > RAM0 +#endif + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/dsppp/main.c b/dsppp/main.c new file mode 100644 index 000000000..3a7cdcefd --- /dev/null +++ b/dsppp/main.c @@ -0,0 +1,93 @@ +#include "test_config.h" +#include "RTE_Components.h" +#include CMSIS_device_header +#include "stdio.h" + +#if defined(MPS3) +#include "cmsis_driver_config.h" +#include "stdout_USART.h" +#endif + +#if defined(RTE_Compiler_EventRecorder) +#include "EventRecorder.h" +#endif + +#include "test.h" + + +int main(void) +{ +#if defined(MPS3) + stdout_init(); +#endif + +#if defined(RTE_Compiler_EventRecorder) && !defined(MPS3) + uint32_t res = EventRecorderInitialize (EventRecordAll, 1); + if (!res) + { + printf("Error enabling event recorder\n"); + goto endThread; + } +#endif + + #if !defined(SERIAL_DUMP) + printf("\033c\r\n\r\n"); + #endif + printf("\r\n\r\n\r\n----------------------\r\n"); + printf(__TIME__"\r\n"); + #if defined(ARMCM55) + printf("M55\r\n"); + #endif + #if defined(ARMCM4_FP) + printf("ARMCM4_FP\r\n"); + #endif + #if defined(ARMCM0P) + printf("ARMCM0P\r\n"); + #endif + + #if defined(MPS3) + printf("MPS3\r\n"); + #endif + #if defined(VHT) + printf("VHT\r\n"); + #endif + #if defined(IPSS) + printf("IPSS\r\n"); + #endif + + #if defined(DOT_TEST) + dot_test(); + #endif + #if defined(VECTOR_TEST) + vector_test(); + #endif + #if defined(ROW_TEST) + row_test(); + #endif + #if defined(COL_TEST) + col_test(); + #endif + #if defined(MATRIX_TEST) + matrix_test(); + #endif + #if 0 + filter_test(); + #endif + #if defined(FUSION_TEST) + fusion_test(); + #endif + //debug_test(); + + memory_pool_stats(); + +#if defined(MPS3) + while(1); +#else +#if defined(RTE_Compiler_EventRecorder) +endThread: +#endif + while(0); +#endif +} + + diff --git a/dsppp/mps3run.py b/dsppp/mps3run.py new file mode 100644 index 000000000..799e7145f --- /dev/null +++ b/dsppp/mps3run.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +from pyocd.core.helpers import ConnectHelper +from pyocd.flash.file_programmer import FileProgrammer +from pyocd.debug.elf.symbols import ELFSymbolProvider +from pyocd.core.target import Target +from pyocd.debug.elf.elf import ELFBinaryFile +from pyocd.flash.loader import MemoryLoader +import getserial + +import time +import os.path + +import serial +import re +import io + +import logging +logging.basicConfig(level=logging.ERROR) + + + +def run_out(exe_path,uuid): + lines= "" + + with ConnectHelper.session_with_chosen_probe(unique_id = uuid) as session: + print("Connecting") + board = session.board + target = board.target + #flash = target.memory_map.get_boot_memory() + + # Load firmware into device. + FileProgrammer(session).program(exe_path) + + #target.elf = elf_path + + + #provider = ELFSymbolProvider(target.elf) + #main_addr = provider.get_symbol_value("main") + #print("main() address: 0x%X" % main_addr) + + ## Set breakpoint. + #target.set_breakpoint(main_addr) + + #target.reset() + lines = getserial.read_stdout(target) + return("".join(lines)) + #target.resume() + ## + ## + #target.reset() + ## + ### Wait until breakpoint is hit. + #while target.get_state() != Target.State.HALTED: + # pass + ## + #pc = target.read_core_register("pc") + #print("pc: 0x%X" % pc) + # + #target.remove_breakpoint() + # + #target.resume() + +if __name__ == "__main__": + path = "." 
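Stepping back to the secure (CMSE) builds: each linker script above reserves an 8-byte .stackseal region at the top of RAM0 when __ARM_FEATURE_CMSE == 3 and exports its start as __StackSeal (or __stack_seal in the picolibc-style script). A sketch of what the secure startup is expected to do with it, using the 0xFEF5EDA5 seal pattern written by CMSIS-Core's __TZ_set_STACKSEAL_S helper:

    #include <stdint.h>

    #if defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3U)
    extern uint64_t __StackSeal;          /* start of the 8-byte .stackseal region */

    static void sketch_seal_stack(void)   /* illustrative helper name */
    {
        /* Two words of 0xFEF5EDA5 directly above the secure stack top. */
        *(volatile uint64_t *)&__StackSeal = 0xFEF5EDA5FEF5EDA5ULL;
    }
    #endif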
+ out = "cprj/out/test/MPS3-Corstone-300" + bin = "Release/test.axf" + + axf_path = os.path.join(path,out,bin) + + #axf=ELFBinaryFile(axf_path) + #axf.close() + + lines = run_out(axf_path,"L85986697A") + + + print(lines) + + \ No newline at end of file diff --git a/dsppp/process.py b/dsppp/process.py new file mode 100644 index 000000000..be7aff9fd --- /dev/null +++ b/dsppp/process.py @@ -0,0 +1,137 @@ +import re +import xlsxwriter + +START = 0 +IN_TEST = 1 +MEASURE = 2 +CYCLE_CPP = 3 +CYCLE_C = 4 +ERROR = 5 + +line_nb = 0 +state = START +dimensions = "?" + +cpp = 0 +c = 0 + +stats = {} + +with open("result.txt","r") as f: + lines = f.readlines() + for l in lines: + if line_nb >= 3: + if re.match('Error',l): + state = ERROR + continue + if state == ERROR: + state = IN_TEST + continue + if state == START: + if re.match(r'^[a-zA-Z]+.*$',l): + #print(l) + test_name = l.strip("\n") + state = IN_TEST + stats[test_name]=[] + continue + if state == IN_TEST: + if re.match(r'----',l): + state = MEASURE + continue + if re.match(r'^[a-zA-Z]+.*$',l): + state = IN_TEST + test_name = l.strip("\n") + stats[test_name]=[] + continue + if state == MEASURE: + dimensions = l.strip("\n") + state = CYCLE_CPP + continue + if state == CYCLE_CPP: + m = re.match(r'Cycle count = ([0-9]+)',l) + if m: + cpp = m.group(1) + state = CYCLE_C + continue + if state == CYCLE_C: + if re.match(r'----',l): + state = MEASURE + stats[test_name].append({"dim":dimensions,"cpp":cpp}) + continue + m = re.match(r'Cycle count = ([0-9]+)',l) + if m: + c = m.group(1) + state = IN_TEST + stats[test_name].append({"dim":dimensions,"cpp":cpp,"c":c}) + continue + else: + stats[test_name].append({"dim":dimensions,"cpp":cpp}) + state = IN_TEST + continue + + + + + + line_nb = line_nb + 1 + +dst="C:/Users/CHRFAV01/OneDrive - ARM/Documents/Presentations/CMSIS_Compute" + +def pos(row,col): + return(f"{chr(ord('A')+col)}{row}") + +for s in stats: + ns = re.sub(r'[ ]',"_",s) + ".xlsx" + print(ns) + workbook = xlsxwriter.Workbook(dst+"/"+ns) + worksheet = workbook.add_worksheet("Results") + line_nb = 0 + + title = workbook.add_format({'bold': True,'font_size':24}) + sub_title = workbook.add_format({'bold': True, + 'font_size':14, + 'align':"center", + 'bg_color':"#CCCCCC"}) + percent = workbook.add_format({'num_format': '0.00%'}) + dimEven = workbook.add_format({'bold': True,'bg_color':"#CCCCCC"}) + dimOdd = workbook.add_format({'bold': True,'bg_color':"#EEEEEE"}) + + worksheet.write(line_nb,0, s,title) + line_nb = line_nb + 1 + + worksheet.set_row(line_nb, 30) + worksheet.set_column("D:D", 30) + + if len(stats[s])==2: + worksheet.write(line_nb,0, 'dims',sub_title) + worksheet.write(line_nb,1, 'cpp',sub_title) + worksheet.write(line_nb, 2, 'CPP Improvement',sub_title) + + else: + worksheet.write(line_nb,0, 'dims',sub_title) + worksheet.write(line_nb,1, 'cpp',sub_title) + worksheet.write(line_nb,2, 'c',sub_title) + worksheet.write(line_nb, 3, 'CPP Improvement',sub_title) + + line_nb = line_nb + 1 + for x in stats[s]: + if (line_nb % 2 == 0): + dim = dimOdd + else: + dim = dimEven + if "c" in x: + worksheet.write(line_nb,0, x["dim"],dim) + worksheet.write(line_nb,1, float(x["cpp"])) + worksheet.write(line_nb,2, float(x["c"])) + worksheet.write(line_nb, 3, f"=(C{line_nb+1}-B{line_nb+1})/C{line_nb+1}",percent) + else: + worksheet.write(line_nb,0, x["dim"],dim) + worksheet.write(line_nb,1, float(x["cpp"])) + worksheet.write(line_nb, 2, f"=(C{line_nb+1}-B{line_nb+1})/C{line_nb+1}",percent) + + line_nb = line_nb + 1 + + + + workbook.close() + \ No newline at 
end of file diff --git a/dsppp/run_all.py b/dsppp/run_all.py new file mode 100644 index 000000000..9c15f71b8 --- /dev/null +++ b/dsppp/run_all.py @@ -0,0 +1,390 @@ +import re +import argparse +import os.path +import itertools +import subprocess +import sys +import mps3run + +from colorama import init,Fore, Back, Style + +try: + os.mkdir("ac6_results") +except: + pass + +try: + os.mkdir("gcc_results") +except: + pass + +try: + os.mkdir("clang_results") +except: + pass + +DEBUG = False +ERROR_OCCURED = False + +all_errors = [] + +def printTitle(s): + print("\n" + Fore.GREEN + Style.BRIGHT + s + Style.RESET_ALL) + +def printSubTitle(s): + print(Fore.YELLOW + Style.BRIGHT + s + Style.RESET_ALL) + +def printError(s): + print(Fore.RED + Style.BRIGHT + s + Style.RESET_ALL+"\n") + +class Result: + def __init__(self,msg,error=False): + self._error = error + self._msg = msg + + @property + def error(self): + return self._error + + @property + def msg(self): + return self._msg + +def is_error(res,test_name,err): + if res.error: + printError("Error") + all_errors.append(test_name) + print(test_name,file=err) + print(res.msg,file=err) + print("--------------",file=err) + return(True) + return(False) + +def run(args,mustPrint=False,dumpStdErr=True,timeout=20,printCmd=False): + global ERROR_OCCURED + global DEBUG + try: + if DEBUG or printCmd: + print(" ".join(args)) + result=subprocess.run(args,text=True,capture_output=True,timeout=timeout) + if result.returncode !=0 : + ERROR_OCCURED = True + if dumpStdErr: + return(Result(result.stderr + "\n\nSTDOUT:\n\n" + result.stdout,error=True)) + else: + return(Result(result.stdout,error=True)) + + if mustPrint: + print(result.stdout) + return(Result(result.stdout)) + except Exception as e: + printError("Exception occured") + ERROR_OCCURED = True + return(Result(str(e),error=True)) + +parser = argparse.ArgumentParser(description='Parse test description') +parser.add_argument('-c', nargs='?',type = str, default="M55",help="M55/M4/M0") +parser.add_argument('-p', nargs='?',type = str, default="VHT",help="VHT/MPS3") +parser.add_argument('-a', action='store_true', help="Generate allocator definitions") +parser.add_argument('-i', action='store_true', help="Refresh global allocator index") +parser.add_argument('-b', action='store_true', help="Only benchmarks") +parser.add_argument('-d', action='store_true', help="Dry run") +parser.add_argument('-g', nargs='?',type = str, default="AC6",help="AC6 / CLANG / GCC") +parser.add_argument('-u', nargs='?',type = str, default="L85986697A",help="Debug UUID") + +args = parser.parse_args() + +init() + +if args.a: + printTitle("Mode allocator generations") + +if args.i: + printTitle("Allocator test index refresh") + +NAME_TO_BOARD = { + "M55": "Corstone-300", + "Corstone-300": "Corstone-300", + "M4": "M4", + "M0" : "M0P" +} + +def results(): + if args.g == "AC6": + return("ac6_results") + + if args.g == "GCC": + return("gcc_results") + + if args.g == "CLANG": + return("clang_results") + + print(f"Compiler {args.g} not known") + exit(1) + +def target_name(): + return(f"{args.p}-{NAME_TO_BOARD[args.c]}") + +def cmd_args(): + # cbuild -O cprj test.csolution.yml -r --toolchain AC6 -c test.Release+MPS3-Corstone-300 + toolchain = args.g + target = f"test.Release+{target_name()}" + + command = ["-O", "cprj", + "test.csolution.yml", + "--toolchain", toolchain, + "-c", target] + + return(command) + + + +if args.g == "AC6": + ext = ".axf" +else: + ext = ".elf" + +fvp = {"M55":"C:\\Keil_v5\\ARM\\VHT\\VHT_Corstone_SSE-300_Ethos-U55.exe", + 
"M4":"C:\\Keil_v5\\ARM\\VHT\\VHT_MPS2_Cortex-M4.exe", + "M0":"C:\\Keil_v5\\ARM\\VHT\\VHT_MPS2_Cortex-M0plus.exe"} + +TESTS=["DOT_TEST", + "VECTOR_TEST", + "ROW_TEST", + "COL_TEST", + "MATRIX_TEST", + "FUSION_TEST" + ] + +# Some tests are too big (code size) and needs to be decomposed +# They contain SUBTEST1, SUBTEST2 ... #if in the code +# This script must know how many subtests are defined in each test +# suite +# No need to define an entry in this dictionary when no +# subtest is defined +SUBTESTS = {"MATRIX_TEST":19} +# Subtests that are only for testing and not benchmarks +ONLY_TESTS = {"MATRIX_TEST":[3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]} + +def is_only_test(n,i): + if n[0] in ONLY_TESTS: + return(i in ONLY_TESTS[n[0]]) + return False + +DATATYPES = ["F64_DT", + "F32_DT", + "F16_DT", + "Q31_DT", + "Q15_DT", + "Q7_DT" + ] + +MODE = ["STATIC_TEST", + "DYNAMIC_TEST" + ] + +TESTS=["DOT_TEST","VECTOR_TEST"] +DATATYPES=["F32_DT"] +MODE = ["STATIC_TEST"] + +all_tests = list(itertools.product(TESTS,DATATYPES,MODE)) + + + +ALLOC = "#define POOL_ALLOCATOR" +if args.a: + # Stat allocator enabled and we do stats on VHT CS300 only + ALLOC = "//#define POOL_ALLOCATOR" + args.c = "M55" + args.p = "VHT" + +BENCH = "//#define ONLY_BENCHMARKS" +if args.b: + BENCH = "#define ONLY_BENCHMARKS" + +HEADER = f"""#ifndef TEST_CONFIG_H +#define TEST_CONFIG_H + +{ALLOC} +{BENCH} + +#define %s +#define %s +#define %s +%s + +#endif +""" + + + +def out_path(): + return(os.path.join("cprj","out","test",target_name(),"Release","test"+ ext)) + +def configure_and_build_test(test_name,test,err,subtest,first): + if subtest is not None: + subteststr = f"#define SUBTEST{subtest}" + else: + subteststr = "" + with open("test_config.h","w") as c: + print(HEADER % (test + (subteststr,)),file=c) + if first: + res = run(["cbuild"] + cmd_args() + ["-r","--update-rte"],timeout=600,printCmd=True) + else: + res = run(["cbuild"] +cmd_args(),timeout=600,printCmd=True) + if not is_error(res,test_name,err): + if DEBUG: + print(res.msg) + return(True) + return(False) + +def process_allocator_data(test_name,test,msg,subtest): + lines = msg.splitlines() + state = 0 + alloc_cpp = [] + alloc_h = [] + for l in lines: + if re.match(r"^ALLOC_POOL.*$",l): + alloc_cpp.append(l.strip()) + if re.match(r"^POOL.*$",l): + alloc_h.append(l.strip()) + if subtest is not None: + HEADER=f"#if defined({test[0]}) && defined({test[1]}) && defined({test[2]}) && defined(SUBTEST{subtest})" + else: + HEADER=f"#if defined({test[0]}) && defined({test[1]}) && defined({test[2]})" + # Gen h + with open(os.path.join("allocation",test_name)+".h","w") as h: + print(HEADER,file=h) + for l in alloc_h: + print(l,file=h) + print("#endif",file=h) + + # Gen cpp + with open(os.path.join("allocation",test_name)+".cpp","w") as h: + print(HEADER,file=h) + for l in alloc_cpp: + print(l,file=h) + print("#endif",file=h) + +def process_bench(test_name,test,msg,subtest): + global DEBUG + lines = msg.splitlines() + test_name = args.p +"_" + args.c + "_" + test_name + if DEBUG: + print(os.path.join(results(),test_name)+".txt") + with open(os.path.join(results(),test_name)+".txt","w") as h: + for l in lines: + print(l.rstrip(),file=h) + + +def process_result(test_name,test,msg,subtest): + printSubTitle("Process result") + if args.a: + process_allocator_data(test_name,test,msg,subtest) + else: + process_bench(test_name,test,msg,subtest) + +def runVHT(test_name,test,err,subtest): + core = args.c + target = target_name() + config = os.path.join("fvp_configs",target) + ".txt" + 
#print(target) + #print(config) + if core == "M55": + exe = "cpu0=" + out_path() + else: + exe = out_path() + res=run([fvp[core],"-f",config,"-a",exe]) + if not is_error(res,test_name,err): + process_result(test_name,test,res.msg,subtest) + +def runMPS3(test_name,test,err,subtest): + lines="" + res = None + try: + exe = out_path() + lines = mps3run.run_out(exe,args.u) + res = Result(lines) + except Exception as e: + res = Result(str(e),error = True) + if not is_error(res,test_name,err): + process_result(test_name,test,res.msg,subtest) + +def runATest(test,file_err,nb,NB_MAX,current_nb_axf,nb_axf,first=True,subtest=None): + global DEBUG + if subtest is not None: + maxsub = SUBTESTS[test[0]] + test_name=f"{test[0]}_{test[1]}_{test[2]}_{subtest}" + printTitle(test_name + f" : AXF {current_nb_axf} / {nb_axf}, TEST {nb}/{NB_MAX} (subtest {subtest}/{maxsub})") + else: + test_name=f"{test[0]}_{test[1]}_{test[2]}" + printTitle(test_name + f" : AXF {current_nb_axf} / {nb_axf}, TEST {nb}/{NB_MAX}") + if args.d: + return + printSubTitle("Configure and build") + if configure_and_build_test(test_name,test,file_err,subtest,first): + printSubTitle("Run") + if args.p == "VHT": + runVHT(test_name,test,file_err,subtest) + if args.p == "MPS3" and args.c == "M55": + runMPS3(test_name,test,file_err,subtest) + +nb_axf = 0 +for test in all_tests: + if test[0] in SUBTESTS: + for subtestnbb in range(SUBTESTS[test[0]]): + if not args.b or not is_only_test(test,subtestnbb+1): + nb_axf = nb_axf + 1 + else: + nb_axf = nb_axf + 1 +print(f"Number of axf to test = {nb_axf}") + +with open(os.path.join(results(),"errors.txt"),"w") as err: + # Generate include for allocations + if args.a or args.i: + with open(os.path.join("allocation","all.h"),"w") as fh: + for test in all_tests: + if test[0] in SUBTESTS: + for subtestnbb in range(SUBTESTS[test[0]]): + test_name=f"{test[0]}_{test[1]}_{test[2]}_{subtestnbb+1}" + print(f"#include \"{test_name}.h\"",file=fh) + else: + test_name=f"{test[0]}_{test[1]}_{test[2]}" + print(f"#include \"{test_name}.h\"",file=fh) + + with open(os.path.join("allocation","all.cpp"),"w") as fc: + for test in all_tests: + if test[0] in SUBTESTS: + for subtestnbb in range(SUBTESTS[test[0]]): + test_name=f"{test[0]}_{test[1]}_{test[2]}_{subtestnbb+1}" + print(f"#include \"{test_name}.cpp\"",file=fc) + else: + test_name=f"{test[0]}_{test[1]}_{test[2]}" + print(f"#include \"{test_name}.cpp\"",file=fc) + + if not args.i: + NB_MAX = len(all_tests) + nb = 1 # test cases + current_axf = 1 + first = True + for test in all_tests: + if test[0] in SUBTESTS: + for subtestnbb in range(SUBTESTS[test[0]]): + if not args.b or not is_only_test(test,subtestnbb+1): + runATest(test,err,nb,NB_MAX,current_axf,nb_axf,first,subtestnbb+1) + current_axf = current_axf + 1 + first = False + else: + runATest(test,err,nb,NB_MAX,current_axf,nb_axf,first) + current_axf = current_axf + 1 + first = False + nb = nb + 1 + + +if ERROR_OCCURED: + printError("Error in tests:") + for n in all_errors: + printError(n) + sys.exit("Error occurred") +else: + sys.exit(0) diff --git a/dsppp/test.cbuild-pack.yml b/dsppp/test.cbuild-pack.yml new file mode 100644 index 000000000..0f3c7dcfb --- /dev/null +++ b/dsppp/test.cbuild-pack.yml @@ -0,0 +1,17 @@ +cbuild-pack: + resolved-packs: + - resolved-pack: ARM::CMSIS@6.0.0 + selected-by: + - ARM::CMSIS@6.0.0 + - resolved-pack: ARM::CMSIS-Compiler@2.0.0 + selected-by: + - ARM::CMSIS-Compiler@2.0.0 + - resolved-pack: ARM::CMSIS-DSP@1.15.0 + selected-by: + - ARM::CMSIS-DSP@1.15.0 + - resolved-pack: 
ARM::Cortex_DFP@1.0.0 + selected-by: + - ARM::Cortex_DFP@1.0.0 + - resolved-pack: ARM::V2M_MPS3_SSE_300_BSP@1.4.0 + selected-by: + - ARM::V2M_MPS3_SSE_300_BSP@1.4.0 diff --git a/dsppp/test.cproject.yml b/dsppp/test.cproject.yml new file mode 100644 index 000000000..bdb280636 --- /dev/null +++ b/dsppp/test.cproject.yml @@ -0,0 +1,146 @@ +project: + groups: + - group: Tests + files: + - file: tests/matrix_test.cpp + - file: tests/dot_test.cpp + - file: tests/vector_test.cpp + - file: tests/row_test.cpp + - file: tests/col_test.cpp + #- file: tests/filter_test.cpp + - file: tests/fusion_test.cpp + #- file: tests/debug_test.cpp + #- file: tests/debug_test_external.cpp + - file: tests/common_tests.cpp + - file: tests/bench.c + - file: tests/cmsisdsp.cpp + - file: clang_sse300.c + for-context: + - +MPS3-Corstone-300 + for-compiler: + - CLANG + - group: App + files: + - file: main.c + - file: allocator.cpp + - group: IPSS + for-context: + - +IPSS-M0P + - +IPSS-M4 + files: + - file: IPSS/retarget_m0.c + for-context: + - +IPSS-M0P + - file: IPSS/retarget_m4.c + for-context: + - +IPSS-M4 + add-path: + - Include + - ../../../boost_1_84_0 + - . + - tests + + components: + - component: ARM::CMSIS:CORE + - component: ARM::CMSIS:DSP@1.15.0 + - component: ARM::Device:Startup&C Startup + for-context: + - +VHT-Corstone-300 + - +VHT-M0P + - +VHT-M4 + - +MPS3-Corstone-300 + - component: ARM::Device:Definition + for-context: + - +VHT-Corstone-300 + - +MPS3-Corstone-300 + - component: CMSIS-Compiler:CORE + for-context: + - +MPS3-Corstone-300 + - component: CMSIS-Compiler:STDOUT:Custom@1.0.0 + for-context: + - +MPS3-Corstone-300 + - component: ARM::Device:USART STDOUT + for-context: + - +MPS3-Corstone-300 + - component: ARM::CMSIS Driver:USART + for-context: + - +MPS3-Corstone-300 + - component: ARM::Device:Native Driver:SysCounter + for-context: + - +VHT-Corstone-300 + - +MPS3-Corstone-300 + - component: ARM::Device:Native Driver:SysTimer + for-context: + - +VHT-Corstone-300 + - +MPS3-Corstone-300 + - component: ARM::Device:Native Driver:Timeout + for-context: + - +VHT-Corstone-300 + - +MPS3-Corstone-300 + - component: ARM::Device:Native Driver:UART + for-context: + - +MPS3-Corstone-300 + + linker: + - script: linker_scripts/gcc_sse300_mps3.ld + for-context: + - +MPS3-Corstone-300 + - +VHT-Corstone-300 + for-compiler: GCC + + - script: linker_scripts/clang_sse300_mps3.sct + for-context: + - +MPS3-Corstone-300 + - +VHT-Corstone-300 + for-compiler: CLANG + + - script: linker_scripts/ac6_sse300_mps3_s.sct + for-context: + - +MPS3-Corstone-300 + - +VHT-Corstone-300 + for-compiler: AC6 + + - regions: linker_scripts/SSE-300-MPS3/region_defs.h + for-context: + - +MPS3-Corstone-300 + - +VHT-Corstone-300 + + - script: linker_scripts/gcc_m0p_mps3.ld + for-context: + - +VHT-M0P + for-compiler: GCC + + - script: linker_scripts/clang_m0p_mps3.ld + for-context: + - +VHT-M0P + for-compiler: CLANG + + - script: linker_scripts/ac6_m0p_mps3_s.sct + for-context: + - +VHT-M0P + for-compiler: AC6 + + - regions: linker_scripts/ARMCM0P/region_defs.h + for-context: + - +VHT-M0P + + - script: linker_scripts/gcc_m4_mps3.ld + for-context: + - +VHT-M4 + for-compiler: GCC + + - script: linker_scripts/clang_m4_mps3.ld + for-context: + - +VHT-M4 + for-compiler: CLANG + + - script: linker_scripts/ac6_m4_mps3_s.sct + for-context: + - +VHT-M4 + for-compiler: AC6 + + - regions: linker_scripts/ARMCM4/region_defs.h + for-context: + - +VHT-M4 + + diff --git a/dsppp/test.csolution.yml b/dsppp/test.csolution.yml new file mode 100644 index 
000000000..028759a73 --- /dev/null +++ b/dsppp/test.csolution.yml @@ -0,0 +1,108 @@ +solution: + compiler: AC6@6.22.0 + + language-C: c11 + language-CPP: c++17 + cdefault: + + packs: + - pack: ARM::CMSIS@6.0.0 + - pack: ARM::CMSIS-DSP@1.15.0 + - pack: ARM::V2M_MPS3_SSE_300_BSP@1.4.0 + - pack: ARM::CMSIS-Compiler@2.0.0 + - pack: ARM::Cortex_DFP@1.0.0 + + target-types: + - type: MPS3-Corstone-300 + device: ARM::SSE-300-MPS3 + board: ARM::V2M-MPS3-SSE-300-FVP + define: + - CORTEXM + - SSE300MPS3 + - MPS3 + - ARMCM55 + misc: + - for-compiler: GCC + C: + - -Wno-sign-compare + - -Wno-unused-parameter + CPP: + - -Wno-sign-compare + - -Wno-unused-parameter + Link: + - --specs=nosys.specs + - for-compiler: CLANG + C: + - -Wno-sign-compare + - -Wno-unused-parameter + CPP: + - -Wno-sign-compare + - -Wno-unused-parameter + Link: + - -lcrt0 + + - type: VHT-Corstone-300 + device: ARM::SSE-300-MPS3 + board: ARM::V2M-MPS3-SSE-300-FVP + define: + - CORTEXM + - ARMCM55 + - VHT + misc: + - for-compiler: GCC + Link: + - --specs=rdimon.specs + Library: + - -lrdimon + - for-compiler: CLANG + Link: + - -lcrt0-semihost + - -lsemihost + + - type: VHT-M0P + device: ARMCM0P + #board: uVision Simulator + define: + - CORTEXM + - ARMCM0P + - DISABLEFLOAT16 + - VHT + misc: + - for-compiler: GCC + Link: + - --specs=rdimon.specs + Library: + - -lrdimon + - for-compiler: CLANG + Link: + - -lcrt0-semihost + - -lsemihost + + - type: VHT-M4 + device: ARMCM4 + #board: uVision Simulator + define: + - CORTEXM + - ARMCM4_FP + - DISABLEFLOAT16 + - VHT + misc: + - for-compiler: GCC + Link: + - --specs=rdimon.specs + Library: + - -lrdimon + - for-compiler: CLANG + Link: + - -lcrt0-semihost + - -lsemihost + + build-types: + - type: Release + debug: on + + + projects: + - project: ./test.cproject.yml + - project: ./example.cproject.yml + \ No newline at end of file diff --git a/dsppp/test_config.h b/dsppp/test_config.h new file mode 100644 index 000000000..9349fbb83 --- /dev/null +++ b/dsppp/test_config.h @@ -0,0 +1,13 @@ +#ifndef TEST_CONFIG_H +#define TEST_CONFIG_H + +#define POOL_ALLOCATOR +//#define ONLY_BENCHMARKS + +#define VECTOR_TEST +#define F32_DT +#define STATIC_TEST + + +#endif + diff --git a/dsppp/tests/bench.c b/dsppp/tests/bench.c new file mode 100644 index 000000000..d4055c846 --- /dev/null +++ b/dsppp/tests/bench.c @@ -0,0 +1,3 @@ +#include "bench.h" + +uint32_t start_time, stop_time, cycle_count; diff --git a/dsppp/tests/bench.h b/dsppp/tests/bench.h new file mode 100644 index 000000000..b045ddeea --- /dev/null +++ b/dsppp/tests/bench.h @@ -0,0 +1,60 @@ +#if !defined(HOST) +#if !defined(NORTE) + #include "RTE_Components.h" + #include CMSIS_device_header +#endif +#endif + +#ifdef __cplusplus + +#include +#else +#include +#endif + +#ifdef __cplusplus + + +extern "C" +{ +#endif + +extern uint32_t start_time; +extern uint32_t stop_time; +extern uint32_t cycle_count; + +#if defined(HOST) +#define INIT_SYSTICK +#define START_CYCLE_MEASUREMENT +#define STOP_CYCLE_MEASUREMENT +#else +#define INIT_SYSTICK \ + SysTick->CTRL=0; \ + SysTick->LOAD=0xFFFFFFUL;\ + SysTick->VAL=0; \ + SysTick->CTRL=5; \ + while (SysTick->VAL==0)\ + ; + +#define START_CYCLE_MEASUREMENT \ + start_time= SysTick->VAL; + +#define STOP_CYCLE_MEASUREMENT \ + stop_time= SysTick->VAL; \ + SysTick->CTRL=0; \ + cycle_count = start_time - stop_time; \ + printf ("Cycle count = %d\r\n",(int)cycle_count); +#endif + +#if !defined(HOST) && (__ARM_ARCH > 6) +#define dbgInst(imm) __asm volatile("DBG %0\n\t" : :"Ir" ((imm)) ) +#define startSectionNB(num) dbgInst(((num) 
& 0x7) | 0x0) +#define stopSectionNB(num) dbgInst(((num) & 0x7) | 0x8) +#else +#define startSectionNB(num) +#define stopSectionNB(num) +#endif + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/dsppp/tests/cmsis_tests.h b/dsppp/tests/cmsis_tests.h new file mode 100644 index 000000000..ed5e4486b --- /dev/null +++ b/dsppp/tests/cmsis_tests.h @@ -0,0 +1,699 @@ +#pragma once + + +extern "C" { +#include "arm_math_types.h" +#include "arm_math_types_f16.h" +#include "dsp/filtering_functions.h" +#include "dsp/matrix_functions.h" +#include "dsp/matrix_functions_f16.h" + +} + +template +struct NameOfType; + +template +struct TailForTests; + +template<> +struct NameOfType +{ + constexpr static const char* v="float64_t"; + constexpr static const char* xls="f64"; +}; + +template<> +struct NameOfType +{ + constexpr static const char* v="float32_t"; + constexpr static const char* xls="f32"; + +}; + +#if !defined(DISABLEFLOAT16) +template<> +struct NameOfType +{ + constexpr static const char* v="float16_t"; + constexpr static const char* xls="f16"; + +}; +#endif + +template<> +struct NameOfType +{ + constexpr static const char* v="q31"; + constexpr static const char* xls="q31"; + +}; + +template<> +struct NameOfType +{ + constexpr static const char* v="q15"; + constexpr static const char* xls="q15"; + +}; + +template<> +struct NameOfType +{ + constexpr static const char* v="q7"; + constexpr static const char* xls="q7"; + +}; + +template<> +struct TailForTests +{ + constexpr static const int tail = 1; + constexpr static const int loop = 2; + +}; + +template<> +struct TailForTests +{ + constexpr static const int tail = 3; + constexpr static const int loop = 2*4; +}; + +#if !defined(DISABLEFLOAT16) +template<> +struct TailForTests +{ + constexpr static const int tail = 7; + constexpr static const int loop = 2*8; + +}; +#endif + +template<> +struct TailForTests +{ + constexpr static const int tail = 3; + constexpr static const int loop = 2*4; +}; + +template<> +struct TailForTests +{ + constexpr static const int tail = 7; + constexpr static const int loop = 2*8; +}; + +template<> +struct TailForTests +{ + constexpr static const int tail = 15; + constexpr static const int loop = 2*16; +}; + +#include "common_tests.h" + +#if !defined(DISABLEFLOAT16) +extern void cmsisdsp_add(const float16_t* a, + const float16_t* b, + float16_t* c, + uint32_t l); +#endif + +extern void cmsisdsp_add(const float64_t* a, + const float64_t* b, + float64_t* c, + uint32_t l); + +extern void cmsisdsp_add(const float32_t* a, + const float32_t* b, + float32_t* c, + uint32_t l); + +extern void cmsisdsp_add(const Q31* a, + const Q31* b, + Q31* c, + uint32_t l); + +extern void cmsisdsp_add(const Q15* a, + const Q15* b, + Q15* c, + uint32_t l); + +extern void cmsisdsp_add(const Q7* a, + const Q7* b, + Q7* c, + uint32_t l); + +extern void cmsisdsp_mat_add(const float32_t* a, + const float32_t* b, + float32_t* c, + uint32_t row,uint32_t col); + +#if !defined(DISABLEFLOAT16) +extern void cmsisdsp_mat_add(const float16_t* a, + const float16_t* b, + float16_t* c, + uint32_t row,uint32_t col); +#endif + +extern void cmsisdsp_mat_add(const Q31* a, + const Q31* b, + Q31* c, + uint32_t row,uint32_t col); + +extern void cmsisdsp_mat_add(const Q15* a, + const Q15* b, + Q15* c, + uint32_t row,uint32_t col); + +extern void cmsisdsp_mat_add(const Q7* a, + const Q7* b, + Q7* c, + uint32_t row,uint32_t col); + +#if !defined(DISABLEFLOAT16) +extern void cmsisdsp_dot(const float16_t* a, + const float16_t* b, + float16_t &c, + uint32_t 
l); +#endif + +extern void cmsisdsp_dot(const float64_t* a, + const float64_t* b, + float64_t &c, + uint32_t l); + +extern void cmsisdsp_dot(const float32_t* a, + const float32_t* b, + float32_t &c, + uint32_t l); + +extern void cmsisdsp_dot(const Q31* a, + const Q31* b, + Q<15,48> &c, + uint32_t l); + +extern void cmsisdsp_dot(const Q15* a, + const Q15* b, + Q<33,30> &c, + uint32_t l); + +extern void cmsisdsp_dot(const Q7* a, + const Q7* b, + Q<17,14> &c, + uint32_t l); + +extern void cmsisdsp_dot_expr(const double* a, + const double* b, + const double* c, + const double* d, + double* tmp1, + double* tmp2, + const double scale, + double &r, + uint32_t l); + +extern void cmsisdsp_dot_expr(const float32_t* a, + const float32_t* b, + const float32_t* c, + const float32_t* d, + float32_t* tmp1, + float32_t* tmp2, + const float32_t scale, + float32_t &r, + uint32_t l); + +#if !defined(DISABLEFLOAT16) +extern void cmsisdsp_dot_expr(const float16_t* a, + const float16_t* b, + const float16_t* c, + const float16_t* d, + float16_t* tmp1, + float16_t* tmp2, + const float16_t scale, + float16_t &r, + uint32_t l); +#endif + +extern void cmsisdsp_dot_expr(const Q7* a, + const Q7* b, + const Q7* c, + const Q7* d, + Q7* tmp1, + Q7* tmp2, + const Q7 scale, + Q<17,14> &r, + uint32_t l); + +extern void cmsisdsp_dot_expr(const Q15* a, + const Q15* b, + const Q15* c, + const Q15* d, + Q15* tmp1, + Q15* tmp2, + const Q15 scale, + Q<33,30> &r, + uint32_t l); + +extern void cmsisdsp_dot_expr(const Q31* a, + const Q31* b, + const Q31* c, + const Q31* d, + Q31* tmp1, + Q31* tmp2, + const Q31 scale, + Q<15,48> &r, + uint32_t l); + +extern void cmsisdsp_fir(const arm_fir_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize); + +extern void cmsisdsp_fir(const arm_fir_instance_q7 * S, + const Q7 * pSrc, + Q7 * pDst, + uint32_t blockSize); + +extern void cmsisdsp_fir(const arm_fir_instance_q15 * S, + const Q15 * pSrc, + Q15 * pDst, + uint32_t blockSize); + +extern void cmsisdsp_fir(const arm_fir_instance_q31 * S, + const Q31 * pSrc, + Q31 * pDst, + uint32_t blockSize); + +template +struct CMSISOuter { + static void run(const T *a, + const T *b, + T *res, + const uint32_t r,const uint32_t c) + { + DISABLE_LOOP_UNROLL + for(unsigned int row=0; row +struct CMSISOuter { + static void run(const float32_t *a, + const float32_t *b, + float32_t *res, + const uint32_t r,const uint32_t c) + { + _cmsis_outer(a,b,res,r,c); + } +}; + +#if !defined(DISABLEFLOAT16) +template<> +struct CMSISOuter { + static void run(const float16_t *a, + const float16_t *b, + float16_t *res, + const uint32_t r,const uint32_t c) + { + _cmsis_outer(a,b,res,r,c); + } +}; +#endif + +template<> +struct CMSISOuter { + static void run(const Q31 *a, + const Q31 *b, + Q31 *res, + const uint32_t r,const uint32_t c) + { + _cmsis_outer(a,b,res,r,c); + } +}; + +template<> +struct CMSISOuter { + static void run(const Q15 *a, + const Q15 *b, + Q15 *res, + const uint32_t r,const uint32_t c) + { + _cmsis_outer(a,b,res,r,c); + } +}; + +template<> +struct CMSISOuter { + static void run(const Q7 *a, + const Q7 *b, + Q7 *res, + const uint32_t r,const uint32_t c) + { + _cmsis_outer(a,b,res,r,c); + } +}; + +#endif + +extern void cmsis_init_householder(double *f,const int nb); +extern void cmsis_init_householder(float32_t *f,const int nb); + +#if !defined(DISABLEFLOAT16) +extern void cmsis_init_householder(float16_t *f,const int nb); +#endif + +extern void cmsis_init_qr(double *f,const int r,const int c); +extern void cmsis_init_qr(float32_t 
*f,const int r,const int c); + +#if !defined(DISABLEFLOAT16) +extern void cmsis_init_qr(float16_t *f,const int r,const int c); +#endif + +extern void cmsis_init_cholesky(double *f,const int r,const int c); +extern void cmsis_init_cholesky(float32_t *f,const int r,const int c); + +#if !defined(DISABLEFLOAT16) +extern void cmsis_init_cholesky(float16_t *f,const int r,const int c); +#endif + +extern void cmsis_mat_mult(const arm_matrix_instance_f64* a, + const arm_matrix_instance_f64* b, + arm_matrix_instance_f64 *c, + double *pState); + +extern void cmsis_mat_mult(const arm_matrix_instance_f32* a, + const arm_matrix_instance_f32* b, + arm_matrix_instance_f32 *c, + float32_t *pState); +#if !defined(DISABLEFLOAT16) +extern void cmsis_mat_mult(const arm_matrix_instance_f16* a, + const arm_matrix_instance_f16* b, + arm_matrix_instance_f16 *c, + float16_t *pState); +#endif + +extern void cmsis_mat_mult(const arm_matrix_instance_q7* a, + const arm_matrix_instance_q7* b, + arm_matrix_instance_q7 *c, + q7_t *pState); + +extern void cmsis_mat_mult(const arm_matrix_instance_q15* a, + const arm_matrix_instance_q15* b, + arm_matrix_instance_q15 *c, + q15_t *pState); + +extern void cmsis_mat_mult(const arm_matrix_instance_q31* a, + const arm_matrix_instance_q31* b, + arm_matrix_instance_q31 *c, + q31_t *pState); + +extern void cmsis_mat_trans(const arm_matrix_instance_q7* a, + arm_matrix_instance_q7* b); + +extern void cmsis_mat_trans(const arm_matrix_instance_q15* a, + arm_matrix_instance_q15* b); + +extern void cmsis_mat_trans(const arm_matrix_instance_q31* a, + arm_matrix_instance_q31* b); + +#if !defined(DISABLEFLOAT16) +extern void cmsis_mat_trans(const arm_matrix_instance_f16* a, + arm_matrix_instance_f16* b); +#endif + +extern void cmsis_mat_trans(const arm_matrix_instance_f64* a, + arm_matrix_instance_f64* b); + +extern void cmsis_mat_trans(const arm_matrix_instance_f32* a, + arm_matrix_instance_f32* b); + +extern double cmsis_householder(const double *,double* ,uint32_t); + +extern float32_t cmsis_householder(const float32_t *,float32_t* ,uint32_t); + +#if !defined(DISABLEFLOAT16) +extern float16_t cmsis_householder(const float16_t *,float16_t* ,uint32_t); +#endif + +extern void cmsis_mat_vec_mult( + const arm_matrix_instance_f64 *pSrcMat, + const double *pVec, + double *pDst); + +extern void cmsis_mat_vec_mult( + const arm_matrix_instance_f32 *pSrcMat, + const float32_t *pVec, + float32_t *pDst); + +#if !defined(DISABLEFLOAT16) +extern void cmsis_mat_vec_mult( + const arm_matrix_instance_f16 *pSrcMat, + const float16_t *pVec, + float16_t *pDst); +#endif + +extern void cmsis_mat_vec_mult( + const arm_matrix_instance_q31 *pSrcMat, + const Q31 *pVec, + Q31 *pDst); + +extern void cmsis_mat_vec_mult( + const arm_matrix_instance_q15 *pSrcMat, + const Q15 *pVec, + Q15 *pDst); + +extern void cmsis_mat_vec_mult( + const arm_matrix_instance_q7 *pSrcMat, + const Q7 *pVec, + Q7 *pDst); + +extern arm_status cmsis_qr( + const arm_matrix_instance_f64 * pSrc, + const double threshold, + arm_matrix_instance_f64 * pOutR, + arm_matrix_instance_f64 * pOutQ, + double * pOutTau, + double *pTmpA, + double *pTmpB + ); + +extern arm_status cmsis_qr( + const arm_matrix_instance_f32 * pSrc, + const float32_t threshold, + arm_matrix_instance_f32 * pOutR, + arm_matrix_instance_f32 * pOutQ, + float32_t * pOutTau, + float32_t *pTmpA, + float32_t *pTmpB + ); + +#if !defined(DISABLEFLOAT16) +extern arm_status cmsis_qr( + const arm_matrix_instance_f16 * pSrc, + const float16_t threshold, + arm_matrix_instance_f16 * pOutR, + 
arm_matrix_instance_f16 * pOutQ, + float16_t * pOutTau, + float16_t *pTmpA, + float16_t *pTmpB + ); +#endif + +extern arm_status cmsis_cholesky( + const arm_matrix_instance_f64 * src, + arm_matrix_instance_f64 * dst); + +extern arm_status cmsis_cholesky( + const arm_matrix_instance_f32 * src, + arm_matrix_instance_f32 * dst); + +#if !defined(DISABLEFLOAT16) +extern arm_status cmsis_cholesky( + const arm_matrix_instance_f16 * src, + arm_matrix_instance_f16 * dst); +#endif + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_f64 * src, + const double * a, + const double * b, + const double scalar, + double * tmp, + double * dst); + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_f32 * src, + const float32_t * a, + const float32_t * b, + const float32_t scalar, + float32_t * tmp, + float32_t * dst); + +#if !defined(DISABLEFLOAT16) +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_f16 * src, + const float16_t * a, + const float16_t * b, + const float16_t scalar, + float16_t * tmp, + float16_t * dst); +#endif + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_q31 * src, + const Q31 * a, + const Q31 * b, + const Q31 scalar, + Q31 * tmp, + Q31 * dst); + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_q15 * src, + const Q15 * a, + const Q15 * b, + const Q15 scalar, + Q15 * tmp, + Q15 * dst); + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_q7 * src, + const Q7 * a, + const Q7 * b, + const Q7 scalar, + Q7 * tmp, + Q7 * dst); + +template +struct CMSISMatrixType; + +template<> +struct CMSISMatrixType +{ + typedef arm_matrix_instance_f64 type; + typedef double scalar; +}; + +template<> +struct CMSISMatrixType +{ + typedef arm_matrix_instance_f32 type; + typedef float32_t scalar; +}; + +#if !defined(DISABLEFLOAT16) +template<> +struct CMSISMatrixType +{ + typedef arm_matrix_instance_f16 type; + typedef float16_t scalar; +}; +#endif + +template<> +struct CMSISMatrixType +{ + typedef arm_matrix_instance_q7 type; + typedef q7_t scalar; + +}; + +template<> +struct CMSISMatrixType +{ + typedef arm_matrix_instance_q15 type; + typedef q15_t scalar; + +}; + +template<> +struct CMSISMatrixType +{ + typedef arm_matrix_instance_q31 type; + typedef q31_t scalar; + +}; + +template +struct TestConstant; + +template<> +struct TestConstant +{ + constexpr static double v = 0.2; + constexpr static double small = 0.001; +}; + +template<> +struct TestConstant +{ + constexpr static float v = 0.2f; + constexpr static float small = 0.001f; +}; + +#if !defined(DISABLEFLOAT16) +template<> +struct TestConstant +{ + constexpr static float16_t v = 0.2f; + constexpr static float16_t small = 0.001f; + +}; +#endif + +template<> +struct TestConstant +{ + constexpr static Q7 v = 0.2_q7; + constexpr static Q7 small = 0.001_q7; +}; + + +template<> +struct TestConstant +{ + constexpr static Q15 v = 0.2_q15; + constexpr static Q15 small = 0.001_q15; +}; + +template<> +struct TestConstant +{ + constexpr static Q31 v = 0.2_q31; + constexpr static Q31 small = 0.001_q31; +}; \ No newline at end of file diff --git a/dsppp/tests/cmsisdsp.cpp b/dsppp/tests/cmsisdsp.cpp new file mode 100644 index 000000000..7c6ad0c65 --- /dev/null +++ b/dsppp/tests/cmsisdsp.cpp @@ -0,0 +1,1146 @@ +#include "allocator.h" + +#include +#include +#include + +using namespace arm_cmsis_dsp; + + +#include "dsp/basic_math_functions.h" +#include "dsp/basic_math_functions_f16.h" +#include "dsp/filtering_functions.h" +#include "dsp/matrix_functions.h" +#include 
"dsp/matrix_functions_f16.h" + + +#include "bench.h" + +#if !defined(DISABLEFLOAT16) +void cmsisdsp_add(const float16_t* a, + const float16_t* b, + float16_t* c, + uint32_t l) +{ + + arm_add_f16(a,b,c,l); +}; +#endif + + +void cmsisdsp_add(const float64_t* a, + const float64_t* b, + float64_t* c, + uint32_t l) +{ + + arm_add_f64(a,b,c,l); +}; + + +void cmsisdsp_add(const float32_t* a, + const float32_t* b, + float32_t* c, + uint32_t l) +{ + arm_add_f32(a,b,c,l); +}; + + + + +void cmsisdsp_add(const Q31* a, + const Q31* b, + Q31* c, + uint32_t l) +{ + + arm_add_q31(reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(c),l); +}; + + +void cmsisdsp_add(const Q15* a, + const Q15* b, + Q15* c, + uint32_t l) +{ + + arm_add_q15(reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(c),l); +}; + +void cmsisdsp_add(const Q7* a, + const Q7* b, + Q7* c, + uint32_t l) +{ + + arm_add_q7(reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(c),l); +}; + +#if !defined(DISABLEFLOAT16) +void cmsisdsp_dot(const float16_t* a, + const float16_t* b, + float16_t &c, + uint32_t l) +{ + arm_dot_prod_f16(a,b,l,&c); +}; +#endif + + +void cmsisdsp_dot(const float64_t* a, + const float64_t* b, + float64_t &c, + uint32_t l) +{ + arm_dot_prod_f64(a,b,l,&c); +}; + +void cmsisdsp_dot(const float32_t* a, + const float32_t* b, + float32_t &c, + uint32_t l) +{ + arm_dot_prod_f32(a,b,l,&c); +}; + + + + +void cmsisdsp_dot(const Q31* a, + const Q31* b, + Q<15,48> &c, + uint32_t l) +{ + arm_dot_prod_q31(reinterpret_cast(a), + reinterpret_cast(b),l, + reinterpret_cast(&c)); +}; + + +void cmsisdsp_dot(const Q15* a, + const Q15* b, + Q<33,30> &c, + uint32_t l) +{ + arm_dot_prod_q15(reinterpret_cast(a), + reinterpret_cast(b),l, + reinterpret_cast(&c)); +}; + + +void cmsisdsp_dot(const Q7* a, + const Q7* b, + Q<17,14> &c, + uint32_t l) +{ + arm_dot_prod_q7(reinterpret_cast(a), + reinterpret_cast(b),l, + reinterpret_cast(&c)); +}; + +void cmsisdsp_dot_expr(const double* a, + const double* b, + const double* c, + const double* d, + double* tmp1, + double* tmp2, + const double scale, + double &r, + uint32_t l) +{ + arm_add_f64(a,b,tmp1,l); + arm_scale_f64(tmp1,scale,tmp1,l); + arm_mult_f64(c,d,tmp2,l); + arm_dot_prod_f64(tmp1,tmp2,l,&r); +}; + +void cmsisdsp_dot_expr(const float32_t* a, + const float32_t* b, + const float32_t* c, + const float32_t* d, + float32_t* tmp1, + float32_t* tmp2, + const float32_t scale, + float32_t &r, + uint32_t l) +{ + arm_add_f32(a,b,tmp1,l); + arm_scale_f32(tmp1,scale,tmp1,l); + arm_mult_f32(c,d,tmp2,l); + arm_dot_prod_f32(tmp1,tmp2,l,&r); +}; + +#if !defined(DISABLEFLOAT16) +void cmsisdsp_dot_expr(const float16_t* a, + const float16_t* b, + const float16_t* c, + const float16_t* d, + float16_t* tmp1, + float16_t* tmp2, + const float16_t scale, + float16_t &r, + uint32_t l) +{ + arm_add_f16(a,b,tmp1,l); + arm_scale_f16(tmp1,scale,tmp1,l); + arm_mult_f16(c,d,tmp2,l); + arm_dot_prod_f16(tmp1,tmp2,l,&r); +}; +#endif + +void cmsisdsp_fir(const arm_fir_instance_f32 * S, + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + arm_fir_f32(S,pSrc,pDst,blockSize); +}; + +void cmsisdsp_fir(const arm_fir_instance_q7 * S, + const Q7 * pSrc, + Q7 * pDst, + uint32_t blockSize) +{ + arm_fir_q7(S,reinterpret_cast(pSrc), + reinterpret_cast(pDst),blockSize); +}; + +void cmsisdsp_fir(const arm_fir_instance_q15 * S, + const Q15 * pSrc, + Q15 * pDst, + uint32_t blockSize) +{ + arm_fir_q15(S,reinterpret_cast(pSrc), + reinterpret_cast(pDst),blockSize); +}; + +void cmsisdsp_fir(const 
arm_fir_instance_q31 * S, + const Q31 * pSrc, + Q31 * pDst, + uint32_t blockSize) +{ + arm_fir_q31(S,reinterpret_cast(pSrc), + reinterpret_cast(pDst),blockSize); +}; + + +void cmsisdsp_dot_expr(const Q7* a, + const Q7* b, + const Q7* c, + const Q7* d, + Q7* tmp1, + Q7* tmp2, + const Q7 scale, + Q<17,14> &r, + uint32_t l) +{ + arm_add_q7(reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(tmp1),l); + arm_scale_q7(reinterpret_cast(tmp1),scale.v,0, + reinterpret_cast(tmp1),l); + + + arm_mult_q7(reinterpret_cast(c), + reinterpret_cast(d), + reinterpret_cast(tmp2),l); + + + arm_dot_prod_q7(reinterpret_cast(tmp1), + reinterpret_cast(tmp2),l,&r.v); +}; + +void cmsisdsp_dot_expr(const Q15* a, + const Q15* b, + const Q15* c, + const Q15* d, + Q15* tmp1, + Q15* tmp2, + const Q15 scale, + Q<33,30> &r, + uint32_t l) +{ + arm_add_q15(reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(tmp1),l); + arm_scale_q15(reinterpret_cast(tmp1),scale.v,0, + reinterpret_cast(tmp1),l); + arm_mult_q15(reinterpret_cast(c), + reinterpret_cast(d), + reinterpret_cast(tmp2),l); + arm_dot_prod_q15(reinterpret_cast(tmp1), + reinterpret_cast(tmp2),l,&r.v); +}; + +void cmsisdsp_dot_expr(const Q31* a, + const Q31* b, + const Q31* c, + const Q31* d, + Q31* tmp1, + Q31* tmp2, + const Q31 scale, + Q<15,48> &r, + uint32_t l) +{ + arm_add_q31(reinterpret_cast(a), + reinterpret_cast(b), + reinterpret_cast(tmp1),l); + arm_scale_q31(reinterpret_cast(tmp1),scale.v,0, + reinterpret_cast(tmp1),l); + arm_mult_q31(reinterpret_cast(c), + reinterpret_cast(d), + reinterpret_cast(tmp2),l); + arm_dot_prod_q31(reinterpret_cast(tmp1), + reinterpret_cast(tmp2),l,&r.v); +}; + + +void cmsisdsp_mat_add(const float32_t* a, + const float32_t* b, + float32_t* c, + uint32_t row,uint32_t col) +{ + arm_matrix_instance_f32 srca; + arm_matrix_instance_f32 srcb; + + arm_matrix_instance_f32 dst; + + + srca.numRows = row; + srca.numCols = col; + srca.pData = (float32_t*)a; + + srcb.numRows = row; + srcb.numCols = col; + srcb.pData = (float32_t*)b; + + dst.numRows = row; + dst.numCols = col; + dst.pData = c; + arm_mat_add_f32(&srca,&srcb,&dst); + +} + +#if !defined(DISABLEFLOAT16) +void cmsisdsp_mat_add(const float16_t* a, + const float16_t* b, + float16_t* c, + uint32_t row,uint32_t col) +{ + arm_matrix_instance_f16 srca; + arm_matrix_instance_f16 srcb; + + arm_matrix_instance_f16 dst; + + + srca.numRows = row; + srca.numCols = col; + srca.pData = (float16_t*)a; + + srcb.numRows = row; + srcb.numCols = col; + srcb.pData = (float16_t*)b; + + dst.numRows = row; + dst.numCols = col; + dst.pData = c; + arm_mat_add_f16(&srca,&srcb,&dst); + +} +#endif + +void cmsisdsp_mat_add(const Q31* a, + const Q31* b, + Q31* c, + uint32_t row,uint32_t col) +{ + arm_matrix_instance_q31 srca; + arm_matrix_instance_q31 srcb; + + arm_matrix_instance_q31 dst; + + + srca.numRows = row; + srca.numCols = col; + srca.pData = reinterpret_cast(const_cast(a)); + + srcb.numRows = row; + srcb.numCols = col; + srcb.pData = reinterpret_cast(const_cast(b)); + + dst.numRows = row; + dst.numCols = col; + dst.pData = reinterpret_cast(c); + arm_mat_add_q31(&srca,&srcb,&dst); + +} + +void cmsisdsp_mat_add(const Q15* a, + const Q15* b, + Q15* c, + uint32_t row,uint32_t col) +{ + arm_matrix_instance_q15 srca; + arm_matrix_instance_q15 srcb; + + arm_matrix_instance_q15 dst; + + + srca.numRows = row; + srca.numCols = col; + srca.pData = reinterpret_cast(const_cast(a)); + + srcb.numRows = row; + srcb.numCols = col; + srcb.pData = reinterpret_cast(const_cast(b)); + + dst.numRows = row; 
+ dst.numCols = col; + dst.pData = reinterpret_cast(c); + arm_mat_add_q15(&srca,&srcb,&dst); + +} + + +void cmsisdsp_mat_add(const Q7* a, + const Q7* b, + Q7* c, + uint32_t row,uint32_t col) +{ + (void)a; + (void)b; + (void)c; + (void)row; + (void)col; + // Doing nothing since there is no equivalent CMSIS-DSP + // function + // Required to enable the build + + /* + arm_matrix_instance_q7 srca; + arm_matrix_instance_q7 srcb; + + arm_matrix_instance_q7 dst; + + + srca.numRows = row; + srca.numCols = col; + srca.pData = reinterpret_cast(const_cast(a)); + + srcb.numRows = row; + srcb.numCols = col; + srcb.pData = reinterpret_cast(const_cast(b)); + + dst.numRows = row; + dst.numCols = col; + dst.pData = reinterpret_cast(c); + arm_mat_add_q7(&srca,&srcb,&dst); +*/ +} + +#if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF) +void _cmsis_outer(const float32_t *a, + const float32_t *b, + float32_t *res, + const uint32_t r,const uint32_t c) +{ + for(unsigned int row=0; row(a); + const q31_t *pb = reinterpret_cast(b); + q31_t *pr = reinterpret_cast(res); + for(unsigned int row=0; row(a); + const q15_t *pb = reinterpret_cast(b); + q15_t *pr = reinterpret_cast(res); + for(unsigned int row=0; row(a); + const q7_t *pb = reinterpret_cast(b); + q7_t *pr = reinterpret_cast(res); + for(unsigned int row=0; row(pVec), + reinterpret_cast(pDst)); +} + +void cmsis_mat_vec_mult( + const arm_matrix_instance_q15 *pSrcMat, + const Q15 *pVec, + Q15 *pDst) +{ +arm_mat_vec_mult_q15(pSrcMat, + reinterpret_cast(pVec), + reinterpret_cast(pDst)); +} + +void cmsis_mat_vec_mult( + const arm_matrix_instance_q7 *pSrcMat, + const Q7 *pVec, + Q7 *pDst) +{ +arm_mat_vec_mult_q7(pSrcMat, + reinterpret_cast(pVec), + reinterpret_cast(pDst)); +} + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_f32 * src, + const float32_t * a, + const float32_t * b, + const float32_t scalar, + float32_t * tmp, + float32_t * dst) +{ + arm_scale_f32(b,scalar,tmp,src->numCols); + arm_add_f32(a,tmp,tmp,src->numCols); + arm_mat_vec_mult_f32(src, tmp, dst); +} + +#if !defined(DISABLEFLOAT16) +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_f16 * src, + const float16_t * a, + const float16_t * b, + const float16_t scalar, + float16_t * tmp, + float16_t * dst) +{ + arm_scale_f16(b,scalar,tmp,src->numCols); + arm_add_f16(a,tmp,tmp,src->numCols); + arm_mat_vec_mult_f16(src, tmp, dst); +} +#endif + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_q31 * src, + const Q31 * a, + const Q31 * b, + const Q31 scalar, + Q31 * tmp, + Q31 * dst) +{ + arm_scale_q31(reinterpret_cast(b), + scalar.v,0, + reinterpret_cast(tmp),src->numCols); + arm_add_q31(reinterpret_cast(a), + reinterpret_cast(tmp), + reinterpret_cast(tmp),src->numCols); + arm_mat_vec_mult_q31(src, + reinterpret_cast(tmp), + reinterpret_cast(dst)); +} + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_q15 * src, + const Q15 * a, + const Q15 * b, + const Q15 scalar, + Q15 * tmp, + Q15 * dst) +{ + arm_scale_q15(reinterpret_cast(b), + scalar.v,0, + reinterpret_cast(tmp),src->numCols); + arm_add_q15(reinterpret_cast(a), + reinterpret_cast(tmp), + reinterpret_cast(tmp),src->numCols); + arm_mat_vec_mult_q15(src, + reinterpret_cast(tmp), + reinterpret_cast(dst)); +} + +extern void cmsis_complex_mat_vec( + const arm_matrix_instance_q7 * src, + const Q7 * a, + const Q7 * b, + const Q7 scalar, + Q7 * tmp, + Q7 * dst) +{ + arm_scale_q7(reinterpret_cast(b), + scalar.v,0, + reinterpret_cast(tmp),src->numCols); + arm_add_q7(reinterpret_cast(a), + 
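+    /* The cmsis_complex_mat_vec reference routines (f64/f32/f16/q31/q15/q7)
+       all follow the same pattern, sketched here under the usual CMSIS-DSP
+       semantics of arm_scale / arm_add / arm_mat_vec_mult:
+
+           tmp = a + scalar * b;   // element-wise, length src->numCols
+           dst = src * tmp;        // matrix-vector product
+
+       i.e. presumably the hand-written counterpart of a
+       "matrix times vector expression" test on the C++ side. */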
reinterpret_cast(tmp), + reinterpret_cast(tmp),src->numCols); + arm_mat_vec_mult_q7(src, + reinterpret_cast(tmp), + reinterpret_cast(dst)); +} \ No newline at end of file diff --git a/dsppp/tests/col_test.cpp b/dsppp/tests/col_test.cpp new file mode 100644 index 000000000..f7c2bde49 --- /dev/null +++ b/dsppp/tests/col_test.cpp @@ -0,0 +1,112 @@ +extern "C" { + extern void col_test(); +} + +#include "allocator.h" + +#include +#include +#include + +#include + +#include + + + +#include "dsp/matrix_functions.h" +#include "matrix_utils.h" + +template +static void test() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + + #if defined(STATIC_TEST) + PMat a; + PVector ref; + #else + PMat a(R,C); + PVector ref(R); + #endif + + init_array(a,R*C); + + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + #if defined(STATIC_TEST) + PVector res = copy(a.col(4)); + #else + PVector res = copy(a.col(4)); + #endif + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + for(int i=0;i +void all_col_test() +{ + const int nb_tails = TailForTests::tail; + const int nb_loops = TailForTests::loop; + + title("Col test"); + + test(); + test(); + test(); + test(); + + test(); + test(); + test(); + test(); + test(); + +} + +void col_test() +{ +#if defined(COL_TEST) + #if defined(F64_DT) + all_col_test(); + #endif + #if defined(F32_DT) + all_col_test(); + #endif + #if defined(F16_DT) && !defined(DISABLEFLOAT16) + all_col_test(); + #endif + #if defined(Q31_DT) + all_col_test(); + #endif + #if defined(Q15_DT) + all_col_test(); + #endif + #if defined(Q7_DT) + all_col_test(); + #endif +#endif +} \ No newline at end of file diff --git a/dsppp/tests/common_tests.cpp b/dsppp/tests/common_tests.cpp new file mode 100644 index 000000000..999eb3cf0 --- /dev/null +++ b/dsppp/tests/common_tests.cpp @@ -0,0 +1,48 @@ +#include "allocator.h" +#include +#include +#include + +#include "cmsis_tests.h" + +extern "C" { + void memory_pool_stats(); +} + +#if 0 +template<> +void init_array(Vector_Base &pDst,std::size_t nb) +{ + for(std::size_t i=0;i +bool validate(const float32_t* a, + const float32_t* b, + std::size_t nb, + float abser, + float reler) +{ + for(std::size_t i=0;i +#include "allocator.h" + +using namespace arm_cmsis_dsp; + +#define REL_ERROR (1.0e-6) +#define ABS_ERROR (1.0e-6) +#define ERROR(A,B,AE,RE) ((fabs((A) - (B)) > (AE + RE * fabs((B))))) +#define ERRVAL(VAL,REF,AE,RE) \ + std::cout << "Error = " << fabs(VAL-REF) << "\r\n"; \ + std::cout << "compared to " << (AE + RE * abs((REF))) << "\r\n"; + +/************ + * + * Data types + * + */ + + +#if defined(POOL_ALLOCATOR) + + template + using PVector = Vector; + + template + using PMat = Matrix; + +#else + + template + using PVector = Vector; + + template + using PMat = Matrix; + +#endif + +template +using PView = VectorView; + +template typename A> +void init_array(Vector &pDst,std::size_t nb) +{ + for(std::size_t i=0;i typename A> +void init_array(Vector &pDst,std::size_t nb) +{ + for(std::size_t i=0;i +void init_array(Vector_Base &pDst,std::size_t nb) +{ + for(std::size_t i=0;i typename A> +//void init_array(Vector &pDst,std::size_t nb); + + +//extern template void init_array<>(Vector_Base &pDst,std::size_t nb); + + +template::value,bool>::type = true> +bool validate(const T a, const T b, std::size_t nb,float abser = ABS_ERROR, float reler = REL_ERROR) +{ + for(std::size_t i=0;i>>::is_float) + { + if (ERROR(a[i],b[i],abser,reler) ) + { + std::cout << "Error at:" << i << " ; res=" << a[i] << " ; 
ref=" << b[i] << "\r\n"; + ERRVAL(a[i],b[i],abser,reler); + return(false); + } + } + else + { + if (a[i]!=b[i]) + { + std::cout << "Error at:" << i << " ; res=" << a[i] << " ; ref=" << b[i] << "\r\n"; + return(false); + } + } + } + return(true); +} + +template::value && + !HasMatrixIndexing::value && + IsVector::value && + !HasMatrixIndexing::value,bool>::type = true> +bool validate(const TA &a, const TB &b,float abser = ABS_ERROR, float reler = REL_ERROR) +{ + for(index_t i=0;i::type>::is_float) + { + if (ERROR(a[i],b[i],abser,reler) ) + { + std::cout << "Error at:" << i << " ; res=" << a[i] << " ; ref=" << b[i] << "\r\n"; + ERRVAL(a[i],b[i],abser,reler); + return(false); + } + } + else + { + if (a[i]!=b[i]) + { + std::cout << "Error at:" << i << " ; res=" << a[i] << " ; ref=" << b[i] << "\r\n"; + return(false); + } + } + } + return(true); +} + + +template::value + && !IsVector::value && !HasMatrixIndexing::value,bool>::type = true> +bool validate(const T a, const T b,float abser = ABS_ERROR, float reler = REL_ERROR) +{ + + if constexpr (number_traits>::is_float) + { + if (ERROR(a,b,abser,reler)) + { + std::cout << "Error: res=" << a << " ; ref=" << b << "\r\n"; + ERRVAL(a,b,abser,reler); + return(false); + } + } + else + { + if (a != b ) + { + std::cout << "Error : res=" << a << " ; ref=" << b << "\r\n"; + return(false); + } + } + + return(true); +} + +template::value && + HasMatrixIndexing::value && + number_traits::type>::is_float,bool>::type = true> +bool validate(const MA& a, const MB& b,float abser = ABS_ERROR, float reler = REL_ERROR) +{ + for(index_t row=0;row < a.rows() ; row++) + { + for(index_t col=0;col < a.columns() ; col++) + { + if (ERROR(a(row,col),b(row,col),abser,reler) ) + { + //std::cout << fabs(a(row,col)-b(row,col)) << "\r\n"; + //std::cout << REL_ERROR*fabsf(a(row,col)) << "\r\n"; + //std::cout << a(row,col) << "\r\n"; + //std::cout << b(row,col) << "\r\n"; + + std::cout << "Error at : (" << row << "," << col << ") ; res=" << a(row,col) << " ; ref=" << b(row,col) << "\r\n"; + ERRVAL(a(row,col),b(row,col),abser,reler); + return(false); + } + } + } + return(true); +} + +template::value && + HasMatrixIndexing::value && + number_traits::type>::is_float,bool>::type = true> +bool validateLT(const MA& a, const MB& b,float abser = ABS_ERROR, float reler = REL_ERROR) +{ + for(index_t row=0;row < a.rows() ; row++) + { + for(index_t col=0;col <= row ; col++) + { + if (ERROR(a(row,col),b(row,col),abser,reler) ) + { + //std::cout << fabs(a(row,col)-b(row,col)) << "\r\n"; + //std::cout << REL_ERROR*fabsf(a(row,col)) << "\r\n"; + //std::cout << a(row,col) << "\r\n"; + //std::cout << b(row,col) << "\r\n"; + + std::cout << "Error at : (" << row << "," << col << ") ; res=" << a(row,col) << " ; ref=" << b(row,col) << "\r\n"; + ERRVAL(a(row,col),b(row,col),abser,reler); + return(false); + } + } + } + return(true); +} + +template::value && + HasMatrixIndexing::value && + number_traits::type>::is_fixed,bool>::type = true> +bool validate(const MA& a, const MB& b,float abser = ABS_ERROR, float reler = REL_ERROR) +{ + (void)abser; + (void)reler; + for(index_t row=0;row < a.rows() ; row++) + { + for(index_t col=0;col < a.columns() ; col++) + { + if (a(row,col).v != b(row,col).v) + { + std::cout << "Error at : (" << row << "," << col << ") ; res=" << a(row,col) << " ; ref=" << b(row,col) << "\r\n"; + std::cout << "Error = " << abs(a(row,col).v - b(row,col).v) << "\r\n"; + return(false); + } + } + } + return(true); +} + +template<> +bool validate(const float32_t* a, const float32_t* b, 
std::size_t nb,float abser , float reler ); + + +extern template +bool validate<>(const float32_t* a, const float32_t* b, std::size_t nb,float abser , float reler ); + + + + + +template +void title(const std::string &s) +{ +#if !defined(SERIAL_DUMP) +#if defined(STATIC_TEST) + std::cout<<"\r\n\033[31;1;4m" << s << " " << NameOfType::xls << "\033[0m\r\n"; +#else + std::cout<<"\r\n\033[31;1;4m" << s << " dynamic " << NameOfType::xls << "\033[0m\r\n"; +#endif +#else +#if defined(STATIC_TEST) + std::cout << "\r\n" << s << " " << NameOfType::xls << "\r\n"; +#else + std::cout << "\r\n" << s << " dynamic " << NameOfType::xls << "\r\n"; +#endif +#endif +}; \ No newline at end of file diff --git a/dsppp/tests/debug_mat.h b/dsppp/tests/debug_mat.h new file mode 100644 index 000000000..a0d7c707e --- /dev/null +++ b/dsppp/tests/debug_mat.h @@ -0,0 +1,738 @@ +void pmat(float32_t *p,int nbrows,int nbcols) +{ + for(int r=0;rnumRows < pSrc->numCols) + { + return(ARM_MATH_SIZE_MISMATCH); + } + + memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t)); + pOutR->numCols = pSrc->numCols; + pOutR->numRows = pSrc->numRows; + + p = pOutR->pData; + + pc = pOutTau; + for(col=0 ; col < pSrc->numCols; col++) + { + int32_t j,k,blkCnt,blkCnt2; + float32_t *pa0,*pa1,*pa2,*pa3,*ptemp; + float32_t temp; + float32x4_t v1,v2,vtemp; + + + COPY_COL_F32(pOutR,col,col,pTmpA); + + beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA); + *pc++ = beta; + + //pvec(pTmpA,pSrc->numRows-col); + //pmat(p,pSrc->numRows-col,pSrc->numCols-col); + + pdst = pTmpB; + + /* v.T A(col:,col:) -> tmpb */ + pv = pTmpA; + pa = p; + + temp = *pv; + blkCnt = (pSrc->numCols-col) >> 2; + while (blkCnt > 0) + { + v1 = vld1q_f32(pa); + v2 = vmulq_n_f32(v1,temp); + vst1q_f32(pdst,v2); + + pa += 4; + pdst += 4; + blkCnt--; + } + blkCnt = (pSrc->numCols-col) & 3; + if (blkCnt > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt); + v1 = vld1q_f32(pa); + v2 = vmulq_n_f32(v1,temp); + vst1q_p_f32(pdst,v2,p0); + + pa += blkCnt; + } + + + + pa += col; + pv++; + pdst = pTmpB; + + pa0 = pa; + pa1 = pa0 + pSrc->numCols; + pa2 = pa1 + pSrc->numCols; + pa3 = pa2 + pSrc->numCols; + + /* Unrolled loop */ + blkCnt = (pSrc->numRows-col - 1) >> 2; + k=1; + while(blkCnt > 0) + { + vtemp=vld1q_f32(pv); + + blkCnt2 = (pSrc->numCols-col) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pdst); + + v2 = vld1q_f32(pa0); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0)); + + v2 = vld1q_f32(pa1); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1)); + + v2 = vld1q_f32(pa2); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2)); + + v2 = vld1q_f32(pa3); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3)); + + vst1q_f32(pdst,v1); + + pdst += 4; + pa0 += 4; + pa1 += 4; + pa2 += 4; + pa3 += 4; + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pdst); + + v2 = vld1q_f32(pa0); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0)); + + v2 = vld1q_f32(pa1); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1)); + + v2 = vld1q_f32(pa2); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2)); + + v2 = vld1q_f32(pa3); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3)); + + vst1q_p_f32(pdst,v1,p0); + + pa0 += blkCnt2; + pa1 += blkCnt2; + pa2 += blkCnt2; + pa3 += blkCnt2; + } + + pa0 += col + 3*pSrc->numCols; + pa1 += col + 3*pSrc->numCols; + pa2 += col + 3*pSrc->numCols; + pa3 += col + 3*pSrc->numCols; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-col; k++) + { + temp = *pv; + blkCnt2 = 
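+         /* Reading aid: each column iteration applies the Householder
+            reflector (v, beta) returned by arm_householder_f32 to the
+            trailing sub-matrix,
+
+                A(col:, col:) <- (I - beta*v*v^T) * A(col:, col:)
+                              =  A(col:, col:) - beta * v * (v^T A(col:, col:))
+
+            pTmpB holds v^T A, and the predicated loads/stores
+            (vctp32q + vld1q_f32 / vst1q_p_f32) cover the last 1..3
+            columns without a scalar tail loop. */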
(pSrc->numCols-col) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pa); + v2 = vld1q_f32(pdst); + v2 = vfmaq_n_f32(v2,v1,temp); + vst1q_f32(pdst,v2); + + pa += 4; + pdst += 4; + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + v1 = vld1q_f32(pa); + v2 = vld1q_f32(pdst); + v2 = vfmaq_n_f32(v2,v1,temp); + vst1q_p_f32(pdst,v2,p0); + + pa += blkCnt2; + } + + pa += col; + pv++; + pdst = pTmpB; + } + + //pvec(pTmpB,pSrc->numCols-col); + //printf("--\r\n"); + + /* A(col:,col:) - beta v tmpb */ + pa = p; + for(j=0;jnumRows-col; j++) + { + float32_t f = -beta * pTmpA[j]; + ptemp = pTmpB; + + blkCnt2 = (pSrc->numCols-col) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pa); + v2 = vld1q_f32(ptemp); + v1 = vfmaq_n_f32(v1,v2,f); + vst1q_f32(pa,v1); + + pa += 4; + ptemp += 4; + + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pa); + v2 = vld1q_f32(ptemp); + v1 = vfmaq_n_f32(v1,v2,f); + vst1q_p_f32(pa,v1,p0); + + pa += blkCnt2; + } + + pa += col; + } + + /* Copy Householder reflectors into R matrix */ + pa = p + pOutR->numCols; + for(k=0;knumRows-col-1; k++) + { + *pa = pTmpA[k+1]; + pa += pOutR->numCols; + } + + p += 1 + pOutR->numCols; + } + + /* Generate Q if requested by user matrix */ + + if (pOutQ != NULL) + { + /* Initialize Q matrix to identity */ + memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows); + + pa = pOutQ->pData; + for(col=0 ; col < pOutQ->numCols; col++) + { + *pa = 1.0f; + pa += pOutQ->numCols+1; + } + + nb = pOutQ->numRows - pOutQ->numCols + 1; + + pc = pOutTau + pOutQ->numCols - 1; + for(col=0 ; col < pOutQ->numCols; col++) + { + int32_t j,k, blkCnt, blkCnt2; + float32_t *pa0,*pa1,*pa2,*pa3,*ptemp; + float32_t temp; + float32x4_t v1,v2,vtemp; + + pos = pSrc->numRows - nb; + p = pOutQ->pData + pos + pOutQ->numCols*pos ; + + + COPY_COL_F32(pOutR,pos,pos,pTmpA); + pTmpA[0] = 1.0f; + pdst = pTmpB; + + /* v.T A(col:,col:) -> tmpb */ + + pv = pTmpA; + pa = p; + + temp = *pv; + blkCnt2 = (pOutQ->numRows-pos) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pa); + v1 = vmulq_n_f32(v1, temp); + vst1q_f32(pdst,v1); + + pa += 4; + pdst += 4; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pa); + v1 = vmulq_n_f32(v1, temp); + vst1q_p_f32(pdst,v1,p0); + + pa += blkCnt2; + } + + pa += pos; + pv++; + pdst = pTmpB; + pa0 = pa; + pa1 = pa0 + pOutQ->numRows; + pa2 = pa1 + pOutQ->numRows; + pa3 = pa2 + pOutQ->numRows; + + /* Unrolled loop */ + blkCnt = (pOutQ->numRows-pos - 1) >> 2; + k=1; + while(blkCnt > 0) + { + + vtemp = vld1q_f32(pv); + blkCnt2 = (pOutQ->numRows-pos) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pdst); + + v2 = vld1q_f32(pa0); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0)); + + v2 = vld1q_f32(pa1); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1)); + + v2 = vld1q_f32(pa2); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2)); + + v2 = vld1q_f32(pa3); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3)); + + vst1q_f32(pdst,v1); + + pa0 += 4; + pa1 += 4; + pa2 += 4; + pa3 += 4; + pdst += 4; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pdst); + + v2 = vld1q_f32(pa0); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0)); + + v2 = vld1q_f32(pa1); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1)); + + v2 = vld1q_f32(pa2); + v1 = vfmaq_n_f32(v1, v2, 
vgetq_lane(vtemp,2)); + + v2 = vld1q_f32(pa3); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3)); + + vst1q_p_f32(pdst,v1,p0); + + pa0 += blkCnt2; + pa1 += blkCnt2; + pa2 += blkCnt2; + pa3 += blkCnt2; + + } + + pa0 += pos + 3*pOutQ->numRows; + pa1 += pos + 3*pOutQ->numRows; + pa2 += pos + 3*pOutQ->numRows; + pa3 += pos + 3*pOutQ->numRows; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-pos; k++) + { + temp = *pv; + blkCnt2 = (pOutQ->numRows-pos) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pdst); + v2 = vld1q_f32(pa); + v1 = vfmaq_n_f32(v1, v2, temp); + vst1q_f32(pdst,v1); + + pdst += 4; + pa += 4; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + v1 = vld1q_f32(pdst); + v2 = vld1q_f32(pa); + v1 = vfmaq_n_f32(v1, v2, temp); + vst1q_p_f32(pdst,v1,p0); + + pa += blkCnt2; + } + + pa += pos; + pv++; + pdst = pTmpB; + } + + pa = p; + beta = *pc--; + for(j=0;jnumRows-pos; j++) + { + float32_t f = -beta * pTmpA[j]; + ptemp = pTmpB; + + blkCnt2 = (pOutQ->numCols-pos) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pa); + v2 = vld1q_f32(ptemp); + v1 = vfmaq_n_f32(v1,v2,f); + vst1q_f32(pa,v1); + + pa += 4; + ptemp += 4; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numCols-pos) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pa); + v2 = vld1q_f32(ptemp); + v1 = vfmaq_n_f32(v1,v2,f); + vst1q_p_f32(pa,v1,p0); + + pa += blkCnt2; + } + + pa += pos; + } + + + nb++; + } + } + + arm_status status = ARM_MATH_SUCCESS; + /* Return to application */ + return (status); +} + +#endif /*#if !defined(ARM_MATH_MVEF)*/ + + +#endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/ + + + +#if (!defined(ARM_MATH_MVEF)) || defined(ARM_MATH_AUTOVECTORIZE) + +arm_status _arm_mat_qr_f32( + const arm_matrix_instance_f32 * pSrc, + const float32_t threshold, + arm_matrix_instance_f32 * pOutR, + arm_matrix_instance_f32 * pOutQ, + float32_t * pOutTau, + float32_t *pTmpA, + float32_t *pTmpB + ) + +{ + int32_t col=0; + int32_t nb,pos; + float32_t *pa,*pc; + float32_t beta; + float32_t *pv; + float32_t *pdst; + float32_t *p; + + if (pSrc->numRows < pSrc->numCols) + { + return(ARM_MATH_SIZE_MISMATCH); + } + + memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t)); + pOutR->numCols = pSrc->numCols; + pOutR->numRows = pSrc->numRows; + + p = pOutR->pData; + + pc = pOutTau; + for(col=0 ; col < pSrc->numCols; col++) + { + int32_t i,j,k,blkCnt; + float32_t *pa0,*pa1,*pa2,*pa3; + COPY_COL_F32(pOutR,col,col,pTmpA); + + beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA); + *pc++ = beta; + + pdst = pTmpB; + + /* v.T A(col:,col:) -> tmpb */ + pv = pTmpA; + pa = p; + for(j=0;jnumCols-col; j++) + { + *pdst++ = *pv * *pa++; + } + pa += col; + pv++; + pdst = pTmpB; + + pa0 = pa; + pa1 = pa0 + pSrc->numCols; + pa2 = pa1 + pSrc->numCols; + pa3 = pa2 + pSrc->numCols; + + /* Unrolled loop */ + blkCnt = (pSrc->numRows-col - 1) >> 2; + k=1; + while(blkCnt > 0) + { + float32_t sum; + + for(j=0;jnumCols-col; j++) + { + sum = *pdst; + + sum += pv[0] * *pa0++; + sum += pv[1] * *pa1++; + sum += pv[2] * *pa2++; + sum += pv[3] * *pa3++; + + *pdst++ = sum; + } + pa0 += col + 3*pSrc->numCols; + pa1 += col + 3*pSrc->numCols; + pa2 += col + 3*pSrc->numCols; + pa3 += col + 3*pSrc->numCols; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-col; k++) + { + for(j=0;jnumCols-col; j++) + { + *pdst++ += *pv * *pa++; + } + pa += col; + pv++; + pdst = pTmpB; + } + + /* 
A(col:,col:) - beta v tmpb */ + pa = p; + for(j=0;jnumRows-col; j++) + { + float32_t f = beta * pTmpA[j]; + + for(i=0;inumCols-col; i++) + { + *pa = *pa - f * pTmpB[i] ; + pa++; + } + pa += col; + } + + /* Copy Householder reflectors into R matrix */ + pa = p + pOutR->numCols; + for(k=0;knumRows-col-1; k++) + { + *pa = pTmpA[k+1]; + pa += pOutR->numCols; + } + + p += 1 + pOutR->numCols; + } + + /* Generate Q if requested by user matrix */ + + if (pOutQ != NULL) + { + /* Initialize Q matrix to identity */ + memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows); + + pa = pOutQ->pData; + for(col=0 ; col < pOutQ->numCols; col++) + { + *pa = 1.0f; + pa += pOutQ->numCols+1; + } + + nb = pOutQ->numRows - pOutQ->numCols + 1; + + pc = pOutTau + pOutQ->numCols - 1; + for(col=0 ; col < pOutQ->numCols; col++) + { + int32_t i,j,k, blkCnt; + float32_t *pa0,*pa1,*pa2,*pa3; + pos = pSrc->numRows - nb; + p = pOutQ->pData + pos + pOutQ->numCols*pos ; + + + COPY_COL_F32(pOutR,pos,pos,pTmpA); + pTmpA[0] = 1.0f; + pdst = pTmpB; + + /* v.T A(col:,col:) -> tmpb */ + + pv = pTmpA; + pa = p; + for(j=0;jnumRows-pos; j++) + { + *pdst++ = *pv * *pa++; + } + pa += pos; + pv++; + pdst = pTmpB; + pa0 = pa; + pa1 = pa0 + pOutQ->numRows; + pa2 = pa1 + pOutQ->numRows; + pa3 = pa2 + pOutQ->numRows; + + /* Unrolled loop */ + blkCnt = (pOutQ->numRows-pos - 1) >> 2; + k=1; + while(blkCnt > 0) + { + float32_t sum; + + for(j=0;jnumRows-pos; j++) + { + sum = *pdst; + + sum += pv[0] * *pa0++; + sum += pv[1] * *pa1++; + sum += pv[2] * *pa2++; + sum += pv[3] * *pa3++; + + *pdst++ = sum; + } + pa0 += pos + 3*pOutQ->numRows; + pa1 += pos + 3*pOutQ->numRows; + pa2 += pos + 3*pOutQ->numRows; + pa3 += pos + 3*pOutQ->numRows; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-pos; k++) + { + for(j=0;jnumRows-pos; j++) + { + *pdst++ += *pv * *pa++; + } + pa += pos; + pv++; + pdst = pTmpB; + } + + pa = p; + beta = *pc--; + for(j=0;jnumRows-pos; j++) + { + float32_t f = beta * pTmpA[j]; + + for(i=0;inumCols-pos; i++) + { + *pa = *pa - f * pTmpB[i] ; + pa++; + } + pa += pos; + } + + + nb++; + } + } + + arm_status status = ARM_MATH_SUCCESS; + /* Return to application */ + return (status); +} + +#endif /* end of test for Helium or Neon availability */ diff --git a/dsppp/tests/debug_test.cpp b/dsppp/tests/debug_test.cpp new file mode 100644 index 000000000..ed0101528 --- /dev/null +++ b/dsppp/tests/debug_test.cpp @@ -0,0 +1,45 @@ +extern "C" { + extern void debug_test(); +} + +#include "allocator.h" + +#include +#include + +#include + +#include +#include "dsp/basic_math_functions.h" + + +using namespace arm_cmsis_dsp; + + + +extern Q15 external_debug(const PVector &a0, + const PVector &a1, + const PVector &a2, + const PVector &a3, + const PVector &b, + int l); + +template +static void test() +{ + + PrintType>(); +} + +void debug_test() +{ + title("Debug test"); + + + + test(); + + + + +} \ No newline at end of file diff --git a/dsppp/tests/debug_test_external.cpp b/dsppp/tests/debug_test_external.cpp new file mode 100644 index 000000000..795aaa53f --- /dev/null +++ b/dsppp/tests/debug_test_external.cpp @@ -0,0 +1,56 @@ +#include "allocator.h" + +#include +#include + +#include +#include + + + +#if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF) +Q15 external_debug(const PVector &a0, + const PVector &a1, + const PVector &a2, + const PVector &a3, + const PVector &b, + int l) +{ + int nb = l; + Q<33,30> acc0; + Q<33,30> acc1; + Q<33,30> acc2; + Q<33,30> acc3; + for(index_t i=0; i::mk(nb-i)); + acc1 = 
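+              /* All four accumulators consume the same slice of b: each pass
+                 multiply-accumulates one vector (or, on the last pass, a
+                 partial vector) of a0..a3 against b, the vctpq::mk(nb-i)
+                 predicate masking the lanes past the end of the data so no
+                 scalar tail loop is needed. from_accumulator() then narrows
+                 the wide Q<33,30> sums back to Q15. */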
inner::vmacc(acc1,a1.vector_op_tail(i,nb-i),b.vector_op_tail(i,nb-i),inner::vctpq::mk(nb-i)); + acc2 = inner::vmacc(acc2,a2.vector_op_tail(i,nb-i),b.vector_op_tail(i,nb-i),inner::vctpq::mk(nb-i)); + acc3 = inner::vmacc(acc3,a3.vector_op_tail(i,nb-i),b.vector_op_tail(i,nb-i),inner::vctpq::mk(nb-i)); + } + Q15 r0,r1,r2,r3; + + r0 = inner::from_accumulator(acc0); + r1 = inner::from_accumulator(acc1); + r2 = inner::from_accumulator(acc2); + r3 = inner::from_accumulator(acc3); + + return(r0+r1+r2+r3); +} +#else +Q15 external_debug(const PVector &a0, + const PVector &a1, + const PVector &a2, + const PVector &a3, + const PVector &b, + int l) +{ + (void)a0; + (void)a1; + (void)a2; + (void)a3; + (void)b; + (void)l; + return(a0[0]); +} +#endif \ No newline at end of file diff --git a/dsppp/tests/dot_test.cpp b/dsppp/tests/dot_test.cpp new file mode 100644 index 000000000..53d878c7b --- /dev/null +++ b/dsppp/tests/dot_test.cpp @@ -0,0 +1,213 @@ +extern "C" { + extern void dot_test(); +} + +#include "allocator.h" + +#include +#include +#include + +#include + +#include + +#include "dsp/basic_math_functions.h" +#include "dsp/basic_math_functions_f16.h" + + + + + +template +static void complex_test(const T scale) +{ + std::cout << "----\r\n" << "N = " << NB << "\r\n"; + #if defined(STATIC_TEST) + PVector a; + PVector b; + PVector c; + PVector d; + + PVector res; + #else + PVector a(NB); + PVector b(NB); + PVector c(NB); + PVector d(NB); + + PVector res(NB); + #endif + + + init_array(a,NB); + init_array(b,NB); + init_array(c,NB); + init_array(d,NB); + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + O result = dot(scale*(a+b),c*d); + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + O ref; + PVector tmp1; + PVector tmp2; + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + cmsisdsp_dot_expr(a.const_ptr(), + b.const_ptr(), + c.const_ptr(), + d.const_ptr(), + tmp1.ptr(), + tmp2.ptr(), + scale, + ref,NB); + STOP_CYCLE_MEASUREMENT; + + if (!validate(result,ref)) + { + printf("dot expr failed \r\n"); + + } + + std::cout << "=====\r\n"; + +} + + +template +static void test() +{ + std::cout << "----\r\n" << "N = " << NB << "\r\n"; + #if defined(STATIC_TEST) + PVector a; + PVector b; + + PVector res; + #else + PVector a(NB); + PVector b(NB); + + PVector res(NB); + #endif + + init_array(a,NB); + init_array(b,NB); + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + O result = dot(a,b); + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + + O ref; + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + cmsisdsp_dot(a.const_ptr(),b.const_ptr(),ref,NB); + STOP_CYCLE_MEASUREMENT; + + if (!validate(result,ref)) + { + printf("dot failed \r\n"); + + } + + std::cout << "=====\r\n"; + +} + + +template +void all_dot_test() +{ + + const int nb_tails = TailForTests::tail; + const int nb_loops = TailForTests::loop; + + using ACC = typename number_traits::accumulator; + constexpr auto v = TestConstant::v; + + title("Dot product"); + + + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + test(); + if constexpr (!std::is_same::value) + { + test(); + } + + test(); + test(); + test(); + test(); + test(); + + + title("Dot product with expressions"); + + + complex_test(v); + complex_test(v); + complex_test(v); + complex_test(v); + complex_test(v); + complex_test(v); + + complex_test(v); + + complex_test(v); + complex_test(v); + complex_test(v); + if constexpr (!std::is_same::value) + { + complex_test(v); + } + + complex_test(v); + complex_test(v); + complex_test(v); + 
complex_test(v); + complex_test(v); + + //print_map("Stats",max_stats); + +} + +void dot_test() +{ +#if defined(DOT_TEST) + #if defined(F64_DT) + all_dot_test(); + #endif + #if defined(F32_DT) + all_dot_test(); + #endif + #if defined(F16_DT) && !defined(DISABLEFLOAT16) + all_dot_test(); + #endif + #if defined(Q31_DT) + all_dot_test(); + #endif + #if defined(Q15_DT) + all_dot_test(); + #endif + #if defined(Q7_DT) + all_dot_test(); + #endif +#endif +} diff --git a/dsppp/tests/filter_test.cpp b/dsppp/tests/filter_test.cpp new file mode 100644 index 000000000..da5bea2a4 --- /dev/null +++ b/dsppp/tests/filter_test.cpp @@ -0,0 +1,657 @@ +extern "C" { + extern void filter_test(); +} + +#include "allocator.h" + +#include +#include +#include +#include + +#include + +#include + + +#if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF) + +#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff) + + +#define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs) \ + for (int j = 0; j < nbAcc; j++) { \ + const q15_t *pSmp = &pSample[j]; \ + q63_t acc[4]; \ + \ + acc[j] = 0; \ + for (int i = 0; i < nbVecTaps; i++) { \ + vecIn0 = vld1q(pSmp + 8 * i); \ + acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]); \ + } \ + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15); \ + } + +#define FIR_Q15_MAIN_CORE() \ +{ \ + q15_t *pState = S->pState; /* State pointer */ \ + const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \ + q15_t *pStateCur; /* Points to the current sample of the state */ \ + const q15_t *pSamples; /* Temporary pointer to the sample buffer */ \ + q15_t *pOutput; /* Temporary pointer to the output buffer */ \ + const q15_t *pTempSrc; /* Temporary pointer to the source data */ \ + q15_t *pTempDest; /* Temporary pointer to the destination buffer */\ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\ + int32_t blkCnt; \ + q15x8_t vecIn0; \ + \ + /* \ + * load coefs \ + */ \ + q15x8_t vecCoeffs[NBVECTAPS]; \ + \ + for (int i = 0; i < NBVECTAPS; i++) \ + vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i); \ + \ + /* \ + * pState points to state array which contains previous frame (numTaps - 1) samples \ + * pStateCur points to the location where the new input data should be written \ + */ \ + pStateCur = &(pState[(numTaps - 1u)]); \ + pTempSrc = pSrc; \ + pSamples = pState; \ + pOutput = pDst; \ + \ + blkCnt = blockSize >> 2; \ + while (blkCnt > 0) { \ + /* \ + * Save 4 input samples in the history buffer \ + */ \ + vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc)); \ + pStateCur += 4; \ + pTempSrc += 4; \ + \ + FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs); \ + pSamples += 4; \ + \ + blkCnt--; \ + } \ + \ + /* tail */ \ + int32_t residual = blockSize & 3; \ + \ + for (int i = 0; i < residual; i++) \ + *pStateCur++ = *pTempSrc++; \ + \ + FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs); \ + \ + /* \ + * Copy the samples back into the history buffer start \ + */ \ + pTempSrc = &pState[blockSize]; \ + pTempDest = pState; \ + \ + /* current compiler limitation */ \ + blkCnt = (numTaps - 1) >> 3; \ + while (blkCnt > 0) \ + { \ + vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc)); \ + pTempSrc += 8; \ + pTempDest += 8; \ + blkCnt--; \ + } \ + blkCnt = (numTaps - 1) & 7; \ + if (blkCnt > 0) \ + { \ + mve_pred16_t p = vctp16q(blkCnt); \ + vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p); \ + } \ +} + +static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S, + const q15_t * __restrict pSrc, + q15_t * __restrict pDst, 
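+/* Shape of the fixed-size kernels below: NBTAPS is known at compile time and
+   NBVECTAPS = NBTAPS/8 coefficient vectors are kept live, so FIR_Q15_CORE
+   computes each output roughly as
+
+       acc = sum_i vmlaldavaq(samples[8*i .. 8*i+7], vecCoeffs[i])  // 64-bit
+       out = saturate_q15(acc >> 15)                                // MVE_ASRL_SAT16
+
+   i.e. a widening multiply-accumulate across lanes followed by a saturating
+   right shift back to q15. */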
uint32_t blockSize) +{ + #define NBTAPS 32 + #define NBVECTAPS (NBTAPS / 8) + FIR_Q15_MAIN_CORE(); + #undef NBVECTAPS + #undef NBTAPS +} + +static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S, + const q15_t * __restrict pSrc, + q15_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 24 + #define NBVECTAPS (NBTAPS / 8) + FIR_Q15_MAIN_CORE(); + #undef NBVECTAPS + #undef NBTAPS +} + + +static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S, + const q15_t * __restrict pSrc, + q15_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 16 + #define NBVECTAPS (NBTAPS / 8) + FIR_Q15_MAIN_CORE(); + #undef NBVECTAPS + #undef NBTAPS +} + +static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, + const q15_t * __restrict pSrc, + q15_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 8 + #define NBVECTAPS (NBTAPS / 8) + FIR_Q15_MAIN_CORE(); + #undef NBVECTAPS + #undef NBTAPS +} + + +void debug_arm_fir_q15( + const arm_fir_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCur; /* Points to the current sample of the state */ + const q15_t *pSamples; /* Temporary pointer to the sample buffer */ + q15_t *pOutput; /* Temporary pointer to the output buffer */ + const q15_t *pTempSrc; /* Temporary pointer to the source data */ + q15_t *pTempDest; /* Temporary pointer to the destination buffer */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t blkCnt; + q15x8_t vecIn0; + uint32_t tapsBlkCnt = (numTaps + 7) / 8; + q63_t acc0, acc1, acc2, acc3; + + +int32_t nbTaps = (numTaps + 7) >> 3; + +switch(nbTaps) { + + case 1: + arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize); + return; + case 2: + arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize); + return; + case 3: + arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize); + return; + case 4: + arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize); + return; + } + /* + * pState points to state array which contains previous frame (numTaps - 1) samples + * pStateCur points to the location where the new input data should be written + */ + pStateCur = &(pState[(numTaps - 1u)]); + pTempSrc = pSrc; + pSamples = pState; + pOutput = pDst; + blkCnt = blockSize >> 2; + + while (blkCnt > 0U) + { + const q15_t *pCoeffsTmp = pCoeffs; + const q15_t *pSamplesTmp = pSamples; + + acc0 = 0LL; + acc1 = 0LL; + acc2 = 0LL; + acc3 = 0LL; + + /* + * Save 8 input samples in the history buffer + */ + vst1q(pStateCur, vld1q(pTempSrc)); + pStateCur += 8; + pTempSrc += 8; + + //INIT_SYSTICK; + //START_CYCLE_MEASUREMENT; + int i = tapsBlkCnt; + //startSectionNB(3); + while (i > 0) + { + /* + * load 8 coefs + */ + q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; + + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[1]); + acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[2]); + acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[3]); + acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs); + + pSamplesTmp += 8; + pCoeffsTmp += 8; + /* + * Decrement the taps block loop counter + */ + i--; + } + //stopSectionNB(3); + //STOP_CYCLE_MEASUREMENT; + + + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15); + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15); + + pSamples += 4; + /* + * 
Decrement the sample block loop counter + */ + blkCnt--; + } + + uint32_t residual = blockSize & 3; + switch (residual) + { + case 3: + { + const q15_t *pCoeffsTmp = pCoeffs; + const q15_t *pSamplesTmp = pSamples; + + acc0 = 0LL; + acc1 = 0LL; + acc2 = 0LL; + + /* + * Save 8 input samples in the history buffer + */ + *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc; + pStateCur += 8; + pTempSrc += 8; + + int i = tapsBlkCnt; + while (i > 0) + { + /* + * load 8 coefs + */ + q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; + + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[2]); + acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[4]); + acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); + + pSamplesTmp += 8; + pCoeffsTmp += 8; + /* + * Decrement the taps block loop counter + */ + i--; + } + + acc0 = asrl(acc0, 15); + acc1 = asrl(acc1, 15); + acc2 = asrl(acc2, 15); + + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15); + } + break; + + case 2: + { + const q15_t *pCoeffsTmp = pCoeffs; + const q15_t *pSamplesTmp = pSamples; + + acc0 = 0LL; + acc1 = 0LL; + /* + * Save 8 input samples in the history buffer + */ + vst1q(pStateCur, vld1q(pTempSrc)); + pStateCur += 8; + pTempSrc += 8; + + int i = tapsBlkCnt; + while (i > 0) + { + /* + * load 8 coefs + */ + q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; + + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[2]); + acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); + + pSamplesTmp += 8; + pCoeffsTmp += 8; + /* + * Decrement the taps block loop counter + */ + i--; + } + + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); + } + break; + + case 1: + { + const q15_t *pCoeffsTmp = pCoeffs; + const q15_t *pSamplesTmp = pSamples; + + acc0 = 0LL; + + /* + * Save 8 input samples in the history buffer + */ + vst1q(pStateCur, vld1q(pTempSrc)); + pStateCur += 8; + pTempSrc += 8; + + int i = tapsBlkCnt; + while (i > 0) + { + /* + * load 8 coefs + */ + q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; + + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); + + pSamplesTmp += 8; + pCoeffsTmp += 8; + /* + * Decrement the taps block loop counter + */ + i--; + } + + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); + } + break; + + } + + /* + * Copy the samples back into the history buffer start + */ + pTempSrc = &pState[blockSize]; + pTempDest = pState; + + blkCnt = numTaps >> 3; + while (blkCnt > 0U) + { + vst1q(pTempDest, vld1q(pTempSrc)); + pTempSrc += 8; + pTempDest += 8; + blkCnt--; + } + blkCnt = numTaps & 7; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp16q(blkCnt); + vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0); + } +} +#endif + +template +struct FirType; + +template<> +struct FirType +{ + typedef arm_fir_instance_f32 type; + static void init_state(type * S, + uint16_t numTaps, + const float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize) + { + arm_fir_init_f32(S,numTaps,pCoeffs,pState,blockSize); + }; + + static void init_coef(float32_t *coefs,uint16_t numTaps) + { + for(int i=0;i +struct FirType +{ + typedef arm_fir_instance_q15 type; + static void init_state(type * S, + uint16_t numTaps, + const Q15 * pCoeffs, + Q15 * pState, + uint32_t blockSize) + { + arm_fir_init_q15(S,numTaps, + reinterpret_cast(pCoeffs), + reinterpret_cast(pState),blockSize); + }; + + static void 
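+    /* The FIR<T, BLOCK, TAPS> wrapper further below keeps a state vector of
+       TAPS-1+BLOCK samples. For each group of UNROLL_FACTOR (4) outputs it
+       copies the 4 new input samples into the state, then evaluates four
+       sliding-window dot products against the same coefficient vector in one
+       expression,
+
+           results(accu) = dot(unroll([i,this](index_t k)
+                                      { return state_.sub(i+k, i+k+TAPS); }),
+                               replicate(coef_));
+
+       and the result is checked against arm_fir_q15 as reference. */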
init_coef(Q15 *coefs,uint16_t numTaps) + { + for(int i=0;i +struct FIR { + + FIR(const PVector &coefs):coef_(coefs),state_(T{}) + {}; + + + PVector filter(const PVector &signal) + { + constexpr int UNROLL_FACTOR = 4; + PVector res(T{}); + using acc_type = typename number_traits::accumulator; + std::array accu; + index_t i=0; + +#if defined(ARM_COMPUTE_DISABLE_UNROLL) + #pragma clang loop unroll(disable) +#endif + for(;i<=BLOCK-UNROLL_FACTOR;i+=UNROLL_FACTOR) + { + + state_.sub(TAPS-1+i,TAPS-1+i+UNROLL_FACTOR) = copy(signal.sub(i,i+UNROLL_FACTOR)); + + //INIT_SYSTICK; + //START_CYCLE_MEASUREMENT; + //startSectionNB(2); + results(accu) = + dot(unroll( + [i,this](index_t k){return state_.sub(i+k,i+k+TAPS);}), + replicate(coef_) + ); + //stopSectionNB(2); + //STOP_CYCLE_MEASUREMENT; + + for(index_t k=0;k coef_; + PVector state_; +}; + +template +static void test() +{ + constexpr int NB = BLOCK; + std::cout << "----\r\n(" << BLOCK << "," << TAPS << ")\r\n"; + + typename FirType::type S; + PVector signal; + PVector coefs; + + FirType::init_coef(coefs.ptr(),TAPS); + + init_array(signal,NB); + FIR fir(coefs); + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + PVector res = fir.filter(signal); + //PVector res; + //fir.purec(signal.const_ptr(),res.ptr()); + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + + T* state; + T* coefsb; + state=(T*)malloc(sizeof(T)*(TAPS+BLOCK+BLOCK)); + coefsb=(T*)malloc(sizeof(T)*(TAPS+32)); + memset(coefsb,0,sizeof(T)*(TAPS+32)); + for(int i =0;i::init_state(&S,TAPS,coefsb,state,BLOCK); + PVector ref; + //std::cout << "---\r\n"; + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + arm_fir_q15(&S, + reinterpret_cast(signal.const_ptr()), + reinterpret_cast(ref.ptr()),BLOCK); + STOP_CYCLE_MEASUREMENT; + + + if (!validate(res.const_ptr(),ref.const_ptr(),BLOCK)) + { + printf("fir failed \r\n"); + } + + free(state); + free(coefsb); + + +} + + + + +template +void all_filter_test() +{ + + title("FIR test"); + + + test(); + test(); + test(); + test(); + test(); + + test(); + test(); + test(); + test(); + test(); + + test(); + test(); + test(); + test(); + test(); + + + test(); + test(); + test(); + test(); + test(); + + +} + +void filter_test() +{ + //all_filter_test(); +} \ No newline at end of file diff --git a/dsppp/tests/fusion_test.cpp b/dsppp/tests/fusion_test.cpp new file mode 100644 index 000000000..0711cd1b6 --- /dev/null +++ b/dsppp/tests/fusion_test.cpp @@ -0,0 +1,247 @@ +extern "C" { + extern void fusion_test(); +} + +#include "allocator.h" + +#include + +#include +#include +#include +#include + +#include + +#include + +template +static void test() +{ + std::cout << "----\r\n" << "N = " << NB << "\r\n"; + + #if defined(STATIC_TEST) + PVector a; + PVector b; + PVector c; + #else + PVector a(NB); + PVector b(NB); + PVector c(NB); + #endif + + + init_array(a,NB); + init_array(b,NB); + init_array(c,NB); + + #if defined(STATIC_TEST) + PVector resa; + PVector resb; + #else + PVector resa(NB); + PVector resb(NB); + #endif + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + results(resa,resb) = Merged{a + b,a + c}; + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + PVector refa; + PVector refb; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + cmsisdsp_add(a.const_ptr(),b.const_ptr(),refa.ptr(),NB); + cmsisdsp_add(a.const_ptr(),c.const_ptr(),refb.ptr(),NB); + STOP_CYCLE_MEASUREMENT; + + if (!validate(resa.const_ptr(),refa.const_ptr(),NB)) + { + printf("add a failed \r\n"); + + } + + if (!validate(resb.const_ptr(),refb.const_ptr(),NB)) + { + 
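+      /* Context for this check: results(resa,resb) = Merged{a + b, a + c}
+         asks the library to evaluate both expressions together (the operand
+         a is shared and, presumably, loaded once per fused pass), while the
+         two cmsisdsp_add reference calls recompute the same sums separately
+         with arm_add. */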
printf("add b failed \r\n"); + + } + + std::cout << "=====\r\n"; +} + + +template +static void test2() +{ + std::cout << "----\r\n" << "N = " << NB << "\r\n"; + #if defined(STATIC_TEST) + PVector a; + PVector b; + PVector c; + #else + PVector a(NB); + PVector b(NB); + PVector c(NB); + #endif + using Acc = typename number_traits::accumulator; + + + init_array(a,NB); + init_array(b,NB); + init_array(c,NB); + + Acc resa,resb,refa,refb; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(2); + std::tie(resa,resb) = dot(Merged{expr(a),expr(a)}, + Merged{expr(b),expr(c)}); + stopSectionNB(2); + STOP_CYCLE_MEASUREMENT; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + cmsisdsp_dot(a.const_ptr(),b.const_ptr(),refa,NB); + cmsisdsp_dot(a.const_ptr(),c.const_ptr(),refb,NB); + STOP_CYCLE_MEASUREMENT; + + if (!validate(resa,refa)) + { + printf("dot a failed \r\n"); + + } + + if (!validate(resb,refb)) + { + printf("dot b failed \r\n"); + + } + + std::cout << "=====\r\n"; + + +} + +template +static void test3() +{ + std::cout << "----\r\n" << "N = " << NB << "\r\n"; + + constexpr int U = 2; + #if defined(STATIC_TEST) + PVector a[U]; + PVector b[U]; + #else + PVector a[U]={PVector(NB),PVector(NB)}; + PVector b[U]={PVector(NB),PVector(NB)}; + #endif + + using Acc = typename number_traits::accumulator; + + for(int i=0;i res; + Acc ref[U]; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(3); + results(res) = dot(unroll( + [&a](index_t k){return expr(a[k]);}), + unroll( + [&b](index_t k){return expr(b[k]);}) + ); + stopSectionNB(3); + STOP_CYCLE_MEASUREMENT; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + for(int i=0;i +void all_fusion_test() +{ + + const int nb_tails = TailForTests::tail; + const int nb_loops = TailForTests::loop; + + title("Vector Fusion"); + + test(); + test(); + test(); + test(); + test(); + test(); + + title("Dot Product Fusion"); + + test2(); + test2(); + test2(); + test2(); + test2(); + test2(); + + title("Unroll Fusion"); + + test3(); + test3(); + test3(); + test3(); + test3(); + test3(); +} + +void fusion_test() +{ +#if defined(FUSION_TEST) + #if defined(F64_DT) + all_fusion_test(); + #endif + #if defined(F32_DT) + all_fusion_test(); + #endif + #if defined(F16_DT) && !defined(DISABLEFLOAT16) + all_fusion_test(); + #endif + #if defined(Q31_DT) + all_fusion_test(); + #endif + #if defined(Q15_DT) + all_fusion_test(); + #endif + #if defined(Q7_DT) + all_fusion_test(); + #endif +#endif + +} \ No newline at end of file diff --git a/dsppp/tests/matrix_test.cpp b/dsppp/tests/matrix_test.cpp new file mode 100644 index 000000000..e18a827ae --- /dev/null +++ b/dsppp/tests/matrix_test.cpp @@ -0,0 +1,1863 @@ +extern "C" { + extern void matrix_test(); +} + +#include "allocator.h" + +#include +#include +#include +#include + +#include + +#include +#include "boost/mp11.hpp" +using namespace boost::mp11; + + +extern "C" { +#include "dsp/matrix_functions.h" +#include "dsp/matrix_utils.h" +} + +template +struct MatTestConstant; + +template<> +struct MatTestConstant +{ + constexpr static double value = 0.001; + constexpr static double half = 0.5; +}; + +template<> +struct MatTestConstant +{ + constexpr static float value = 0.001f; + constexpr static float half = 0.5f; +}; + +#if !defined(DISABLEFLOAT16) +template<> +struct MatTestConstant +{ + constexpr static float16_t value = (float16_t)0.001f; + constexpr static float16_t half = (float16_t)0.5f; + +}; +#endif + +template<> +struct MatTestConstant +{ + constexpr static Q7 value = 0.001_q7; + constexpr static Q7 half = 0.5_q7; +}; + 
+template<> +struct MatTestConstant +{ + constexpr static Q15 value = 0.001_q15; + constexpr static Q15 half = 0.5_q15; +}; + +template<> +struct MatTestConstant +{ + constexpr static Q31 value = 0.001_q31; + constexpr static Q31 half = 0.5_q31; +}; + + +template +struct ErrThreshold +{ + constexpr static float abserr = 0; + constexpr static float relerr = 0; + constexpr static float abserr_cholesky = 0; + constexpr static float relerr_cholesky = 0; + constexpr static float abserr_householder = 0; + constexpr static float relerr_householder = 0; + constexpr static float abserr_qr = 0; + constexpr static float relerr_qr = 0; + constexpr static float abserr_inv = 0; + constexpr static float relerr_inv = 0; +}; + +// Should be more accurate than F32 but right know +// we only check there is no regression compared to f32 +template<> +struct ErrThreshold +{ + constexpr static float abserr = ABS_ERROR; + constexpr static float relerr = REL_ERROR; + constexpr static float abserr_cholesky = 3e-4; + constexpr static float relerr_cholesky = 1e-4; + + constexpr static float abserr_householder = ABS_ERROR; + constexpr static float relerr_householder = REL_ERROR; + constexpr static float abserr_qr = ABS_ERROR; + constexpr static float relerr_qr = REL_ERROR; + + constexpr static float abserr_inv = ABS_ERROR; + constexpr static float relerr_inv = REL_ERROR; +}; + +template<> +struct ErrThreshold +{ + constexpr static float abserr = ABS_ERROR; + constexpr static float relerr = REL_ERROR; + constexpr static float abserr_cholesky = 3e-4; + constexpr static float relerr_cholesky = 1e-4; + + constexpr static float abserr_householder = ABS_ERROR; + constexpr static float relerr_householder = REL_ERROR; + constexpr static float abserr_qr = ABS_ERROR; + constexpr static float relerr_qr = REL_ERROR; + + constexpr static float abserr_inv = 4.0e-6; + constexpr static float relerr_inv = 5.0e-6; +}; + +#if !defined(DISABLEFLOAT16) +template<> +struct ErrThreshold +{ + constexpr static float abserr = ABS_ERROR; + constexpr static float relerr = REL_ERROR; + constexpr static float abserr_cholesky = 2e-1; + constexpr static float relerr_cholesky = 2e-1; + + constexpr static float abserr_householder = 2e-4; + constexpr static float relerr_householder = 2e-3; + // 32x32 is not numerically behaving well with + // the matrix used as input + constexpr static float abserr_qr = 2.0; + constexpr static float relerr_qr = 1e-2; + + constexpr static float abserr_inv = 3e-2; + constexpr static float relerr_inv = 3e-2; +}; +#endif + +void cmsisdsp_mat_inv(float64_t *amod, + float64_t* b, + uint32_t r,uint32_t c) +{ + arm_matrix_instance_f64 src; + arm_matrix_instance_f64 dst; + + + src.numRows = r; + src.numCols = c; + src.pData = amod; + + dst.numRows = r; + dst.numCols = c; + dst.pData = b; + + arm_status status = arm_mat_inverse_f64(&src,&dst); + (void)status; +}; + +void cmsisdsp_mat_inv(float32_t *amod, + float32_t* b, + uint32_t r,uint32_t c) +{ + arm_matrix_instance_f32 src; + arm_matrix_instance_f32 dst; + + + src.numRows = r; + src.numCols = c; + src.pData = amod; + + dst.numRows = r; + dst.numCols = c; + dst.pData = b; + + arm_status status = arm_mat_inverse_f32(&src,&dst); + (void)status; +}; + +#if !defined(DISABLEFLOAT16) +void cmsisdsp_mat_inv(float16_t *amod, + float16_t* b, + uint32_t r,uint32_t c) +{ + arm_matrix_instance_f16 src; + arm_matrix_instance_f16 dst; + + + src.numRows = r; + src.numCols = c; + src.pData = amod; + + dst.numRows = r; + dst.numCols = c; + dst.pData = b; + + arm_status status = 
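// The ErrThreshold tables above feed the validate() helpers used throughout this
// file: a computed element is accepted when it is close enough to the reference in
// either an absolute or a relative sense. validate() itself is provided elsewhere by
// the test harness; one common form of such a combined check, sketched with an
// illustrative name, is:
#include <cmath>

static bool within_tolerance(float computed, float reference,
                             float abserr, float relerr)
{
    const float delta = std::fabs(computed - reference);
    if (delta <= abserr) {
        return true;                                 // absolute tolerance
    }
    return delta <= relerr * std::fabs(reference);   // relative tolerance
}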
arm_mat_inverse_f16(&src,&dst); + (void)status; +}; +#endif + +const float32_t mat64[64] = {0.395744, 0.623798, 0.885422, 0.95415, 0.310384, 0.257541, + 0.631426, 0.424491, 0.130945, 0.799959, 0.133693, 0.479455, + 0.519254, 0.381039, 0.617455, 0.748273, 0.146944, 0.928945, + 0.430936, 0.508207, 0.829023, 0.358027, 0.999501, 0.851953, + 0.273895, 0.685898, 0.0436612, 0.295212, 0.467651, 0.0515567, + 0.21037, 0.607475, 0.570295, 0.281109, 0.979219, 0.0947969, + 0.319016, 0.398405, 0.349953, 0.710002, 0.431597, 0.447659, + 0.0747669, 0.057063, 0.165648, 0.773106, 0.135765, 0.709327, + 0.873836, 0.292361, 0.00202529, 0.392942, 0.520183, 0.0528055, + 0.797982, 0.613497, 0.509682, 0.0435791, 0.780526, 0.960582, + 0.535914, 0.216113, 0.134108, 0.225859}; + +const float32_t mat16[16] = {1.0, 2.0, 3.0, 4.0, 2.0, 4.0, 5.0, 6.0, + 3.0, 5.0, 9.0, 10.0, 4.0, 6.0, 10.0, 16.0}; + +const float32_t mat256[256] = {0.97936, 0.498105, 0.452618, 0.299761, 0.688624, 0.247212, \ + 0.228337, 0.22905, 0.563815, 0.251998, 0.5238, 0.141223, 0.0980689, \ + 0.79112, 0.771182, 0.890995, 0.0256181, 0.0377277, 0.575629, \ + 0.648138, 0.926218, 0.803878, 0.620333, 0.325635, 0.587355, 0.041795, \ + 0.934271, 0.0690131, 0.0240136, 0.800828, 0.522999, 0.374706, \ + 0.266977, 0.208028, 0.112878, 0.0389899, 0.658311, 0.205067, \ + 0.244172, 0.0762778, 0.190575, 0.677312, 0.0682093, 0.367328, \ + 0.0191464, 0.988968, 0.437477, 0.130622, 0.907823, 0.0116559, \ + 0.614526, 0.447443, 0.0126975, 0.995496, 0.947676, 0.659996, \ + 0.321547, 0.725415, 0.658426, 0.0243924, 0.0843519, 0.351748, \ + 0.974332, 0.673381, 0.375012, 0.719626, 0.721219, 0.766905, \ + 0.17065, 0.648905, 0.770983, 0.360008, 0.344226, 0.179633, 0.347905, \ + 0.555561, 0.742615, 0.908389, 0.806959, 0.176078, 0.872167, \ + 0.321839, 0.098607, 0.954515, 0.627286, 0.235082, 0.746179, 0.163606, \ + 0.899323, 0.871471, 0.712448, 0.956971, 0.736687, 0.750702, 0.843348, \ + 0.302435, 0.444862, 0.0644597, 0.765519, 0.518397, 0.765541, \ + 0.900375, 0.201853, 0.490325, 0.721786, 0.893647, 0.774724, \ + 0.0983631, 0.339887, 0.526084, 0.0786152, 0.515697, 0.438801, \ + 0.226628, 0.125093, 0.886642, 0.617766, 0.71696, 0.473172, 0.640949, \ + 0.67688, 0.676214, 0.453662, 0.345796, 0.608999, 0.904448, 0.0965741, \ + 0.00461771, 0.467399, 0.292235, 0.0418646, 0.116632, 0.0766192, \ + 0.269051, 0.411649, 0.0538381, 0.973959, 0.667106, 0.301662, \ + 0.977206, 0.891751, 0.420267, 0.441334, 0.0896179, 0.249969, \ + 0.672614, 0.623966, 0.609733, 0.320772, 0.39723, 0.845196, 0.653877, \ + 0.0599186, 0.340188, 0.199787, 0.598104, 0.45664, 0.920485, 0.969439, \ + 0.446555, 0.0932837, 0.0247635, 0.747644, 0.438759, 0.639154, \ + 0.754049, 0.379433, 0.968655, 0.0452146, 0.208123, 0.252654, \ + 0.261898, 0.608665, 0.145211, 0.395368, 0.799111, 0.697823, \ + 0.382906, 0.456515, 0.262579, 0.284169, 0.881488, 0.860877, 0.155548, \ + 0.537387, 0.804235, 0.311383, 0.183216, 0.677692, 0.829542, 0.406049, \ + 0.860392, 0.467668, 0.385633, 0.654692, 0.841125, 0.178406, \ + 0.668945, 0.369609, 0.809711, 0.454593, 0.632028, 0.605791, 0.643851, \ + 0.787023, 0.285633, 0.832216, 0.30892, 0.303559, 0.704898, 0.61118, \ + 0.435547, 0.173678, 0.788689, 0.319511, 0.648378, 0.635417, 0.125127, \ + 0.310251, 0.800819, 0.4863, 0.924361, 0.308059, 0.952175, 0.449844, \ + 0.215496, 0.257826, 0.556383, 0.259735, 0.197234, 0.0509903, 0.21474, \ + 0.145085, 0.41288, 0.876758, 0.096721, 0.228955, 0.0152248, 0.126501, \ + 0.28899, 0.336668, 0.580015, 0.932761, 0.989783, 0.667379, \ + 0.798751, 0.587173, 0.445902, 
0.041448, 0.311878, 0.0332857, \ + 0.401984, 0.795049, 0.8222, 0.678648, 0.807558}; + +template typename A> +void init_mat(Matrix &pDst,std::size_t r,std::size_t c) +{ + const float32_t *p; + if ((r==4) && (r==c)) + { + p = mat16; + } + + if ((r==8) && (r==c)) + { + p = mat64; + } + + if ((r==16) && (r==c)) + { + p = mat256; + } + + + for(std::size_t i=0;i typename A, + typename M> +void _matinv(const Matrix &a,M && res) +{ + + Matrix b = a; + + const vector_length_t nb_rows = a.rows(); + const vector_length_t nb_cols = a.columns(); + + + for(index_t r=0;r < nb_rows ; r++) + { + res.row(r) = T{}; + res(r,r) = number_traits::one(); + } + + + for(index_t c=0;c < nb_cols ; c++) + { + T pivot = b(c,c); + index_t selectedRow = c; + + + for(index_t r=c+1;r < nb_rows ; r++) + { + T newPivot = b(r,c); + if (_abs(newPivot)>_abs(pivot)) + { + pivot = newPivot; + selectedRow = r; + } + } + + if ((pivot!=T{}) && (selectedRow != c)) + { + swap(b.row(c,c),b.row(selectedRow,c)); + swap(res.row(c),res.row(selectedRow)); + } + else if (pivot == T{}) + { + break; + } + + pivot = number_traits::one() / pivot; + + b.row(c,c) *= pivot; + res.row(c) *= pivot; + + index_t r=0; + + for(;r < c ; r++) + { + const T tmp = b(r,c); + b.row(r,c) -= b.row(c,c)*tmp; + res.row(r) -= res.row(c)*tmp; + } + + for(r=c+1;r < nb_rows ; r++) + { + const T tmp = b(r,c); + b.row(r,c) -= b.row(c,c)*tmp; + res.row(r) -= res.row(c)*tmp; + } + + } + + +} + +template typename A, + typename std::enable_if<(NB>0),bool>::type = true> +Matrix matinv(const Matrix &a) +{ + Matrix res; + _matinv(a,res); + return(res); +} + +template typename A, + typename std::enable_if<(NB<0),bool>::type = true> +Matrix matinv(const Matrix &a) +{ + Matrix res(a.rows(),a.columns()); + return (_matinv(a,res)); + return(res); +} + +template typename A, + typename std::enable_if<(NB<0),bool>::type = true> +void matinv(Matrix &res, const Matrix &a) +{ + (void)_matinv(a,res); +} + + +template +void testinv() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + + #if defined(STATIC_TEST) + PMat a; + #else + PMat a(R,C); + #endif + + init_mat(a,R,C); + + #if !defined(STATIC_TEST) + PMat res(R,C); + #endif + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + #if defined(STATIC_TEST) + PMat res = matinv(a); + #else + matinv(res,a); + #endif + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + PMat amod(a); + PMat cmsis_res(R,C); + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + cmsisdsp_mat_inv(amod.ptr(),cmsis_res.ptr(),R,C); + STOP_CYCLE_MEASUREMENT; + + + + if (!validate(res.const_ptr(),cmsis_res.const_ptr(),R*C, + ErrThreshold::abserr_inv,ErrThreshold::relerr_inv)) + { + printf("inv failed \r\n"); + + } + + std::cout << "=====\r\n"; + +} + +template +void testadd() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + + #if defined(STATIC_TEST) + PMat a; + PMat b; + #else + PMat a(R,C); + PMat b(R,C); + #endif + + init_array(a,R*C); + init_array(b,R*C); + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + #if defined(STATIC_TEST) + PMat res = a+b; + #else + PMat res = a+b; + #endif + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + //PrintType(); + //PrintType(); +// + //std::cout << "a: " << IsVector::value << "\r\n"; + //std::cout << "b: " << IsVector::value << "\r\n"; + //std::cout << "a+b: " << IsVector::value << "\r\n"; + //std::cout << "res: " << IsVector::value << "\r\n"; + //std::cout << "same: " << SameElementType::value << "\r\n"; +// + //std::cout << "vec inst: " << has_vector_inst() << 
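// The _matinv() routine above is a Gauss-Jordan elimination with partial pivoting:
// for each column it selects the row with the largest pivot, swaps it in, normalizes
// that row, and eliminates the column from all other rows, applying the same row
// operations to an identity matrix that ends up holding the inverse. A compact
// scalar sketch on raw row-major storage (illustrative name, no handling of the
// singular case that _matinv() exits on via break):
#include <algorithm>
#include <cmath>
#include <utility>

static void gauss_jordan_inverse(float *a, float *inv, int n)
{
    for (int r = 0; r < n; r++)                       // start from the identity
        for (int c = 0; c < n; c++)
            inv[r * n + c] = (r == c) ? 1.0f : 0.0f;

    for (int c = 0; c < n; c++) {
        int sel = c;                                  // partial pivoting
        for (int r = c + 1; r < n; r++)
            if (std::fabs(a[r * n + c]) > std::fabs(a[sel * n + c]))
                sel = r;
        if (sel != c)
            for (int k = 0; k < n; k++) {
                std::swap(a[c * n + k], a[sel * n + k]);
                std::swap(inv[c * n + k], inv[sel * n + k]);
            }

        const float pivot = 1.0f / a[c * n + c];      // normalize the pivot row
        for (int k = 0; k < n; k++) {
            a[c * n + k] *= pivot;
            inv[c * n + k] *= pivot;
        }
        for (int r = 0; r < n; r++) {                 // eliminate the other rows
            if (r == c) continue;
            const float f = a[r * n + c];
            for (int k = 0; k < n; k++) {
                a[r * n + k] -= f * a[c * n + k];
                inv[r * n + k] -= f * inv[c * n + k];
            }
        }
    }
}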
"\r\n"; + //std::cout << "vec index pair: " << vector_idx_pair() << "\r\n"; + //std::cout << "must use mat idx: " << must_use_matrix_idx_pair() << "\r\n"; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #if defined(STATIC_TEST) + PMat cmsis_res; + #else + PMat cmsis_res(R,C); + #endif + cmsisdsp_mat_add(a.const_ptr(),b.const_ptr(),cmsis_res.ptr(),R,C); + STOP_CYCLE_MEASUREMENT; + + + + if (!validate(res.const_ptr(),cmsis_res.const_ptr(),R*C, + ErrThreshold::abserr,ErrThreshold::relerr)) + { + printf("add failed \r\n"); + } + + std::cout << "=====\r\n"; + +} + + +template +void testdiag() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + #if defined(STATIC_TEST) + PVector a; + #else + PVector a(R); + #endif + init_array(a,R); + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + #if defined(STATIC_TEST) + PMat res=PMat::diagonal(a); + #else + PMat res=PMat::diagonal(a); + #endif + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + + const T* ap = a.const_ptr(); + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #if defined(STATIC_TEST) + PMat cmsis_res; + #else + PMat cmsis_res(R,C); + #endif + T* refp = cmsis_res.ptr(); + + UNROLL_LOOP + for(index_t row=0;row < R; row++) + { + UNROLL_LOOP + for(index_t col=0;col < C; col++) + { + if (row != col) + { + refp[row*C+col] = T{}; + } + else + { + refp[row*C+col] = ap[row]; + } + } + } + STOP_CYCLE_MEASUREMENT; + + + + if (!validate(res.const_ptr(),cmsis_res.const_ptr(),R*C, + ErrThreshold::abserr,ErrThreshold::relerr)) + { + printf("diag failed \r\n"); + } + + std::cout << "=====\r\n"; +} + + + +template +void testouter() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + + PVector a; + PVector b; + init_array(a,R); + init_array(b,C); + + b = b + b; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + PMat res = outer(a,b); + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(2); + #if defined(STATIC_TEST) + PMat cmsis_res; + #else + PMat cmsis_res(R,C); + #endif + CMSISOuter::run(a.const_ptr(),b.const_ptr(),cmsis_res.ptr(),R,C); + startSectionNB(2); + STOP_CYCLE_MEASUREMENT; + + //std::cout<::abserr,ErrThreshold::relerr)) + { + printf("outer failed \r\n"); + } + + std::cout << "=====\r\n"; + +} + +template +void testview() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + + #if defined(STATIC_TEST) + PVector a; + #else + PVector a(R); + #endif + init_array(a,R); + + #if defined(STATIC_TEST) + PMat res=PMat::diagonal(a); + #else + PMat res=PMat::diagonal(a); + #endif + //std::cout << res; + constexpr int subsize = 8; + constexpr int subpos = 8; + auto r = res.sub(Slice(subpos,subpos+subsize),Slice(subpos,subpos+subsize)); + + #if defined(STATIC_TEST) + PMat resb; + #else + PMat resb(subsize,subsize); + #endif + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + resb = r+r; + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + //std::cout << IsMatrix::value << "\r\n"; + + PMat cmsis_res; + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(2); + DISABLE_LOOP_UNROLL + for(index_t row=0;row < subsize ; row++) + { + DISABLE_LOOP_UNROLL + for(index_t col=0;col < subsize ; col++) + { + cmsis_res(row,col) = r(row,col)+r(row,col); + } + } + startSectionNB(2); + STOP_CYCLE_MEASUREMENT; + + //std::cout<::abserr,ErrThreshold::relerr)) + { + printf("sub matrix failed \r\n"); + } + + std::cout << "=====\r\n"; + +} + + + +template +void testmatvec() +{ + + using STO = typename 
vector_traits::storage_type; + + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + + #if defined(STATIC_TEST) + PVector a; + #else + PVector a(C); + #endif + init_array(a,C); + + #if defined(STATIC_TEST) + PMat m; + #else + PMat m(R,C); + #endif + init_array(m,R*C); + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + #if defined(STATIC_TEST) + PVector res = dot(m,a); + #else + PVector res = dot(m,a); + #endif + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + //std::cout << IsMatrix::value << "\r\n"; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #if defined(STATIC_TEST) + PVector cmsis_res; + #else + PVector cmsis_res(R); + #endif + typename CMSISMatrixType::type S; + S.numRows = R; + S.numCols = C; + S.pData = reinterpret_cast(const_cast(m.ptr())); + + + startSectionNB(2); + cmsis_mat_vec_mult(&S, a.const_ptr(), cmsis_res.ptr()); + startSectionNB(2); + STOP_CYCLE_MEASUREMENT; + + //std::cout << cmsis_res; + + if (!validate(res.const_ptr(),cmsis_res.const_ptr(),R, + ErrThreshold::abserr,ErrThreshold::relerr)) + { + printf("matrix times vector failed \r\n"); + } + std::cout << "=====\r\n"; + +} + +template +void testcomplexmatvec() +{ + const T scalar = MatTestConstant::half; + using STO = typename vector_traits::storage_type; + + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + + #if defined(STATIC_TEST) + PVector a; + PVector b; + #else + PVector a(C); + PVector b(C); + #endif + init_array(a,C); + init_array(b,C); + + #if defined(STATIC_TEST) + PMat m; + #else + PMat m(R,C); + #endif + init_array(m,R*C); + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + #if defined(STATIC_TEST) + PVector tmpv = a + b * scalar; + PVector res = dot(m,tmpv); + #else + PVector tmpv = a + b * scalar; + PVector res = dot(m,tmpv); + #endif + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + //std::cout << IsMatrix::value << "\r\n"; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #if defined(STATIC_TEST) + PVector cmsis_res; + PVector tmp; + #else + PVector cmsis_res(R); + PVector tmp(C); + #endif + typename CMSISMatrixType::type S; + S.numRows = R; + S.numCols = C; + S.pData = reinterpret_cast(const_cast(m.ptr())); + + + startSectionNB(2); + cmsis_complex_mat_vec(&S, + a.const_ptr(), + b.const_ptr(), + scalar, + tmp.ptr(), + cmsis_res.ptr()); + startSectionNB(2); + STOP_CYCLE_MEASUREMENT; + + + + //std::cout << cmsis_res; + + if (!validate(res.const_ptr(),cmsis_res.const_ptr(),R, + ErrThreshold::abserr,ErrThreshold::relerr)) + { + printf("matrix times vector expression failed \r\n"); + } + + std::cout << "=====\r\n"; + +} + + +template +void testmatmult() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << K << " x " << C << "\r\n"; + + using S = typename CMSISMatrixType::scalar; + + #if defined(STATIC_TEST) + PMat ma; + #else + PMat ma(R,K); + #endif + init_array(ma,R*K); + + #if defined(STATIC_TEST) + PMat mb; + #else + PMat mb(K,C); + #endif + init_array(mb,K*C); + + + + mb += TestConstant::small; + + //std::cout << ma; + //std::cout << mb; + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + #if defined(STATIC_TEST) + PMat res = dot(ma,mb); + #else + PMat res = dot(ma,mb); + #endif + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + //PrintType(); + //PrintType(); + //std::cout << ma; + //std::cout << mb; + //std::cout << res; + + + //std::cout << IsMatrix::value << "\r\n"; + + PMat tmp(C,K); + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #if defined(STATIC_TEST) + PMat cmsis_res; + #else + PMat cmsis_res(R,C); + 
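// The quantity checked by testcomplexmatvec() above is res = m * (a + scalar * b):
// the vector expression is evaluated first (tmpv / tmp) and then multiplied by the
// matrix, on both the dsppp and the CMSIS-DSP side. A plain row-major reference for
// that computation, with illustrative names:
#include <cstddef>
#include <vector>

static void mat_vec_expr(const std::vector<float> &m,   // R x C, row-major
                         const std::vector<float> &a,   // length C
                         const std::vector<float> &b,   // length C
                         float scalar,
                         std::vector<float> &res,       // length R
                         std::size_t R, std::size_t C)
{
    for (std::size_t r = 0; r < R; r++) {
        float acc = 0.0f;
        for (std::size_t c = 0; c < C; c++)
            acc += m[r * C + c] * (a[c] + scalar * b[c]);
        res[r] = acc;
    }
}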
#endif + + + typename CMSISMatrixType::type SA; + SA.numRows = R; + SA.numCols = K; + SA.pData = reinterpret_cast(ma.ptr()); + + typename CMSISMatrixType::type SB; + SB.numRows = K; + SB.numCols = C; + SB.pData = reinterpret_cast(mb.ptr()); + + typename CMSISMatrixType::type RES; + RES.numRows = R; + RES.numCols = C; + RES.pData = reinterpret_cast(cmsis_res.ptr()); + + + startSectionNB(2); + cmsis_mat_mult(&SA, &SB, &RES,reinterpret_cast(tmp.ptr())); + startSectionNB(2); + STOP_CYCLE_MEASUREMENT; + + + //std::cout << cmsis_res; + + if (!validate(res,cmsis_res, + ErrThreshold::abserr,ErrThreshold::relerr)) + { + printf("matrix times matrix expression failed \r\n"); + } + + std::cout << "=====\r\n"; + +} + +template +void testsubmatmult() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << K << " x " << C << "\r\n"; + + using S = typename CMSISMatrixType::scalar; + constexpr int TOTALA = 4 + 2*K + 2*R + K*R; + constexpr int TOTALB = 4 + 2*C + 2*K + C*K; + + #if defined(STATIC_TEST) + PMat ma; + #else + PMat ma(R+2,K+2); + #endif + init_array(ma,TOTALA); + + #if defined(STATIC_TEST) + PMat mb; + #else + PMat mb(K+2,C+2); + #endif + init_array(mb,TOTALB); + + + + mb += MatTestConstant::value; + + //std::cout << ma; + //std::cout << mb; + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #if defined(STATIC_TEST) + PMat res(T{}); + #else + PMat res(R,C,T{}); + #endif + startSectionNB(1); + res.sub(Slice(0,R),Slice(0,C)) = copy(dot(ma.sub(Slice(0,R),Slice(0,K)),mb.sub(Slice(0,K),Slice(0,C)))); + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + //PrintType(); + //PrintType(); + //std::cout << ma; + //std::cout << mb; + //std::cout << res; + + + //std::cout << IsMatrix::value << "\r\n"; + + PMat cmsis_res(R,C); + PMat cmsis_ma(R,K); + PMat cmsis_mb(K,C); + PMat tmp(C,K); + + typename CMSISMatrixType::type SA; + SA.numRows = R; + SA.numCols = K; + SA.pData = reinterpret_cast(cmsis_ma.ptr()); + + typename CMSISMatrixType::type SB; + SB.numRows = K; + SB.numCols = C; + SB.pData = reinterpret_cast(cmsis_mb.ptr()); + + typename CMSISMatrixType::type RES; + RES.numRows = R; + RES.numCols = C; + RES.pData = reinterpret_cast(cmsis_res.ptr()); + + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(2); + cmsis_ma = copy(ma.sub(Slice(0,R),Slice(0,K))); + cmsis_mb = copy(mb.sub(Slice(0,K),Slice(0,C))); + cmsis_mat_mult(&SA, &SB, &RES,reinterpret_cast(tmp.ptr())); + startSectionNB(2); + STOP_CYCLE_MEASUREMENT; + + + //std::cout << cmsis_res; + + if (!validate(res.sub(Slice(0,R),Slice(0,C)),cmsis_res, + ErrThreshold::abserr,ErrThreshold::relerr)) + { + printf("matrix times matrix expression failed \r\n"); + } + + + std::cout << "=====\r\n"; +} + + +template +void testmattranspose() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + + #if defined(STATIC_TEST) + PMat ma; + #else + PMat ma(R,C); + #endif + init_array(ma,R*C); + + + //PrintType(); + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + #if defined(STATIC_TEST) + PMat res = ma.transpose(); + #else + PMat res = ma.transpose(); + #endif + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + //std::cout << IsMatrix::value << "\r\n"; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #if defined(STATIC_TEST) + PMat cmsis_res; + #else + PMat cmsis_res(C,R); + #endif + + typename CMSISMatrixType::type SA; + SA.numRows = R; + SA.numCols = C; + SA.pData = reinterpret_cast::scalar*>(ma.ptr()); + + typename CMSISMatrixType::type RES; + RES.numRows = C; + RES.numCols = R; + RES.pData = 
reinterpret_cast::scalar*>(cmsis_res.ptr()); + + + startSectionNB(2); + cmsis_mat_trans(&SA, &RES); + startSectionNB(2); + STOP_CYCLE_MEASUREMENT; + + //std::cout << cmsis_res; + + if (!validate(res,cmsis_res, + ErrThreshold::abserr,ErrThreshold::relerr)) + { + printf("matrix transpose failed \r\n"); + } + + + std::cout << "=====\r\n"; +} + + +#if !defined(DISABLEFLOAT16) +static float16_t _gen_sqrt(const float16_t v) +{ + return((float16_t)sqrtf(v)); +} +#endif + +static float32_t _gen_sqrt(const float32_t v) +{ + return(sqrtf(v)); +} + +static float64_t _gen_sqrt(const float64_t v) +{ + return(sqrt(v)); +} + +template typename A, + typename V,typename T> +inline T _householder(Vector &res,const V&v,const T eps) +{ + T alpha = v[0]; + T tau; + T beta; + if (v.length()==1) + { + res[0] = T{}; + return(T{}); + } + T xnorm2 = dot(v.sub(1),v.sub(1)); + + //std::cout << xnorm2 << "\r\n"; + if (xnorm2 <= eps) + { + tau = T{}; + res = T{}; + } + else + { + if (alpha<=0) + { + beta = _gen_sqrt(alpha*alpha+xnorm2); + } + else + { + beta = -_gen_sqrt(alpha*alpha+xnorm2); + } + T r = number_traits::one() / (alpha - beta); + res = v * r; + tau = (beta - alpha)/beta; + res[0] = number_traits::one(); + } + return(tau); +} + +template::value && + SameElementType::value && + IsVector::value,bool>::type = true> +auto householder(const V&v,const T threshold) +{ + constexpr int NB = StaticLength::value; + Vector res; + T beta = _householder(res,v,threshold); + return std::tuple>(beta,res); +} + +template::value && + SameElementType::value && + IsVector::value,bool>::type = true> +auto householder(const V&v,const T threshold) +{ + Vector res(v.length()); + T beta = _householder(res,v,threshold); + return std::tuple>(beta,res); +} + +template::value && + SameElementType::value && + IsVector::value,bool>::type = true> +auto householder(const V&v,const T threshold,TMP &res) +{ + T beta = _householder(res,v,threshold); + return beta; +} + +template +struct HouseholderThreshold; + +#if !defined(DISABLEFLOAT16) +template<> +struct HouseholderThreshold +{ + static constexpr float16_t value = DEFAULT_HOUSEHOLDER_THRESHOLD_F16; +}; +#endif + +template<> +struct HouseholderThreshold +{ + static constexpr float64_t value = DEFAULT_HOUSEHOLDER_THRESHOLD_F64; +}; + + +template<> +struct HouseholderThreshold +{ + static constexpr float32_t value = DEFAULT_HOUSEHOLDER_THRESHOLD_F32; +}; + + +template +static void testHouseholder() +{ + std::cout << "----\r\n" << "N = " << NB << "\r\n"; + #if defined(STATIC_TEST) + PVector a; + #else + PVector a(NB); + #endif + + cmsis_init_householder(a.ptr(),NB); + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + auto res = householder(a,HouseholderThreshold::value); + //PVector res;// = a + b; + //float res_beta=0; + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #if defined(STATIC_TEST) + PVector ref; + #else + PVector ref(NB); + #endif + T ref_beta = cmsis_householder(a.const_ptr(),ref.ptr(),NB); + STOP_CYCLE_MEASUREMENT; + + if (!validate(std::get<1>(res).const_ptr(),ref.const_ptr(),NB, + ErrThreshold::abserr_householder,ErrThreshold::relerr_householder)) + { + printf("householder vector failed \r\n"); + } + + + if (!validate(std::get<0>(res),ref_beta, + ErrThreshold::abserr_householder,ErrThreshold::relerr_householder)) + { + printf("householder beta failed \r\n"); + } + std::cout << "=====\r\n"; +} + +#include "debug_mat.h" + +#if 1 +// R >= C +template typename A> +auto QR(const Matrix&m,const T eps,bool wantQ) +{ 
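// _householder() above builds a Householder reflector: it returns tau and a vector v
// with v[0] == 1 such that (I - tau * v * v^T) maps the input onto (beta, 0, ..., 0),
// choosing the sign of beta opposite to the leading element to avoid cancellation in
// alpha - beta. A scalar sketch of the same construction (std::vector<float> standing
// in for the Vector type, illustrative name, length-1 special case omitted):
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

static float householder_reflector(std::vector<float> &v, float eps)
{
    const float alpha = v[0];
    float xnorm2 = 0.0f;
    for (std::size_t i = 1; i < v.size(); i++)      // squared norm of the tail
        xnorm2 += v[i] * v[i];

    if (xnorm2 <= eps) {                            // already of the form (alpha, 0, ..., 0)
        std::fill(v.begin(), v.end(), 0.0f);
        return 0.0f;                                // tau == 0 means "no reflection"
    }
    const float beta = (alpha <= 0.0f) ?  std::sqrt(alpha * alpha + xnorm2)
                                       : -std::sqrt(alpha * alpha + xnorm2);
    const float r = 1.0f / (alpha - beta);
    for (std::size_t i = 1; i < v.size(); i++)
        v[i] *= r;                                  // scale the tail of the reflector
    v[0] = 1.0f;                                    // conventional leading one
    return (beta - alpha) / beta;                   // tau
}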
+ #if defined(STATIC_TEST) + Vector tau; + Matrix RM = m; + Matrix Q = Matrix::identity(); + + + // Temporaries + Vector tmpvec; + Matrix tmpmat; + #else + Vector tau(m.columns()); + Matrix RM = m; + Matrix Q = Matrix::identity(m.rows()); + + + // Temporaries + Vector tmpvec(m.rows()); + Matrix tmpmat(1,m.rows()); + #endif + + const int NBC = m.columns(); + const int NBR = m.rows(); + + + for(index_t c=0;c vt(tmpvec,1,NBR-c); + dot(tmpmat.sub(0,1,0,NBC-c),vt,RM.sub(c,c)); + + RM.sub(c,c) = + RM.sub(c,c) - beta * outer(tmpvec.sub(0,NBR-c),tmpmat.row(0,0,NBC-c)); + + // Copy householder reflector + // Not valid when c == C-1 + // We don't want to use a test since CMSIS-DSP is not using + // one and introducing a test would give worse performance + RM.col(c,c+1) = copy(tmpvec.sub(1,NBR-c)); + + } + + + auto beta = householder(RM.col(NBC-1,NBC-1),eps,tmpvec); + tau[NBC-1] = beta; + + MatrixView vt(tmpvec,1,NBR-(NBC-1)); + dot(tmpmat.sub(0,1,0,NBC-(NBC-1)),vt,RM.sub(NBC-1,NBC-1)); + + RM.sub(NBC-1,NBC-1) = + RM.sub(NBC-1,NBC-1) - beta * outer(tmpvec.sub(0,NBR-(NBC-1)),tmpmat.row(0,0,NBC-(NBC-1))); + + + + + if (wantQ) + { + for(index_t c=NBC-1;c>=0;c--) + { + tmpvec.sub(1) = copy(RM.col(c,c+1)); + tmpvec[0] = number_traits::one(); + + MatrixView vt(tmpvec,1,NBR-c); + dot(tmpmat.sub(0,1,0,NBR-c),vt,Q.sub(c,c)); + + Q.sub(c,c) = + Q.sub(c,c) - tau[c] * outer(tmpvec.sub(0,NBR-c),tmpmat.row(0,0,NBR-c)); + + } + } + + return std::make_tuple(RM,Q,tau); + +} + +template +static void testQR() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << C << "\r\n"; + #if defined(STATIC_TEST) + PMat a; + #else + PMat a(R,C); + #endif + + cmsis_init_qr(a.ptr(),R,C); + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + auto res = QR(a,HouseholderThreshold::value,true); + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + //std::cout << "next\r\n"; + + //std::cout << std::get<0>(res); + //std::cout << std::get<1>(res); + //std::cout << std::get<2>(res); + + // For fair comparison, in dynamic mode we must take into + // account the memory allocations since they are made + // by the QR algorithms + #if !defined(STATIC_TEST) + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #endif + + #if 0 //defined(STATIC_TEST) + PMat cmsis_res; + PMat cmsis_outRp; + PMat cmsis_outQp; + PVector cmsis_tau; + PVector cmsis_tmpa; + PVector cmsis_tmpb; + #else + PMat cmsis_res(R,C); + PMat cmsis_outRp(R,C); + PMat cmsis_outQp(R,R); + PVector cmsis_tau(C); + PVector cmsis_tmpa(R); + PVector cmsis_tmpb(C); + #endif + + typename CMSISMatrixType::type RP; + RP.numRows = R; + RP.numCols = C; + RP.pData = cmsis_outRp.ptr(); + + typename CMSISMatrixType::type QP; + QP.numRows = R; + QP.numCols = R; + QP.pData = cmsis_outQp.ptr(); + + typename CMSISMatrixType::type IN; + IN.numRows = R; + IN.numCols = C; + IN.pData = a.ptr(); + + //std::cout << "-------\r\n"; + + #if defined(STATIC_TEST) + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + #endif + arm_status status=cmsis_qr(&IN,HouseholderThreshold::value, + &RP,&QP, + cmsis_tau.ptr(), + cmsis_tmpa.ptr(), + cmsis_tmpb.ptr()); + (void)status; + STOP_CYCLE_MEASUREMENT; + + //std::cout << cmsis_outRp; + //std::cout << cmsis_outQp; + //std::cout << cmsis_tau; + + if (!validate(std::get<0>(res),cmsis_outRp, + ErrThreshold::abserr_qr,ErrThreshold::relerr_qr)) + { + printf("QR Rp matrix failed \r\n"); + } + + + if (!validate(std::get<1>(res),cmsis_outQp, + ErrThreshold::abserr_qr,ErrThreshold::relerr_qr)) + { + printf("QR Qp matrix failed \r\n"); + } + + if 
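// Each step of the QR loop above applies the current reflector to the trailing
// block: it forms w = v^T * A (stored in tmpmat) and then performs the rank-1
// update A <- A - tau * v * w, first on RM and later, in reverse order, on Q.
// The same update in plain row-major form, with illustrative names:
#include <cstddef>
#include <vector>

static void apply_reflector(std::vector<float> &A, std::size_t m, std::size_t n,
                            const std::vector<float> &v, float tau)
{
    std::vector<float> w(n, 0.0f);                 // w = v^T * A
    for (std::size_t i = 0; i < m; i++)
        for (std::size_t j = 0; j < n; j++)
            w[j] += v[i] * A[i * n + j];

    for (std::size_t i = 0; i < m; i++)            // rank-1 update A -= tau * v * w
        for (std::size_t j = 0; j < n; j++)
            A[i * n + j] -= tau * v[i] * w[j];
}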
(!validate(std::get<2>(res),cmsis_tau, + ErrThreshold::abserr_qr,ErrThreshold::relerr_qr)) + { + printf("QR tau failed \r\n"); + } + std::cout << "=====\r\n"; +} + +#endif + + +template typename A> +auto cholesky(const Matrix&a) +{ + // Temporaries + #if defined(STATIC_TEST) + Matrix g = a; + Vector tmp; + #else + Matrix g = a; + Vector tmp(a.rows()); + #endif + + const int NBR = a.rows(); + + g.col(0,0) = g.col(0,0) * (T)(number_traits::one() / _gen_sqrt(g(0,0))); + + for(int j=1;j::one() / _gen_sqrt(g(j,j)- tmp[j])); + + } + return(g); +} + + +template +static void testCholesky() +{ + std::cout << "----\r\n"; + std::cout << R << " x " << R << "\r\n"; + #if defined(STATIC_TEST) + PMat a; + #else + PMat a(R,R); + #endif + + cmsis_init_cholesky(a.ptr(),R,R); + + //std::cout << a; + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + startSectionNB(1); + // Not totally equivalent to CMSIS implementation + // It should be possible to rewrite it to avoid use of + // temporary buffer like CMSIS-DSP + auto res = cholesky(a); + stopSectionNB(1); + STOP_CYCLE_MEASUREMENT; + + + //std::cout << res; + + PMat cmsis_res(T{}); + + typename CMSISMatrixType::type OUT; + OUT.numRows = R; + OUT.numCols = R; + OUT.pData = cmsis_res.ptr(); + + + typename CMSISMatrixType::type IN; + IN.numRows = R; + IN.numCols = R; + IN.pData = a.ptr(); + + //std::cout << "-------\r\n"; + + + INIT_SYSTICK; + START_CYCLE_MEASUREMENT; + arm_status status=cmsis_cholesky(&IN,&OUT); + (void)status; + STOP_CYCLE_MEASUREMENT; + + //std::cout << cmsis_res; + + + if (!validateLT(res,cmsis_res, + ErrThreshold::abserr_cholesky,ErrThreshold::relerr_cholesky)) + { + printf("cholesky failed \r\n"); + } + std::cout << "=====\r\n"; +} + +template +struct TESTINV +{ + static void all() + { + testinv(); + } +}; + +template +struct TESTOUTER +{ + static void all() + { + testouter(); + } +}; + +template +struct TESTMATVEC +{ + static void all() + { + testmatvec(); + } +}; + +template +struct TESTCOMPLEXMATVEC +{ + static void all() + { + testcomplexmatvec(); + } +}; + +template +struct TESTADD +{ + static void all() + { + testadd (); + } +}; + +template +struct TESTMATTRANSPOSE +{ + static void all() + { + testmattranspose(); + } +}; + +template +struct TESTMATMULT +{ + static void all() + { + testmatmult(); + } +}; + +template +struct TESTSUBMATMULT +{ + static void all() + { + testsubmatmult(); + } +}; + + +template +struct TEST_CASES +{ + static void all() + { + (mp_push_front::all(),...); + } +}; + +template
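// cholesky() above (and the cmsis_cholesky reference it is compared against) computes
// a lower-triangular G with G * G^T = A for a symmetric positive-definite A; only the
// lower triangle is validated (validateLT). A plain row-major scalar sketch of the
// same factorization, with illustrative names and no check for a non-positive pivot:
#include <cmath>
#include <vector>

static void cholesky_lower(const std::vector<float> &a,   // n x n, row-major, SPD
                           std::vector<float> &g,         // n x n, upper part not written
                           int n)
{
    for (int j = 0; j < n; j++) {
        for (int i = j; i < n; i++) {
            float sum = a[i * n + j];
            for (int k = 0; k < j; k++)            // remove contributions of earlier columns
                sum -= g[i * n + k] * g[j * n + k];
            g[i * n + j] = (i == j) ? std::sqrt(sum)
                                    : sum / g[j * n + j];
        }
    }
}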