libjpeg-turbo-3.0.2-e2k.patch

From 5c6ff06bc9aec237e1ba222a3dde057cbfa81c9d Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <jpegqs@gmail.com>
Date: Fri, 9 Feb 2024 09:57:55 +0700
Subject: [PATCH] libjpeg-turbo-3.0.2 e2k support

---
 CMakeLists.txt           |   5 +
 simd/CMakeLists.txt      |  23 ++
 simd/e2k/jccolext-e2k.c  | 213 +++++++++++
 simd/e2k/jccolor-e2k.c   | 163 +++++++++
 simd/e2k/jchuff-e2k.c    | 307 ++++++++++++++++
 simd/e2k/jcphuff-e2k.c   | 145 ++++++++
 simd/e2k/jcsample-e2k.c  | 203 +++++++++++
 simd/e2k/jcsample.h      |  28 ++
 simd/e2k/jdcolext-e2k.c  | 258 +++++++++++++
 simd/e2k/jdcolor-e2k.c   | 289 +++++++++++++++
 simd/e2k/jdcoltab-e2k.c  |  80 ++++
 simd/e2k/jdsample-e2k.c  | 389 ++++++++++++++++++++
 simd/e2k/jfdctflt-e2k.c  | 127 +++++++
 simd/e2k/jfdctfst-e2k.c  | 145 ++++++++
 simd/e2k/jfdctint-e2k.c  | 255 +++++++++++++
 simd/e2k/jidctflt-e2k.c  | 215 +++++++++++
 simd/e2k/jidctfst-e2k.c  | 187 ++++++++++
 simd/e2k/jidctint-e2k.c  | 294 +++++++++++++++
 simd/e2k/jquantf-e2k.c   | 121 +++++++
 simd/e2k/jquanti-e2k.c   | 178 +++++++++
 simd/e2k/jsimd.c         | 761 +++++++++++++++++++++++++++++++++++++++
 simd/e2k/jsimd_api_e2k.h |  94 +++++
 simd/e2k/jsimd_e2k.h     | 207 +++++++++++
 23 files changed, 4687 insertions(+)
 create mode 100644 simd/e2k/jccolext-e2k.c
 create mode 100644 simd/e2k/jccolor-e2k.c
 create mode 100644 simd/e2k/jchuff-e2k.c
 create mode 100644 simd/e2k/jcphuff-e2k.c
 create mode 100644 simd/e2k/jcsample-e2k.c
 create mode 100644 simd/e2k/jcsample.h
 create mode 100644 simd/e2k/jdcolext-e2k.c
 create mode 100644 simd/e2k/jdcolor-e2k.c
 create mode 100644 simd/e2k/jdcoltab-e2k.c
 create mode 100644 simd/e2k/jdsample-e2k.c
 create mode 100644 simd/e2k/jfdctflt-e2k.c
 create mode 100644 simd/e2k/jfdctfst-e2k.c
 create mode 100644 simd/e2k/jfdctint-e2k.c
 create mode 100644 simd/e2k/jidctflt-e2k.c
 create mode 100644 simd/e2k/jidctfst-e2k.c
 create mode 100644 simd/e2k/jidctint-e2k.c
 create mode 100644 simd/e2k/jquantf-e2k.c
 create mode 100644 simd/e2k/jquanti-e2k.c
 create mode 100644 simd/e2k/jsimd.c
 create mode 100644 simd/e2k/jsimd_api_e2k.h
 create mode 100644 simd/e2k/jsimd_e2k.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index adb0ca4..3b445a0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,6 +125,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR
 elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^ppc" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^powerpc")
   set(CPU_TYPE powerpc)
+elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "e2k" OR
+  CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "elbrus")
+  set(CPU_TYPE e2k)
 else()
   set(CPU_TYPE ${CMAKE_SYSTEM_PROCESSOR_LC})
 endif()
@@ -906,6 +909,8 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
   elseif(CPU_TYPE STREQUAL "x86_64")
     set(DEFAULT_FLOATTEST8 no-fp-contract)
   endif()
+elseif(WITH_SIMD AND CPU_TYPE STREQUAL "e2k")
+  set(DEFAULT_FLOATTEST8 sse)
 elseif(CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64")
   if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
     if(CMAKE_C_COMPILER_VERSION VERSION_EQUAL 14.0.0 OR
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 6024900..789fa3f 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -531,6 +531,29 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
   set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
 endif()
 
+###############################################################################
+# Elbrus (Intrinsics)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "e2k")
+
+set(SIMD_SOURCES e2k/jquanti-e2k.c e2k/jquantf-e2k.c
+  e2k/jccolor-e2k.c e2k/jcsample-e2k.c
+  e2k/jdcolor-e2k.c e2k/jdsample-e2k.c
+  e2k/jfdctint-e2k.c e2k/jfdctfst-e2k.c e2k/jfdctflt-e2k.c
+  e2k/jidctint-e2k.c e2k/jidctfst-e2k.c e2k/jidctflt-e2k.c
+  e2k/jchuff-e2k.c e2k/jcphuff-e2k.c)
+
+set_source_files_properties(${SIMD_SOURCES} PROPERTIES
+  COMPILE_FLAGS -msse4.1)
+
+set(SIMD_SOURCES ${SIMD_SOURCES} e2k/jsimd.c)
+
+add_library(simd OBJECT ${SIMD_SOURCES})
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
 
 ###############################################################################
 # None
diff --git a/simd/e2k/jccolext-e2k.c b/simd/e2k/jccolext-e2k.c
new file mode 100644
index 0000000..49abdb4
--- /dev/null
+++ b/simd/e2k/jccolext-e2k.c
@@ -0,0 +1,213 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-e2k.c */
+
+void rgbn_ycc_convert(JDIMENSION img_width, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows, int shuf_idx)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  unsigned char __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16];
+
+  __m128i pb_zero = _mm_setzero_si128();
+  __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]);
+#if PIXELSIZE == 4
+  __m128i rgb3 = pb_zero;
+#else
+  __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16);
+#endif
+  __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero,
+    rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __m128i y, yl, yh, y0, y1, y2, y3;
+  __m128i cb, cr, crl, crh, cbl, cbh;
+  __m128i cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+  /* Constants */
+  __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)),
+    pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)),
+    pw_mf016_mf033 = _mm_setr_epi16(__4X2(-F_0_168, -F_0_331)),
+    pw_mf008_mf041 = _mm_setr_epi16(__4X2(-F_0_081, -F_0_418)),
+    pw_mf050_f000 = _mm_setr_epi16(__4X2(-F_0_500, 0)),
+    pd_onehalf = _mm_set1_epi32(ONE_HALF),
+    pd_onehalfm1_cj = _mm_set1_epi32(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS));
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src)
+#endif
+
+  if (img_width > 0)
+  while (--num_rows >= 0) {
+    int num_cols;
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    if (img_width >= 16) {
+#ifdef NEED_ALIGN8
+      ALIGN8_START(inptr, src)
+      inptr += (img_width & -16) * PIXELSIZE;
+#endif
+
+      PRAGMA_E2K("ivdep")
+      for (num_cols = img_width; num_cols >= 16; num_cols -= 16,
+           outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+#ifdef NEED_ALIGN8
+        ALIGN8_READ16(rgb0, src, 0)
+        ALIGN8_READ16(rgb1, src, 1)
+        ALIGN8_READ16(rgb2, src, 2)
+#if PIXELSIZE == 4
+        ALIGN8_READ16(rgb3, src, 3)
+#endif
+        src_ptr += PIXELSIZE * 2;
+#else
+        rgb0 = VEC_LD(inptr);
+        rgb1 = VEC_LD(inptr + 16);
+        rgb2 = VEC_LD(inptr + 32);
+#if PIXELSIZE == 4
+        rgb3 = VEC_LD(inptr + 48);
+#endif
+        inptr += PIXELSIZE * 16;
+#endif
+        RGB_SHUFFLE
+        CALC_Y(outptr0)
+        CALC_CC(outptr1, outptr2)
+      }
+    }
+
+    num_cols = img_width & 15;
+    if (num_cols) {
+      int i;
+      memcpy(tmpbuf, inptr, num_cols * PIXELSIZE);
+      rgb0 = VEC_LD(tmpbuf);
+      rgb1 = VEC_LD(tmpbuf + 16);
+      rgb2 = VEC_LD(tmpbuf + 32);
+#if PIXELSIZE == 4
+      rgb3 = VEC_LD(tmpbuf + 48);
+#endif
+      RGB_SHUFFLE
+      CALC_Y(tmpbuf)
+      CALC_CC(tmpbuf + 16, tmpbuf + 32)
+
+      for (i = 0; i < num_cols; i++) {
+        outptr0[i] = tmpbuf[i];
+        outptr1[i] = tmpbuf[i + 16];
+        outptr2[i] = tmpbuf[i + 32];
+      }
+    }
+  }
+}
+
+void rgbn_gray_convert(JDIMENSION img_width, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows, int shuf_idx)
+{
+  JSAMPROW inptr, outptr;
+  uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16];
+
+  __m128i pb_zero = _mm_setzero_si128();
+  __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]);
+#if PIXELSIZE == 4
+  __m128i rgb3 = pb_zero;
+#else
+  __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16);
+#endif
+  __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero,
+    rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+  __m128i y, yl, yh, y0, y1, y2, y3;
+
+  /* Constants */
+  __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)),
+    pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)),
+    pd_onehalf = _mm_set1_epi32(ONE_HALF);
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src)
+#endif
+
+  if (img_width > 0)
+  while (--num_rows >= 0) {
+    int num_cols;
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    if (img_width >= 16) {
+#ifdef NEED_ALIGN8
+      ALIGN8_START(inptr, src)
+      inptr += (img_width & -16) * PIXELSIZE;
+#endif
+
+      PRAGMA_E2K("ivdep")
+      for (num_cols = img_width; num_cols >= 16; num_cols -= 16,
+           outptr += 16) {
+#ifdef NEED_ALIGN8
+        ALIGN8_READ16(rgb0, src, 0)
+        ALIGN8_READ16(rgb1, src, 1)
+        ALIGN8_READ16(rgb2, src, 2)
+#if PIXELSIZE == 4
+        ALIGN8_READ16(rgb3, src, 3)
+#endif
+        src_ptr += PIXELSIZE * 2;
+#else
+        rgb0 = VEC_LD(inptr);
+        rgb1 = VEC_LD(inptr + 16);
+        rgb2 = VEC_LD(inptr + 32);
+#if PIXELSIZE == 4
+        rgb3 = VEC_LD(inptr + 48);
+#endif
+        inptr += PIXELSIZE * 16;
+#endif
+        RGB_SHUFFLE
+        CALC_Y(outptr)
+      }
+    }
+
+    num_cols = img_width & 15;
+    if (num_cols) {
+      int i;
+      memcpy(tmpbuf, inptr, num_cols * PIXELSIZE);
+      rgb0 = VEC_LD(tmpbuf);
+      rgb1 = VEC_LD(tmpbuf + 16);
+      rgb2 = VEC_LD(tmpbuf + 32);
+#if PIXELSIZE == 4
+      rgb3 = VEC_LD(tmpbuf + 48);
+#endif
+      RGB_SHUFFLE
+      CALC_Y(tmpbuf)
+
+      for (i = 0; i < num_cols; i++) {
+        outptr[i] = tmpbuf[i];
+      }
+    }
+  }
+}
+
+#undef RGB_SHUFFLE
+#undef PIXELSIZE
+#undef rgbn_ycc_convert
+#undef rgbn_gray_convert
+
diff --git a/simd/e2k/jccolor-e2k.c b/simd/e2k/jccolor-e2k.c
new file mode 100644
index 0000000..0af2626
--- /dev/null
+++ b/simd/e2k/jccolor-e2k.c
@@ -0,0 +1,163 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_e2k.h"
+
+#define F_0_081  5329                 /* FIX(0.08131) */
+#define F_0_114  7471                 /* FIX(0.11400) */
+#define F_0_168  11059                /* FIX(0.16874) */
+#define F_0_250  16384                /* FIX(0.25000) */
+#define F_0_299  19595                /* FIX(0.29900) */
+#define F_0_331  21709                /* FIX(0.33126) */
+#define F_0_418  27439                /* FIX(0.41869) */
+#define F_0_500  32768                /* FIX(0.50000) */
+#define F_0_587  38470                /* FIX(0.58700) */
+#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+#define F_0_413  (65536 - F_0_587)    /* FIX(1.00000) - FIX(0.58700) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+#define RGBG_INDEX_(name, color, i, x) \
+  name##_##color + i * name##_PIXELSIZE + x, \
+  name##_GREEN + i * name##_PIXELSIZE + x
+#define RGBG_INDEX(name, x) \
+  RGBG_INDEX_(name, RED, 0, x), RGBG_INDEX_(name, RED, 1, x), \
+  RGBG_INDEX_(name, RED, 2, x), RGBG_INDEX_(name, RED, 3, x), \
+  RGBG_INDEX_(name, BLUE, 0, x), RGBG_INDEX_(name, BLUE, 1, x), \
+  RGBG_INDEX_(name, BLUE, 2, x), RGBG_INDEX_(name, BLUE, 3, x)
+
+static const uint8_t __attribute__((aligned(16)))
+rgb_ycc_shuf_const[7][32] = {
+  { RGBG_INDEX(RGB, 0), RGBG_INDEX(RGB, 4) },
+  { RGBG_INDEX(EXT_RGB, 0), RGBG_INDEX(EXT_RGB, 4) },
+  { RGBG_INDEX(EXT_RGBX, 0), RGBG_INDEX(EXT_RGBX, 4) },
+  { RGBG_INDEX(EXT_BGR, 0), RGBG_INDEX(EXT_BGR, 4) },
+  { RGBG_INDEX(EXT_BGRX, 0), RGBG_INDEX(EXT_BGRX, 4) },
+  { RGBG_INDEX(EXT_XBGR, 0), RGBG_INDEX(EXT_XBGR, 4) },
+  { RGBG_INDEX(EXT_XRGB, 0), RGBG_INDEX(EXT_XRGB, 4) }
+};
+
+  /* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 
+   * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+   * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+   * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+   *
+   * rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+   * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+   * ...
+   */
+
+  /* (Original)
+   * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+   * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+   * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+   *
+   * (This implementation)
+   * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+   * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+   * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+   */
+
+#define CALC_Y(outptr0) \
+  rg0 = _mm_unpacklo_epi8(rgbg0, pb_zero); \
+  bg0 = _mm_unpackhi_epi8(rgbg0, pb_zero); \
+  rg1 = _mm_unpacklo_epi8(rgbg1, pb_zero); \
+  bg1 = _mm_unpackhi_epi8(rgbg1, pb_zero); \
+  rg2 = _mm_unpacklo_epi8(rgbg2, pb_zero); \
+  bg2 = _mm_unpackhi_epi8(rgbg2, pb_zero); \
+  rg3 = _mm_unpacklo_epi8(rgbg3, pb_zero); \
+  bg3 = _mm_unpackhi_epi8(rgbg3, pb_zero); \
+  \
+  /* Calculate Y values */ \
+  y0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_f0299_f0337), pd_onehalf); \
+  y1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_f0299_f0337), pd_onehalf); \
+  y2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_f0299_f0337), pd_onehalf); \
+  y3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_f0299_f0337), pd_onehalf); \
+  y0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_f0114_f0250), y0); \
+  y1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_f0114_f0250), y1); \
+  y2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_f0114_f0250), y2); \
+  y3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_f0114_f0250), y3); \
+  \
+  yl = _mm_packhi_epi32(y0, y1); \
+  yh = _mm_packhi_epi32(y2, y3); \
+  y = _mm_packus_epi16(yl, yh); \
+  VEC_ST(outptr0, y);
+
+#define CALC_CC(outptr1, outptr2) \
+  /* Calculate Cb values */ \
+  cb0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_mf016_mf033), pd_onehalfm1_cj); \
+  cb1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_mf016_mf033), pd_onehalfm1_cj); \
+  cb2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_mf016_mf033), pd_onehalfm1_cj); \
+  cb3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_mf016_mf033), pd_onehalfm1_cj); \
+  cb0 = _mm_sub_epi32(cb0, _mm_madd_epi16(bg0, pw_mf050_f000)); \
+  cb1 = _mm_sub_epi32(cb1, _mm_madd_epi16(bg1, pw_mf050_f000)); \
+  cb2 = _mm_sub_epi32(cb2, _mm_madd_epi16(bg2, pw_mf050_f000)); \
+  cb3 = _mm_sub_epi32(cb3, _mm_madd_epi16(bg3, pw_mf050_f000)); \
+  \
+  cbl = _mm_packhi_epi32(cb0, cb1); \
+  cbh = _mm_packhi_epi32(cb2, cb3); \
+  cb = _mm_packus_epi16(cbl, cbh); \
+  VEC_ST(outptr1, cb); \
+  \
+  /* Calculate Cr values */ \
+  cr0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_mf008_mf041), pd_onehalfm1_cj); \
+  cr1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_mf008_mf041), pd_onehalfm1_cj); \
+  cr2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_mf008_mf041), pd_onehalfm1_cj); \
+  cr3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_mf008_mf041), pd_onehalfm1_cj); \
+  cr0 = _mm_sub_epi32(cr0, _mm_madd_epi16(rg0, pw_mf050_f000)); \
+  cr1 = _mm_sub_epi32(cr1, _mm_madd_epi16(rg1, pw_mf050_f000)); \
+  cr2 = _mm_sub_epi32(cr2, _mm_madd_epi16(rg2, pw_mf050_f000)); \
+  cr3 = _mm_sub_epi32(cr3, _mm_madd_epi16(rg3, pw_mf050_f000)); \
+  \
+  crl = _mm_packhi_epi32(cr0, cr1); \
+  crh = _mm_packhi_epi32(cr2, cr3); \
+  cr = _mm_packus_epi16(crl, crh); \
+  VEC_ST(outptr2, cr);
+
+
+#define PIXELSIZE 3
+#define RGB_SHUFFLE \
+  rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \
+  rgbg1 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb1, rgb0), pb_shuf4); \
+  rgbg2 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb2, rgb1), pb_shuf0); \
+  rgbg3 = _mm_shuffle_epi8(rgb2, pb_shuf4);
+
+#define rgbn_ycc_convert  jsimd_rgb3_ycc_convert_e2k
+#define rgbn_gray_convert jsimd_rgb3_gray_convert_e2k
+#include "jccolext-e2k.c"
+
+
+#define PIXELSIZE 4
+#define RGB_SHUFFLE \
+  rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \
+  rgbg1 = _mm_shuffle_epi8(rgb1, pb_shuf0); \
+  rgbg2 = _mm_shuffle_epi8(rgb2, pb_shuf0); \
+  rgbg3 = _mm_shuffle_epi8(rgb3, pb_shuf0);
+
+#define rgbn_ycc_convert  jsimd_rgb4_ycc_convert_e2k
+#define rgbn_gray_convert jsimd_rgb4_gray_convert_e2k
+#include "jccolext-e2k.c"
+
diff --git a/simd/e2k/jchuff-e2k.c b/simd/e2k/jchuff-e2k.c
new file mode 100644
index 0000000..ec4329e
--- /dev/null
+++ b/simd/e2k/jchuff-e2k.c
@@ -0,0 +1,307 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2022, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+/* Encode a single block's worth of coefficients */
+
+#include "jsimd_e2k.h"
+
+#if __SIZEOF_SIZE_T__ != 8
+#error
+#endif
+
+typedef unsigned long long bit_buf_type;
+#define BIT_BUF_SIZE  64
+
+typedef struct {
+  bit_buf_type put_buffer;              /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
+} working_state;
+
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    *(uint64_t*)buffer = __builtin_bswap64(put_buffer); \
+    buffer += 8; \
+  } \
+}
+
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size) { \
+  /* temp &= (((JLONG)1) << nbits) - 1; */ \
+  /* temp |= code << nbits; */ \
+  temp = __builtin_e2k_insfd(code, __builtin_e2k_insfd(nbits, 6 * 63 + 64, -nbits), temp); \
+  nbits += size; \
+  PUT_BITS(temp, nbits) \
+}
+
+#define KLOOP_PREPARE(mask, i) \
+  t0 = _mm_cmpeq_epi8(_mm_packs_epi16(v0, v1), zero); \
+  t1 = _mm_cmpeq_epi8(_mm_packs_epi16(v2, v3), zero); \
+  mask = (uint32_t)(_mm_movemask_epi8(t0) | _mm_movemask_epi8(t1) << 16); \
+  t0 = _mm_add_epi16(_mm_srai_epi16(v0, 15), v0); \
+  t1 = _mm_add_epi16(_mm_srai_epi16(v1, 15), v1); \
+  t2 = _mm_add_epi16(_mm_srai_epi16(v2, 15), v2); \
+  t3 = _mm_add_epi16(_mm_srai_epi16(v3, 15), v3); \
+  v0 = _mm_abs_epi16(v0); \
+  v1 = _mm_abs_epi16(v1); \
+  v2 = _mm_abs_epi16(v2); \
+  v3 = _mm_abs_epi16(v3); \
+  VEC_ST(block_nbits + i,      v0); \
+  VEC_ST(block_nbits + i +  8, v1); \
+  VEC_ST(block_nbits + i + 16, v2); \
+  VEC_ST(block_nbits + i + 24, v3); \
+  VEC_ST(block_diff + i,      t0); \
+  VEC_ST(block_diff + i +  8, t1); \
+  VEC_ST(block_diff + i + 16, t2); \
+  VEC_ST(block_diff + i + 24, t3);
+
+#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \
+  a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1)
+#define VEC_COMBINE(h0, h1) _mm_unpacklo_epi64( \
+  _mm_movpi64_epi64(h0), _mm_movpi64_epi64(h1))
+#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \
+  _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c)))
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block_e2k(void *state, JOCTET *buffer,
+                                JCOEFPTR block, int last_dc_val,
+                                c_derived_tbl *dctbl, c_derived_tbl *actbl) {
+  uint64_t temp, nbits;
+  uint64_t i, r, code, size;
+  uint64_t code_0xf0 = actbl->ehufco[0xf0];
+  uint64_t size_0xf0 = actbl->ehufsi[0xf0];
+
+  working_state *state_ptr = (working_state*)state;
+  bit_buf_type put_buffer = state_ptr->cur.put_buffer;
+  int64_t free_bits = state_ptr->cur.free_bits;
+
+  __m128i zero = _mm_setzero_si128();
+  __m128i v0, v1, v2, v3, t0, t1, t2, t3;
+  int64_t mask, mask1;
+  uint16_t __attribute__((aligned(16))) block_nbits[DCTSIZE2];
+  int16_t __attribute__((aligned(16))) block_diff[DCTSIZE2];
+
+#if 1 /* faster this way */
+  {
+    __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h;
+    __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h;
+    __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6);
+
+    d0l = *(__m64*)(block + 8 * 0); d0h = *(__m64*)(block + 8 * 0 + 4); //  0  4
+    d1l = *(__m64*)(block + 8 * 1); d1h = *(__m64*)(block + 8 * 1 + 4); //  8 12
+    d2l = *(__m64*)(block + 8 * 2); d2h = *(__m64*)(block + 8 * 2 + 4); // 16 20
+    d3l = *(__m64*)(block + 8 * 3); d3h = *(__m64*)(block + 8 * 3 + 4); // 24 28
+    d4l = *(__m64*)(block + 8 * 4); d4h = *(__m64*)(block + 8 * 4 + 4); // 32 36
+    d5l = *(__m64*)(block + 8 * 5); d5h = *(__m64*)(block + 8 * 5 + 4); // 40 44
+    d6l = *(__m64*)(block + 8 * 6); d6h = *(__m64*)(block + 8 * 6 + 4); // 48 52
+    d7l = *(__m64*)(block + 8 * 7); d7h = *(__m64*)(block + 8 * 7 + 4); // 56 60
+
+    // d0l[0] d0l[1] d1l[0] d2l[0]
+    // d1l[1] d0l[2] d0l[3] d1l[2]
+    h0 = _mm_unpacklo_pi16(d1l, d2l);
+    r0 = _mm_unpacklo_pi32(d0l, h0);
+    r1 = _mm_shuffle2_pi8(d1l, d0l, SHUF16X4(1, 6, 7, 2));
+    r0 = _mm_sub_pi16(r0, _mm_cvtsi64_m64((uint16_t)last_dc_val));
+    v0 = VEC_COMBINE(r0, r1);
+
+    // d2l[1] d3l[0] d4l[0] d3l[1]
+    // d2l[2] d1l[3] d0h[0] d0h[1]
+    h0 = _mm_srli_si64(_mm_unpacklo_pi32(d2l, d4l), 16);
+    h2 = INSFI_M64(d1l, 0, d2l, 48);
+    r0 = _mm_unpacklo_pi16(h0, d3l);
+    r1 = _mm_alignr_pi8(d0h, h2, 4);
+    v1 = VEC_COMBINE(r0, r1);
+
+    // d1h[0] d2l[3] d3l[2] d4l[1]
+    // d5l[0] d6l[0] d5l[1] d4l[2]
+    h0 = INSFI_M64(d2l, 32, d1h, 16);
+    h1 = INSFI_M64(d4l, -32, d3l, 48);
+    h2 = INSFI_M64(d4l, 16, d6l, 16);
+    r0 = INSFI_M64(h1, 0, h0, 32);
+    r1 = _mm_unpacklo_pi16(d5l, h2);
+    v2 = VEC_COMBINE(r0, r1);
+
+    // d3l[3] d2h[0] d1h[1] d0h[2]
+    // d0h[3] d1h[2] d2h[1] d3h[0]
+    h0 = _mm_alignr_pi8(d2h, d3l, 6);
+    h1 = INSFI_M64(d0h, 0, d1h, 32);
+    h2 = _mm_unpackhi_pi32(d0h, d1h);
+    h3 = _mm_unpacklo_pi32(d2h, d3h);
+    r0 = INSFI_M64(h1, -16, h0, 32);
+    r1 = _mm_shuffle2_pi8(h2, h3, c1256);
+    v3 = VEC_COMBINE(r0, r1);
+
+    KLOOP_PREPARE(mask, 0)
+
+    // d4l[3] d5l[2] d6l[1] d7l[0]
+    // d7l[1] d6l[2] d5l[3] d4h[0]
+    h0 = _mm_unpackhi_pi32(d4l, d5l);
+    h1 = _mm_unpacklo_pi32(d6l, d7l);
+    h2 = INSFI_M64(d6l, 0, d7l, 32);
+    h2 = INSFI_M64(d5l, 0, h2, 48);
+    r0 = _mm_shuffle2_pi8(h0, h1, c1256);
+    r1 = _mm_alignr_pi8(d4h, h2, 2);
+    v0 = VEC_COMBINE(r0, r1);
+
+    // d3h[1] d2h[2] d1h[3] d2h[3]
+    // d3h[2] d4h[1] d5h[0] d6l[3]
+    h0 = _mm_slli_si64(INSFI_M64(d1h, 16, d3h, 32), 16);
+    h2 = INSFI_M64(d4h, -32, d3h, 48);
+    h3 = INSFI_M64(d6l, 32, d5h, 16);
+    r0 = _mm_unpackhi_pi16(h0, d2h);
+    r1 = _mm_alignr_pi8(h3, h2, 4);
+    v1 = VEC_COMBINE(r0, r1);
+
+    // d7l[2] d7l[3] d6h[0] d5h[1]
+    // d4h[2] d3h[3] d4h[3] d5h[2]
+    h0 = INSFI_M64(d5h, 0, d6h, 16);
+    h2 = _mm_slli_si64(_mm_unpackhi_pi32(d3h, d5h), 16);
+    r0 = _mm_alignr_pi8(h0, d7l, 4);
+    r1 = _mm_unpackhi_pi16(d4h, h2);
+    v2 = VEC_COMBINE(r0, r1);
+
+    // d6h[1] d7h[0] d7h[1] d6h[2]
+    // d5h[3] d6h[3] d7h[2] d7h[3]
+    h0 = INSFI_M64(d6h, -16, d7h, 32);
+    h2 = _mm_unpackhi_pi16(d5h, d6h);
+    r0 = _mm_shuffle_pi16(h0, 0xd2);
+    r1 = _mm_unpackhi_pi32(h2, d7h);
+    v3 = VEC_COMBINE(r0, r1);
+  }
+#else
+  v0 = _mm_setr_epi16(
+      block[0] - last_dc_val, block[1], block[8], block[16],
+      block[9], block[2], block[3], block[10]);
+  v1 = _mm_setr_epi16(
+      block[17], block[24], block[32], block[25],
+      block[18], block[11], block[4], block[5]);
+  v2 = _mm_setr_epi16(
+      block[12], block[19], block[26], block[33],
+      block[40], block[48], block[41], block[34]);
+  v3 = _mm_setr_epi16(
+      block[27], block[20], block[13], block[6],
+      block[7], block[14], block[21], block[28]);
+
+  KLOOP_PREPARE(mask, 0)
+
+  v0 = _mm_setr_epi16(
+      block[35], block[42], block[49], block[56],
+      block[57], block[50], block[43], block[36]);
+  v1 = _mm_setr_epi16(
+      block[29], block[22], block[15], block[23],
+      block[30], block[37], block[44], block[51]);
+  v2 = _mm_setr_epi16(
+      block[58], block[59], block[52], block[45],
+      block[38], block[31], block[39], block[46]);
+  v3 = _mm_setr_epi16(
+      block[53], block[60], block[61], block[54],
+      block[47], block[55], block[62], block[63]);
+#endif
+
+  KLOOP_PREPARE(mask1, 32)
+  mask |= mask1 << 32;
+  mask = ~mask;
+
+  /* Encode the DC coefficient difference per section F.1.2.1 */
+
+  nbits = block_nbits[0];
+  temp = block_diff[0];
+  nbits = nbits ? 32 - __builtin_clz(nbits) : 0;
+
+  /* Emit the Huffman-coded symbol for the number of bits */
+  code = dctbl->ehufco[nbits];
+  size = dctbl->ehufsi[nbits];
+  PUT_CODE(code, size)
+
+  /* Encode the AC coefficients per section F.1.2.2 */
+
+  /* e2k doesn't have a tzcnt instruction */
+  mask = __builtin_e2k_bitrevd(mask) << 1;
+
+  for (i = 1; mask; i++, mask <<= 1) {
+    r = __builtin_clzll(mask);
+    mask <<= r;
+    i += r;
+    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
+    while (r > 15) {
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    nbits = block_nbits[i];
+    temp = block_diff[i];
+    nbits = 32 - __builtin_clz(nbits);
+    /* Emit Huffman symbol for run length / number of bits */
+    /* r = r << 4 | nbits; */
+    r = __builtin_e2k_insfd(r, 4 * 63 + 64, nbits);
+    code = actbl->ehufco[r];
+    size = actbl->ehufsi[r];
+    PUT_CODE(code, size)
+  }
+
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+  return buffer;
+}
diff --git a/simd/e2k/jcphuff-e2k.c b/simd/e2k/jcphuff-e2k.c
new file mode 100644
index 0000000..f69afeb
--- /dev/null
+++ b/simd/e2k/jcphuff-e2k.c
@@ -0,0 +1,145 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2022, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_e2k.h"
+
+#define X(i) coefs[i] = block[jpeg_natural_order_start[i]];
+#define Y(i) coefs[i] = i < rem ? block[jpeg_natural_order_start[i]] : 0;
+
+#define LOOP \
+  for (i = 0; i < Sl >> 4; i++) { \
+    X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \
+    X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) \
+    BLOCK16 \
+    jpeg_natural_order_start += 16; \
+  } \
+  rem = Sl & 15; \
+  if (Sl & 8) { \
+    X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \
+    Y(8) Y(9) Y(10) Y(11) Y(12) Y(13) Y(14) \
+    coefs[15] = 0; \
+    BLOCK16 \
+  } else if (rem > 0) { \
+    Y(0) Y(1) Y(2) Y(3) Y(4) Y(5) Y(6) Y(7) \
+    BLOCK8 \
+  }
+
+void jsimd_encode_mcu_AC_first_prepare_e2k
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits)
+{
+  JCOEF *diff = values + DCTSIZE2;
+  int16_t __attribute__((aligned(16))) coefs[16];
+  __m128i v0, v1, v2, v3;
+  __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al);
+  int i, rem;
+
+#define BLOCK16 \
+  v0 = _mm_load_si128((__m128i*)coefs); \
+  v1 = _mm_load_si128((__m128i*)coefs + 1); \
+  v2 = _mm_srai_epi16(v0, 15); \
+  v3 = _mm_srai_epi16(v1, 15); \
+  v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \
+  v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \
+  v2 = _mm_xor_si128(v0, v2); \
+  v3 = _mm_xor_si128(v1, v3); \
+  _mm_store_si128((__m128i*)values, v0); \
+  _mm_store_si128((__m128i*)values + 1, v1); \
+  _mm_store_si128((__m128i*)diff, v2); \
+  _mm_store_si128((__m128i*)diff + 1, v3); \
+  v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \
+  ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \
+  values += 16; diff += 16;
+
+#define BLOCK8 \
+  v0 = _mm_load_si128((__m128i*)coefs); \
+  v2 = _mm_srai_epi16(v0, 15); \
+  v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \
+  v2 = _mm_xor_si128(v0, v2); \
+  _mm_store_si128((__m128i*)values, v0); \
+  _mm_store_si128((__m128i*)diff, v2); \
+  v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \
+  ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \
+  values += 8; diff += 8;
+
+  ((uint64_t*)zerobits)[0] = 0;
+  LOOP
+#undef BLOCK16
+#undef BLOCK8
+
+  for (i = (64 - Sl) >> 3; i; i--) {
+    _mm_store_si128((__m128i*)values, c0);
+    _mm_store_si128((__m128i*)diff, c0);
+    values += 8; diff += 8;
+  }
+}
+
+int jsimd_encode_mcu_AC_refine_prepare_e2k
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits)
+{
+  union { uint64_t q; uint16_t w[4]; } mask1 = { 0 };
+  int16_t __attribute__((aligned(16))) coefs[16];
+  __m128i v0, v1, v2, c1 = _mm_set1_epi8(1);
+  __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al);
+  int i, rem;
+
+#define BLOCK16 \
+  v0 = _mm_load_si128((__m128i*)coefs); \
+  v1 = _mm_load_si128((__m128i*)coefs + 1); \
+  v2 = _mm_packs_epi16(v0, v1); \
+  ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \
+  v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \
+  v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \
+  v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \
+  ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \
+  _mm_store_si128((__m128i*)absvalues, v0); \
+  _mm_store_si128((__m128i*)absvalues + 1, v1); \
+  v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c1); \
+  mask1.w[i] = _mm_movemask_epi8(v2); \
+  absvalues += 16;
+
+#define BLOCK8 \
+  v0 = _mm_load_si128((__m128i*)coefs); \
+  v2 = _mm_packs_epi16(v0, c0); \
+  ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \
+  v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \
+  v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \
+  ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \
+  _mm_store_si128((__m128i*)absvalues, v0); \
+  v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c1); \
+  mask1.w[i] = _mm_movemask_epi8(v2); \
+  absvalues += 8;
+
+  ((uint64_t*)bits)[0] = 0; /* zero */
+  ((uint64_t*)bits)[1] = 0; /* sign */
+  LOOP
+#undef BLOCK16
+#undef BLOCK8
+
+  for (i = (64 - Sl) >> 3; i; i--) {
+    _mm_store_si128((__m128i*)absvalues, c0);
+    absvalues += 8;
+  }
+
+  return 63 - __builtin_clzll(mask1.q | 1);
+}
diff --git a/simd/e2k/jcsample-e2k.c b/simd/e2k/jcsample-e2k.c
new file mode 100644
index 0000000..cac8897
--- /dev/null
+++ b/simd/e2k/jcsample-e2k.c
@@ -0,0 +1,203 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_e2k.h"
+#include "jcsample.h"
+
+void jsimd_h2v1_downsample_e2k(JDIMENSION image_width,
+                               int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data,
+                               JSAMPARRAY output_data)
+{
+  int outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow;
+  JSAMPROW inptr, outptr;
+
+  __m128i this0, next0, out;
+  __m128i this0e, this0o, next0e, next0o, outl, outh;
+
+  /* Constants */
+  __m128i pw_bias = _mm_set1_epi32(1 << 16),
+    even_mask = _mm_set1_epi16(255);
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src)
+#endif
+
+  expand_right_edge(input_data, max_v_samp_factor,
+                    image_width, output_cols * 2);
+
+  if (output_cols > 0)
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+#ifdef NEED_ALIGN8
+    ALIGN8_START(inptr, src)
+#endif
+    PRAGMA_E2K("ivdep")
+    for (outcol = output_cols; outcol > 8;
+         outcol -= 16, outptr += 16) {
+
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(this0, src, 0)
+      ALIGN8_READ16(next0, src, 1)
+      src_ptr += 4;
+#else
+      this0 = VEC_LD(inptr);
+      next0 = VEC_LD(inptr + 16);
+      inptr += 32;
+#endif
+      this0e = _mm_and_si128(this0, even_mask);
+      this0o = _mm_srli_epi16(this0, 8);
+      outl = _mm_add_epi16(this0e, this0o);
+      outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1);
+      next0e = _mm_and_si128(next0, even_mask);
+      next0o = _mm_srli_epi16(next0, 8);
+      outh = _mm_add_epi16(next0e, next0o);
+      outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 1);
+
+      out = _mm_packus_epi16(outl, outh);
+      VEC_ST(outptr, out);
+    }
+    if (outcol > 0) {
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(this0, src, 0)
+#else
+      this0 = VEC_LD(inptr);
+#endif
+      this0e = _mm_and_si128(this0, even_mask);
+      this0o = _mm_srli_epi16(this0, 8);
+      outl = _mm_add_epi16(this0e, this0o);
+      outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1);
+
+      out = _mm_packus_epi16(outl, outl);
+      VEC_STL(outptr, out);
+    }
+  }
+}
+
+
+void jsimd_h2v2_downsample_e2k(JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow;
+  JSAMPROW inptr0, inptr1, outptr;
+
+  __m128i this0, next0, this1, next1, out;
+  __m128i this0e, this0o, next0e, next0o, this1e, this1o,
+    next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+  /* Constants */
+  __m128i pw_bias = _mm_set1_epi32(1 | 2 << 16),
+    even_mask = _mm_set1_epi16(255);
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src0)
+  ALIGN8_VARS(src1)
+#endif
+
+  expand_right_edge(input_data, max_v_samp_factor,
+                    image_width, output_cols * 2);
+
+  if (output_cols > 0)
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    inptr0 = input_data[outrow * 2];
+    inptr1 = input_data[outrow * 2 + 1];
+    outptr = output_data[outrow];
+
+#ifdef NEED_ALIGN8
+    ALIGN8_START(inptr0, src0)
+    ALIGN8_START(inptr1, src1)
+#endif
+    PRAGMA_E2K("ivdep")
+    for (outcol = output_cols; outcol > 8;
+         outcol -= 16, outptr += 16) {
+
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(this0, src0, 0) src0_ptr += 2;
+      ALIGN8_READ16(this1, src1, 0) src1_ptr += 2;
+#else
+      this0 = VEC_LD(inptr0); inptr0 += 16;
+      this1 = VEC_LD(inptr1); inptr1 += 16;
+#endif
+      this0e = _mm_and_si128(this0, even_mask);
+      this1e = _mm_and_si128(this1, even_mask);
+      this0o = _mm_srli_epi16(this0, 8);
+      this1o = _mm_srli_epi16(this1, 8);
+      out0l = _mm_add_epi16(this0e, this0o);
+      out1l = _mm_add_epi16(this1e, this1o);
+
+      outl = _mm_add_epi16(out0l, out1l);
+      outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2);
+
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(next0, src0, 0) src0_ptr += 2;
+      ALIGN8_READ16(next1, src1, 0) src1_ptr += 2;
+#else
+      next0 = VEC_LD(inptr0); inptr0 += 16;
+      next1 = VEC_LD(inptr1); inptr1 += 16;
+#endif
+      next0e = _mm_and_si128(next0, even_mask);
+      next1e = _mm_and_si128(next1, even_mask);
+      next0o = _mm_srli_epi16(next0, 8);
+      next1o = _mm_srli_epi16(next1, 8);
+      out0h = _mm_add_epi16(next0e, next0o);
+      out1h = _mm_add_epi16(next1e, next1o);
+
+      outh = _mm_add_epi16(out0h, out1h);
+      outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 2);
+
+      out = _mm_packus_epi16(outl, outh);
+      VEC_ST(outptr, out);
+    }
+    if (outcol > 0) {
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(this0, src0, 0)
+      ALIGN8_READ16(this1, src1, 0)
+#else
+      this0 = VEC_LD(inptr0);
+      this1 = VEC_LD(inptr1);
+#endif
+      this0e = _mm_and_si128(this0, even_mask);
+      this1e = _mm_and_si128(this1, even_mask);
+      this0o = _mm_srli_epi16(this0, 8);
+      this1o = _mm_srli_epi16(this1, 8);
+      out0l = _mm_add_epi16(this0e, this0o);
+      out1l = _mm_add_epi16(this1e, this1o);
+
+      outl = _mm_add_epi16(out0l, out1l);
+      outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2);
+
+      out = _mm_packus_epi16(outl, outl);
+      VEC_STL(outptr, out);
+    }
+  }
+}
diff --git a/simd/e2k/jcsample.h b/simd/e2k/jcsample.h
new file mode 100644
index 0000000..2ac4816
--- /dev/null
+++ b/simd/e2k/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+{
+  register JSAMPROW ptr;
+  register JSAMPLE pixval;
+  register int count;
+  int row;
+  int numcols = (int)(output_cols - input_cols);
+
+  if (numcols > 0) {
+    for (row = 0; row < num_rows; row++) {
+      ptr = image_data[row] + input_cols;
+      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      for (count = numcols; count > 0; count--)
+        *ptr++ = pixval;
+    }
+  }
+}
diff --git a/simd/e2k/jdcolext-e2k.c b/simd/e2k/jdcolext-e2k.c
new file mode 100644
index 0000000..4f12aef
--- /dev/null
+++ b/simd/e2k/jdcolext-e2k.c
@@ -0,0 +1,258 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-e2k.c */
+
+void ycc_rgbn_convert(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows, int shuf_idx)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16];
+
+  __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr;
+  __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh,
+    crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+  __m128i g0, g1, g2, g3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1),
+    pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1),
+    pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)),
+    pb_255 = _mm_set1_epi8(-1),
+    pw_cj = _mm_set1_epi16(CENTERJSAMPLE),
+    pd_onehalf = _mm_set1_epi32(ONE_HALF),
+    pb_zero = _mm_setzero_si128();
+  RGB_SHUFFLE_INIT
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src0)
+  ALIGN8_VARS(src1)
+  ALIGN8_VARS(src2)
+#endif
+
+  if (out_width > 0)
+  while (--num_rows >= 0) {
+    int num_cols;
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    if (out_width >= 16) {
+#ifdef NEED_ALIGN8
+      ALIGN8_START(inptr0, src0)
+      ALIGN8_START(inptr1, src1)
+      ALIGN8_START(inptr2, src2)
+      inptr0 += out_width & -16;
+      inptr1 += out_width & -16;
+      inptr2 += out_width & -16;
+#endif
+      PRAGMA_E2K("ivdep")
+      for (num_cols = out_width; num_cols >= 16;
+           num_cols -= 16, outptr += PIXELSIZE * 16) {
+#ifdef NEED_ALIGN8
+        ALIGN8_READ16(y, src0, 0) src0_ptr += 2;
+        ALIGN8_READ16(cb, src1, 0) src1_ptr += 2;
+        ALIGN8_READ16(cr, src2, 0) src2_ptr += 2;
+#else
+        y = VEC_LD(inptr0); inptr0 += 16;
+        cb = VEC_LD(inptr1); inptr1 += 16;
+        cr = VEC_LD(inptr2); inptr2 += 16;
+#endif
+        CALC_RGB
+        RGB_SHUFFLE
+        VEC_ST(outptr, rgb0);
+        VEC_ST(outptr + 16, rgb1);
+        VEC_ST(outptr + 32, rgb2);
+#if PIXELSIZE == 4
+        VEC_ST(outptr + 48, rgb3);
+#endif
+      }
+    }
+
+    num_cols = out_width & 15;
+    if (num_cols) {
+      int i;
+      for (i = 0; i < num_cols; i++) {
+        tmpbuf[i] = inptr0[i];
+        tmpbuf[i + 16] = inptr1[i];
+        tmpbuf[i + 32] = inptr2[i];
+      }
+      y = VEC_LD(tmpbuf);
+      cb = VEC_LD(tmpbuf + 16);
+      cr = VEC_LD(tmpbuf + 32);
+      CALC_RGB
+      RGB_SHUFFLE
+      VEC_ST(tmpbuf, rgb0);
+      VEC_ST(tmpbuf + 16, rgb1);
+      VEC_ST(tmpbuf + 32, rgb2);
+#if PIXELSIZE == 4
+      VEC_ST(tmpbuf + 48, rgb3);
+#endif
+      memcpy(outptr, tmpbuf, num_cols * PIXELSIZE);
+    }
+  }
+}
+
+void ycc_rgbn_merged(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                     JDIMENSION in_row_group_ctr,
+                     JDIMENSION in_row_group_ctr_y,
+                     JSAMPARRAY output_buf, int shuf_idx)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols;
+  uint8_t __attribute__((aligned(16))) tmpbuf[4 * 16];
+
+  __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr;
+  __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh,
+    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
+    rl, rh, gl, gh, bl, bh;
+  __m128i g_y0, g_y1, g_y2, g_y3;
+
+  /* Constants
+   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
+   * high-order bits, not 16.
+   */
+  __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1),
+    pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1),
+    pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)),
+    pb_255 = _mm_set1_epi8(-1),
+    pw_cj = _mm_set1_epi16(CENTERJSAMPLE),
+    pd_onehalf = _mm_set1_epi32(ONE_HALF),
+    pb_zero = _mm_setzero_si128();
+  RGB_SHUFFLE_INIT
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src0)
+  ALIGN8_VARS(src1)
+  ALIGN8_VARS(src2)
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr_y];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  if (out_width >= 32) {
+#ifdef NEED_ALIGN8
+    ALIGN8_START(inptr0, src0)
+    ALIGN8_START(inptr1, src1)
+    ALIGN8_START(inptr2, src2)
+    inptr0 += out_width & -32;
+    inptr1 += (out_width & -32) >> 1;
+    inptr2 += (out_width & -32) >> 1;
+#endif
+    PRAGMA_E2K("ivdep")
+    for (num_cols = out_width; num_cols >= 32; num_cols -= 32) {
+
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(cb, src1, 0) src1_ptr += 2;
+      ALIGN8_READ16(cr, src2, 0) src2_ptr += 2;
+#else
+      cb = VEC_LD(inptr1); inptr1 += 16;
+      cr = VEC_LD(inptr2); inptr2 += 16;
+#endif
+      CALC_MERGED1
+
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(y, src0, 0) src0_ptr += 2;
+#else
+      y = VEC_LD(inptr0); inptr0 += 16;
+#endif
+      CALC_MERGED2(r_yl, g_yl, b_yl)
+      RGB_SHUFFLE
+      VEC_ST(outptr, rgb0);
+      VEC_ST(outptr + 16, rgb1);
+      VEC_ST(outptr + 32, rgb2);
+#if PIXELSIZE == 4
+      VEC_ST(outptr + 48, rgb3);
+#endif
+      outptr += PIXELSIZE * 16;
+
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(y, src0, 0) src0_ptr += 2;
+#else
+      y = VEC_LD(inptr0); inptr0 += 16;
+#endif
+      CALC_MERGED2(r_yh, g_yh, b_yh)
+      RGB_SHUFFLE
+      VEC_ST(outptr, rgb0);
+      VEC_ST(outptr + 16, rgb1);
+      VEC_ST(outptr + 32, rgb2);
+#if PIXELSIZE == 4
+      VEC_ST(outptr + 48, rgb3);
+#endif
+      outptr += PIXELSIZE * 16;
+    }
+  }
+
+  num_cols = out_width & 31;
+  if (num_cols) {
+    int i;
+    for (i = 0; i < (num_cols + 1) >> 1; i++) {
+      tmpbuf[i] = inptr1[i];
+      tmpbuf[i + 16] = inptr2[i];
+      tmpbuf[i * 2 + 32] = inptr0[i * 2];
+      tmpbuf[i * 2 + 32 + 1] = inptr0[i * 2 + 1];
+    }
+    cb = VEC_LD(tmpbuf);
+    cr = VEC_LD(tmpbuf + 16);
+    CALC_MERGED1
+
+    y = VEC_LD(tmpbuf + 32);
+    CALC_MERGED2(r_yl, g_yl, b_yl)
+    RGB_SHUFFLE
+    if (num_cols >= 16) {
+      VEC_ST(outptr, rgb0);
+      VEC_ST(outptr + 16, rgb1);
+      VEC_ST(outptr + 32, rgb2);
+#if PIXELSIZE == 4
+      VEC_ST(outptr + 48, rgb3);
+#endif
+      outptr += PIXELSIZE * 16;
+
+      y = VEC_LD(tmpbuf + 48);
+      CALC_MERGED2(r_yh, g_yh, b_yh)
+      RGB_SHUFFLE
+    }
+    VEC_ST(tmpbuf, rgb0);
+    VEC_ST(tmpbuf + 16, rgb1);
+    VEC_ST(tmpbuf + 32, rgb2);
+#if PIXELSIZE == 4
+    VEC_ST(tmpbuf + 48, rgb3);
+#endif
+    memcpy(outptr, tmpbuf, (out_width & 15) * PIXELSIZE);
+  }
+}
+
+#undef RGB_SHUFFLE_INIT
+#undef RGB_SHUFFLE
+#undef PIXELSIZE
+#undef ycc_rgbn_convert
+#undef ycc_rgbn_merged
+
diff --git a/simd/e2k/jdcolor-e2k.c b/simd/e2k/jdcolor-e2k.c
new file mode 100644
index 0000000..94c80e9
--- /dev/null
+++ b/simd/e2k/jdcolor-e2k.c
@@ -0,0 +1,289 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_e2k.h"
+
+#define F_0_344  22554              /* FIX(0.34414) */
+#define F_0_714  46802              /* FIX(0.71414) */
+#define F_1_402  91881              /* FIX(1.40200) */
+#define F_1_772  116130             /* FIX(1.77200) */
+#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+static const uint8_t __attribute__((aligned(16)))
+#if defined(__iset__) && __iset__ >= 5
+ycc_rgb_shuf_const[7][48] = {
+#define SHUF_CONST3          \
+  C0,      C1,      C2,      \
+  C0 + 4,  C1 + 4,  C2 + 4,  \
+  C0 + 8,  C1 + 8,  C2 + 8,  \
+  C0 + 12, C1 + 12, C2 + 12, \
+  C0 + 16, C1 + 16, C2 + 16, \
+  C0 + 20, C1 + 4,  C2 + 4,  \
+  C0 + 8,  C1 + 8,  C2 + 8,  \
+  C0 + 12, C1 + 12, C2 + 12, \
+  C0 + 16, C1 + 16, C2 + 16, \
+  C0 + 20, C1 + 20, C2 + 20, \
+  C0 + 24, C1 + 24, C2 + 8,  \
+  C0 + 12, C1 + 12, C2 + 12, \
+  C0 + 16, C1 + 16, C2 + 16, \
+  C0 + 20, C1 + 20, C2 + 20, \
+  C0 + 24, C1 + 24, C2 + 24, \
+  C0 + 28, C1 + 28, C2 + 28
+#else
+ycc_rgb_shuf_const[7][24] = {
+#define SHUF_CONST3          \
+  C0,      C1,      C2,      \
+  C0 + 4,  C1 + 4,  C2 + 4,  \
+  C0 + 8,  C1 + 8,  C2,      \
+  C0 + 4,  C1 + 4,  C2 + 4,  \
+  C0 + 8,  C1 + 8,  C2 + 8,  \
+  C0 + 12, C1 + 4,  C2 + 4,  \
+  C0 + 8,  C1 + 8,  C2 + 8,  \
+  C0 + 12, C1 + 12, C2 + 12
+#endif
+
+#define SHUF_CONST4 C0, C1, C2, C3, C0 + 4, C1 + 4, C2 + 4, C3 + 4, \
+  C0 + 8, C1 + 8, C2 + 8, C3 + 8, C0 + 12, C1 + 12, C2 + 12, C3 + 12
+
+#define TMP_RED    RGB_RED
+#define TMP_GREEN  RGB_GREEN
+#define TMP_BLUE   RGB_BLUE
+#define PIXELSIZE  RGB_PIXELSIZE
+#include "jdcoltab-e2k.c"
+  ,
+#define TMP_RED    EXT_RGB_RED
+#define TMP_GREEN  EXT_RGB_GREEN
+#define TMP_BLUE   EXT_RGB_BLUE
+#define PIXELSIZE  EXT_RGB_PIXELSIZE
+#include "jdcoltab-e2k.c"
+  ,
+#define TMP_RED    EXT_RGBX_RED
+#define TMP_GREEN  EXT_RGBX_GREEN
+#define TMP_BLUE   EXT_RGBX_BLUE
+#define PIXELSIZE  EXT_RGBX_PIXELSIZE
+#include "jdcoltab-e2k.c"
+  ,
+#define TMP_RED    EXT_BGR_RED
+#define TMP_GREEN  EXT_BGR_GREEN
+#define TMP_BLUE   EXT_BGR_BLUE
+#define PIXELSIZE  EXT_BGR_PIXELSIZE
+#include "jdcoltab-e2k.c"
+  ,
+#define TMP_RED    EXT_BGRX_RED
+#define TMP_GREEN  EXT_BGRX_GREEN
+#define TMP_BLUE   EXT_BGRX_BLUE
+#define PIXELSIZE  EXT_BGRX_PIXELSIZE
+#include "jdcoltab-e2k.c"
+  ,
+#define TMP_RED    EXT_XBGR_RED
+#define TMP_GREEN  EXT_XBGR_GREEN
+#define TMP_BLUE   EXT_XBGR_BLUE
+#define PIXELSIZE  EXT_XBGR_PIXELSIZE
+#include "jdcoltab-e2k.c"
+  ,
+#define TMP_RED    EXT_XRGB_RED
+#define TMP_GREEN  EXT_XRGB_GREEN
+#define TMP_BLUE   EXT_XRGB_BLUE
+#define PIXELSIZE  EXT_XRGB_PIXELSIZE
+#include "jdcoltab-e2k.c"
+};
+
+  /* (Original)
+   * R = Y                + 1.40200 * Cr
+   * G = Y - 0.34414 * Cb - 0.71414 * Cr
+   * B = Y + 1.77200 * Cb
+   *
+   * (This implementation)
+   * R = Y                + 0.40200 * Cr + Cr
+   * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+   * B = Y - 0.22800 * Cb + Cb + Cb
+   */
+
+#define CALC_RGB \
+  yl = _mm_unpacklo_epi8(y, pb_zero); \
+  yh = _mm_unpackhi_epi8(y, pb_zero); \
+  \
+  cbl = _mm_unpacklo_epi8(cb, pb_zero); \
+  cbh = _mm_unpackhi_epi8(cb, pb_zero); \
+  cbl = _mm_sub_epi16(cbl, pw_cj); \
+  cbh = _mm_sub_epi16(cbh, pw_cj); \
+  \
+  crl = _mm_unpacklo_epi8(cr, pb_zero); \
+  crh = _mm_unpackhi_epi8(cr, pb_zero); \
+  crl = _mm_sub_epi16(crl, pw_cj); \
+  crh = _mm_sub_epi16(crh, pw_cj); \
+  \
+  bl = _mm_mulhrs_epi16(cbl, pw_mf0228); \
+  bh = _mm_mulhrs_epi16(cbh, pw_mf0228); \
+  bl = _mm_add_epi16(bl, _mm_add_epi16(cbl, cbl)); \
+  bh = _mm_add_epi16(bh, _mm_add_epi16(cbh, cbh)); \
+  bl = _mm_add_epi16(bl, yl); \
+  bh = _mm_add_epi16(bh, yh); \
+  \
+  rl = _mm_mulhrs_epi16(crl, pw_f0402); \
+  rh = _mm_mulhrs_epi16(crh, pw_f0402); \
+  rl = _mm_add_epi16(rl, crl); \
+  rh = _mm_add_epi16(rh, crh); \
+  rl = _mm_add_epi16(rl, yl); \
+  rh = _mm_add_epi16(rh, yh); \
+  \
+  g0w = _mm_unpacklo_epi16(cbl, crl); \
+  g1w = _mm_unpackhi_epi16(cbl, crl); \
+  g0 = _mm_add_epi32(_mm_madd_epi16(g0w, pw_mf0344_f0285), pd_onehalf); \
+  g1 = _mm_add_epi32(_mm_madd_epi16(g1w, pw_mf0344_f0285), pd_onehalf); \
+  g2w = _mm_unpacklo_epi16(cbh, crh); \
+  g3w = _mm_unpackhi_epi16(cbh, crh); \
+  g2 = _mm_add_epi32(_mm_madd_epi16(g2w, pw_mf0344_f0285), pd_onehalf); \
+  g3 = _mm_add_epi32(_mm_madd_epi16(g3w, pw_mf0344_f0285), pd_onehalf); \
+  \
+  gl = _mm_packhi_epi32(g0, g1); \
+  gh = _mm_packhi_epi32(g2, g3); \
+  gl = _mm_sub_epi16(gl, crl); \
+  gh = _mm_sub_epi16(gh, crh); \
+  gl = _mm_add_epi16(gl, yl); \
+  gh = _mm_add_epi16(gh, yh); \
+  \
+  rl = _mm_packus_epi16(rl, rh); \
+  gl = _mm_packus_epi16(gl, gh); \
+  bl = _mm_packus_epi16(bl, bh); \
+  \
+  rg0 = _mm_unpacklo_epi8(rl, gl); \
+  rg1 = _mm_unpackhi_epi8(rl, gl); \
+  bx0 = _mm_unpacklo_epi8(bl, pb_255); \
+  bx1 = _mm_unpackhi_epi8(bl, pb_255); \
+  \
+  rgb0 = _mm_unpacklo_epi16(rg0, bx0); \
+  rgb1 = _mm_unpackhi_epi16(rg0, bx0); \
+  rgb2 = _mm_unpacklo_epi16(rg1, bx1); \
+  rgb3 = _mm_unpackhi_epi16(rg1, bx1);
+
+#define CALC_MERGED1 \
+  cbl = _mm_unpacklo_epi8(cb, pb_zero); \
+  cbh = _mm_unpackhi_epi8(cb, pb_zero); \
+  cbl = _mm_sub_epi16(cbl, pw_cj); \
+  cbh = _mm_sub_epi16(cbh, pw_cj); \
+  \
+  crl = _mm_unpacklo_epi8(cr, pb_zero); \
+  crh = _mm_unpackhi_epi8(cr, pb_zero); \
+  crl = _mm_sub_epi16(crl, pw_cj); \
+  crh = _mm_sub_epi16(crh, pw_cj); \
+  \
+  b_yl = _mm_mulhrs_epi16(cbl, pw_mf0228); \
+  b_yh = _mm_mulhrs_epi16(cbh, pw_mf0228); \
+  b_yl = _mm_add_epi16(b_yl, _mm_add_epi16(cbl, cbl)); \
+  b_yh = _mm_add_epi16(b_yh, _mm_add_epi16(cbh, cbh)); \
+  \
+  r_yl = _mm_mulhrs_epi16(crl, pw_f0402); \
+  r_yh = _mm_mulhrs_epi16(crh, pw_f0402); \
+  r_yl = _mm_add_epi16(r_yl, crl); \
+  r_yh = _mm_add_epi16(r_yh, crh); \
+  \
+  g_y0w = _mm_unpacklo_epi16(cbl, crl); \
+  g_y1w = _mm_unpackhi_epi16(cbl, crl); \
+  g_y0 = _mm_add_epi32(_mm_madd_epi16(g_y0w, pw_mf0344_f0285), pd_onehalf); \
+  g_y1 = _mm_add_epi32(_mm_madd_epi16(g_y1w, pw_mf0344_f0285), pd_onehalf); \
+  g_y2w = _mm_unpacklo_epi16(cbh, crh); \
+  g_y3w = _mm_unpackhi_epi16(cbh, crh); \
+  g_y2 = _mm_add_epi32(_mm_madd_epi16(g_y2w, pw_mf0344_f0285), pd_onehalf); \
+  g_y3 = _mm_add_epi32(_mm_madd_epi16(g_y3w, pw_mf0344_f0285), pd_onehalf); \
+  \
+  g_yl = _mm_packhi_epi32(g_y0, g_y1); \
+  g_yh = _mm_packhi_epi32(g_y2, g_y3); \
+  g_yl = _mm_sub_epi16(g_yl, crl); \
+  g_yh = _mm_sub_epi16(g_yh, crh);
+
+#define CALC_MERGED2(r_yl, g_yl, b_yl) \
+  yl = _mm_unpacklo_epi8(y, pb_zero); \
+  yh = _mm_unpackhi_epi8(y, pb_zero); \
+  bl = _mm_add_epi16(_mm_unpacklo_epi16(b_yl, b_yl), yl); \
+  bh = _mm_add_epi16(_mm_unpackhi_epi16(b_yl, b_yl), yh); \
+  rl = _mm_add_epi16(_mm_unpacklo_epi16(r_yl, r_yl), yl); \
+  rh = _mm_add_epi16(_mm_unpackhi_epi16(r_yl, r_yl), yh); \
+  gl = _mm_add_epi16(_mm_unpacklo_epi16(g_yl, g_yl), yl); \
+  gh = _mm_add_epi16(_mm_unpackhi_epi16(g_yl, g_yl), yh); \
+  rl = _mm_packus_epi16(rl, rh); \
+  gl = _mm_packus_epi16(gl, gh); \
+  bl = _mm_packus_epi16(bl, bh); \
+  \
+  rg0 = _mm_unpacklo_epi8(rl, gl); \
+  rg1 = _mm_unpackhi_epi8(rl, gl); \
+  bx0 = _mm_unpacklo_epi8(bl, pb_255); \
+  bx1 = _mm_unpackhi_epi8(bl, pb_255); \
+  \
+  rgb0 = _mm_unpacklo_epi16(rg0, bx0); \
+  rgb1 = _mm_unpackhi_epi16(rg0, bx0); \
+  rgb2 = _mm_unpacklo_epi16(rg1, bx1); \
+  rgb3 = _mm_unpackhi_epi16(rg1, bx1);
+
+#define PIXELSIZE 3
+#if defined(__iset__) && __iset__ >= 5
+#define RGB_SHUFFLE_INIT __m128i \
+  rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]), \
+  rgb_index1 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 16), \
+  rgb_index2 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 32);
+#define RGB_SHUFFLE \
+  rgb0 = _mm_shuffle2_epi8(rgb0, rgb1, rgb_index0); \
+  rgb1 = _mm_shuffle2_epi8(rgb1, rgb2, rgb_index1); \
+  rgb2 = _mm_shuffle2_epi8(rgb2, rgb3, rgb_index2);
+#else
+#define RGB_SHUFFLE_INIT __m64 \
+  rgb_index0 = *(__m64*)ycc_rgb_shuf_const[shuf_idx], \
+  rgb_index1 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 8), \
+  rgb_index2 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 16);
+#define RGB_SHUFFLE { \
+  union { __m128i v; __m64 d[2]; } a = { rgb0 }, \
+    b = { rgb1 }, c = { rgb2 }, d = { rgb3 }; \
+  a.d[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \
+  a.d[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \
+  b.d[0] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \
+  b.d[1] = _mm_shuffle2_pi8(c.d[0], c.d[1], rgb_index0); \
+  c.d[0] = _mm_shuffle2_pi8(c.d[1], d.d[0], rgb_index1); \
+  c.d[1] = _mm_shuffle2_pi8(d.d[0], d.d[1], rgb_index2); \
+  rgb0 = a.v; rgb1 = b.v; rgb2 = c.v; \
+}
+#endif
+
+#define ycc_rgbn_convert jsimd_ycc_rgb3_convert_e2k
+#define ycc_rgbn_merged  jsimd_ycc_rgb3_merged_upsample_e2k
+#include "jdcolext-e2k.c"
+
+#define PIXELSIZE 4
+#define RGB_SHUFFLE_INIT __m128i \
+  rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]);
+#define RGB_SHUFFLE \
+  rgb0 = _mm_shuffle_epi8(rgb0, rgb_index0); \
+  rgb1 = _mm_shuffle_epi8(rgb1, rgb_index0); \
+  rgb2 = _mm_shuffle_epi8(rgb2, rgb_index0); \
+  rgb3 = _mm_shuffle_epi8(rgb3, rgb_index0);
+
+#define ycc_rgbn_convert jsimd_ycc_rgb4_convert_e2k
+#define ycc_rgbn_merged  jsimd_ycc_rgb4_merged_upsample_e2k
+#include "jdcolext-e2k.c"
+
diff --git a/simd/e2k/jdcoltab-e2k.c b/simd/e2k/jdcoltab-e2k.c
new file mode 100644
index 0000000..e19666d
--- /dev/null
+++ b/simd/e2k/jdcoltab-e2k.c
@@ -0,0 +1,80 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-e2k.c */
+
+#if TMP_RED == 0
+#define C0 0
+#elif TMP_GREEN == 0
+#define C0 1
+#elif TMP_BLUE == 0
+#define C0 2
+#else
+#define C0 3
+#endif
+
+#if TMP_RED == 1
+#define C1 0
+#elif TMP_GREEN == 1
+#define C1 1
+#elif TMP_BLUE == 1
+#define C1 2
+#else
+#define C1 3
+#endif
+
+#if TMP_RED == 2
+#define C2 0
+#elif TMP_GREEN == 2
+#define C2 1
+#elif TMP_BLUE == 2
+#define C2 2
+#else
+#define C2 3
+#endif
+
+#if TMP_RED == 3
+#define C3 0
+#elif TMP_GREEN == 3
+#define C3 1
+#elif TMP_BLUE == 3
+#define C3 2
+#else
+#define C3 3
+#endif
+
+#if PIXELSIZE == 3
+{ SHUF_CONST3 }
+#else
+{ SHUF_CONST4 }
+#endif
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+
+#undef TMP_RED
+#undef TMP_GREEN
+#undef TMP_BLUE
+#undef PIXELSIZE
+
diff --git a/simd/e2k/jdsample-e2k.c b/simd/e2k/jdsample-e2k.c
new file mode 100644
index 0000000..572b3af
--- /dev/null
+++ b/simd/e2k/jdsample-e2k.c
@@ -0,0 +1,389 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_e2k.h"
+
+
+void jsimd_h2v1_fancy_upsample_e2k(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __m128i pb_zero = _mm_setzero_si128();
+  __m128i this0, last0, p_last0, next0 = pb_zero, p_next0, out;
+  __m128i this0l, this0h, last0l, last0h,
+    next0l, next0h, outle, outhe, outlo, outho;
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src)
+#endif
+
+  /* Constants */
+  __m128i pw_three = _mm_set1_epi16(3),
+    next_index_lastcol = _mm_setr_epi8(
+       1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15),
+    pw_one = _mm_set1_epi16(1), pw_two = _mm_set1_epi16(2);
+
+  if (downsampled_width > 0)
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+
+    if (downsampled_width & 15)
+      inptr[downsampled_width] = inptr[downsampled_width - 1];
+
+#ifdef NEED_ALIGN8
+    ALIGN8_START(inptr, src)
+    ALIGN8_READ16(this0, src, 0)
+#else
+    this0 = VEC_LD(inptr);
+#endif
+    last0 = _mm_bslli_si128(this0, 15);
+
+    PRAGMA_E2K("ivdep")
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, outptr += 32) {
+
+      p_last0 = _mm_alignr_epi8(this0, last0, 15);
+      last0 = this0;
+
+      if (__builtin_expect(incol <= 16, 0))
+        p_next0 = _mm_shuffle_epi8(this0, next_index_lastcol);
+      else {
+#ifdef NEED_ALIGN8
+        ALIGN8_READ16(next0, src, 1) src_ptr += 2;
+#else
+        next0 = VEC_LD(inptr + 16); inptr += 16;
+#endif
+        p_next0 = _mm_alignr_epi8(next0, this0, 1);
+      }
+
+      this0l = _mm_mullo_epi16(_mm_unpacklo_epi8(this0, pb_zero), pw_three);
+      last0l = _mm_unpacklo_epi8(p_last0, pb_zero);
+      next0l = _mm_unpacklo_epi8(p_next0, pb_zero);
+      last0l = _mm_add_epi16(last0l, pw_one);
+      next0l = _mm_add_epi16(next0l, pw_two);
+
+      outle = _mm_add_epi16(this0l, last0l);
+      outlo = _mm_add_epi16(this0l, next0l);
+      outle = _mm_srli_epi16(outle, 2);
+      outlo = _mm_srli_epi16(outlo, 2);
+
+      out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8));
+      VEC_ST(outptr, out);
+
+      if (__builtin_expect(incol <= 8, 0)) break;
+
+      this0h = _mm_mullo_epi16(_mm_unpackhi_epi8(this0, pb_zero), pw_three);
+      last0h = _mm_unpackhi_epi8(p_last0, pb_zero);
+      next0h = _mm_unpackhi_epi8(p_next0, pb_zero);
+      last0h = _mm_add_epi16(last0h, pw_one);
+      next0h = _mm_add_epi16(next0h, pw_two);
+
+      outhe = _mm_add_epi16(this0h, last0h);
+      outho = _mm_add_epi16(this0h, next0h);
+      outhe = _mm_srli_epi16(outhe, 2);
+      outho = _mm_srli_epi16(outho, 2);
+
+      out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8));
+      VEC_ST(outptr + 16, out);
+
+      this0 = next0;
+    }
+  }
+}
+
+
+void jsimd_h2v2_fancy_upsample_e2k(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol;
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src_1)
+  ALIGN8_VARS(src0)
+  ALIGN8_VARS(src1)
+#endif
+
+  __m128i pb_zero = _mm_setzero_si128();
+  __m128i this_1, this0, this1, out;
+  __m128i this_1l, this_1h, this0l, this0h, this1l, this1h,
+    lastcolsum_1h, lastcolsum1h,
+    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+    nextcolsum_1l = pb_zero, nextcolsum_1h = pb_zero,
+    nextcolsum1l = pb_zero, nextcolsum1h = pb_zero,
+    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+    tmpl, tmph, outle, outhe, outlo, outho;
+
+  /* Constants */
+  __m128i pw_three = _mm_set1_epi16(3),
+    pw_seven = _mm_set1_epi16(7), pw_eight = _mm_set1_epi16(8),
+    next_index_lastcol = _mm_setr_epi8(
+       2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15);
+
+  if (downsampled_width > 0)
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 15) {
+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+    }
+
+#ifdef NEED_ALIGN8
+    ALIGN8_START(inptr0, src0)
+    ALIGN8_START(inptr_1, src_1)
+    ALIGN8_START(inptr1, src1)
+    ALIGN8_READ16(this0, src0, 0)
+    ALIGN8_READ16(this_1, src_1, 0)
+    ALIGN8_READ16(this1, src1, 0)
+#else
+    this0 = VEC_LD(inptr0);
+    this_1 = VEC_LD(inptr_1);
+    this1 = VEC_LD(inptr1);
+#endif
+
+    this0l = _mm_unpacklo_epi8(this0, pb_zero);
+    this0h = _mm_unpackhi_epi8(this0, pb_zero);
+    this0l = _mm_mullo_epi16(this0l, pw_three);
+    this0h = _mm_mullo_epi16(this0h, pw_three);
+
+    this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
+    this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
+    thiscolsum_1l = _mm_add_epi16(this0l, this_1l);
+    thiscolsum_1h = _mm_add_epi16(this0h, this_1h);
+    lastcolsum_1h = _mm_bslli_si128(thiscolsum_1l, 14);;
+
+    this1l = _mm_unpacklo_epi8(this1, pb_zero);
+    this1h = _mm_unpackhi_epi8(this1, pb_zero);
+    thiscolsum1l = _mm_add_epi16(this0l, this1l);
+    thiscolsum1h = _mm_add_epi16(this0h, this1h);
+    lastcolsum1h = _mm_bslli_si128(thiscolsum1l, 14);
+
+    PRAGMA_E2K("ivdep")
+    for (incol = downsampled_width; incol > 0;
+         incol -= 16, outptr0 += 32, outptr1 += 32) {
+
+      p_lastcolsum_1l = _mm_alignr_epi8(thiscolsum_1l, lastcolsum_1h, 14);
+      p_lastcolsum_1h = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 14);
+      p_lastcolsum1l = _mm_alignr_epi8(thiscolsum1l, lastcolsum1h, 14);
+      p_lastcolsum1h = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 14);
+      lastcolsum_1h = thiscolsum_1h;
+      lastcolsum1h = thiscolsum1h;
+
+      if (__builtin_expect(incol <= 16, 0)) {
+        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
+        p_nextcolsum_1h = _mm_shuffle_epi8(thiscolsum_1h, next_index_lastcol);
+        p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
+        p_nextcolsum1h = _mm_shuffle_epi8(thiscolsum1h, next_index_lastcol);
+      } else {
+#ifdef NEED_ALIGN8
+        ALIGN8_READ16(this0, src0, 1) src0_ptr += 2;
+        ALIGN8_READ16(this_1, src_1, 1) src_1_ptr += 2;
+        ALIGN8_READ16(this1, src1, 1) src1_ptr += 2;
+#else
+        this0 = VEC_LD(inptr0 + 16); inptr0 += 16;
+        this_1 = VEC_LD(inptr_1 + 16); inptr_1 += 16;
+        this1 = VEC_LD(inptr1 + 16); inptr1 += 16;
+#endif
+        this0l = _mm_unpacklo_epi8(this0, pb_zero);
+        this0h = _mm_unpackhi_epi8(this0, pb_zero);
+        this0l = _mm_mullo_epi16(this0l, pw_three);
+        this0h = _mm_mullo_epi16(this0h, pw_three);
+
+        this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
+        this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
+        nextcolsum_1l = _mm_add_epi16(this0l, this_1l);
+        nextcolsum_1h = _mm_add_epi16(this0h, this_1h);
+        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
+        p_nextcolsum_1h = _mm_alignr_epi8(nextcolsum_1l, thiscolsum_1h, 2);
+
+        this1l = _mm_unpacklo_epi8(this1, pb_zero);
+        this1h = _mm_unpackhi_epi8(this1, pb_zero);
+        nextcolsum1l = _mm_add_epi16(this0l, this1l);
+        nextcolsum1h = _mm_add_epi16(this0h, this1h);
+        p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
+        p_nextcolsum1h = _mm_alignr_epi8(nextcolsum1l, thiscolsum1h, 2);
+      }
+
+      /* Process the upper row */
+
+      tmpl = _mm_mullo_epi16(thiscolsum_1l, pw_three);
+      outle = _mm_add_epi16(tmpl, p_lastcolsum_1l);
+      outle = _mm_add_epi16(outle, pw_eight);
+      outle = _mm_srli_epi16(outle, 4);
+
+      outlo = _mm_add_epi16(tmpl, p_nextcolsum_1l);
+      outlo = _mm_add_epi16(outlo, pw_seven);
+      outlo = _mm_srli_epi16(outlo, 4);
+
+      out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8));
+      VEC_ST(outptr0, out);
+
+      /* Process the lower row */
+
+      tmpl = _mm_mullo_epi16(thiscolsum1l, pw_three);
+      outle = _mm_add_epi16(tmpl, p_lastcolsum1l);
+      outle = _mm_add_epi16(outle, pw_eight);
+      outle = _mm_srli_epi16(outle, 4);
+
+      outlo = _mm_add_epi16(tmpl, p_nextcolsum1l);
+      outlo = _mm_add_epi16(outlo, pw_seven);
+      outlo = _mm_srli_epi16(outlo, 4);
+
+      out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8));
+      VEC_ST(outptr1, out);
+
+      if (__builtin_expect(incol <= 8, 0)) break;
+
+      tmph = _mm_mullo_epi16(thiscolsum_1h, pw_three);
+      outhe = _mm_add_epi16(tmph, p_lastcolsum_1h);
+      outhe = _mm_add_epi16(outhe, pw_eight);
+      outhe = _mm_srli_epi16(outhe, 4);
+
+      outho = _mm_add_epi16(tmph, p_nextcolsum_1h);
+      outho = _mm_add_epi16(outho, pw_seven);
+      outho = _mm_srli_epi16(outho, 4);
+
+      out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8));
+      VEC_ST(outptr0 + 16, out);
+
+      tmph = _mm_mullo_epi16(thiscolsum1h, pw_three);
+      outhe = _mm_add_epi16(tmph, p_lastcolsum1h);
+      outhe = _mm_add_epi16(outhe, pw_eight);
+      outhe = _mm_srli_epi16(outhe, 4);
+
+      outho = _mm_add_epi16(tmph, p_nextcolsum1h);
+      outho = _mm_add_epi16(outho, pw_seven);
+      outho = _mm_srli_epi16(outho, 4);
+
+      out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8));
+      VEC_ST(outptr1 + 16, out);
+
+      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
+      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
+    }
+  }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+
+void jsimd_h2v1_upsample_e2k(int max_v_samp_factor,
+                             JDIMENSION out_width,
+                             JSAMPARRAY input_data,
+                             JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow, incol;
+
+  __m128i in, inl, inh;
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src)
+#endif
+
+  if (out_width > 0)
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+#ifdef NEED_ALIGN8
+    ALIGN8_START(inptr, src)
+#endif
+    PRAGMA_E2K("ivdep")
+    for (incol = out_width; incol > 0;
+         incol -= 32, outptr += 32) {
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(in, src, 0) src_ptr += 2;
+#else
+      in = VEC_LD(inptr); inptr += 16;
+#endif
+      inl = _mm_unpacklo_epi8(in, in);
+      inh = _mm_unpackhi_epi8(in, in);
+
+      VEC_ST(outptr, inl);
+      VEC_ST(outptr + 16, inh);
+    }
+  }
+}
+
+
+void jsimd_h2v2_upsample_e2k(int max_v_samp_factor,
+                             JDIMENSION out_width,
+                             JSAMPARRAY input_data,
+                             JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow, incol;
+
+  __m128i in, inl, inh;
+#ifdef NEED_ALIGN8
+  ALIGN8_COMMON
+  ALIGN8_VARS(src)
+#endif
+
+  if (out_width > 0)
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+#ifdef NEED_ALIGN8
+    ALIGN8_START(inptr, src)
+#endif
+    PRAGMA_E2K("ivdep")
+    for (incol = out_width; incol > 0;
+         incol -= 32, outptr0 += 32, outptr1 += 32) {
+#ifdef NEED_ALIGN8
+      ALIGN8_READ16(in, src, 0) src_ptr += 2;
+#else
+      in = VEC_LD(inptr); inptr += 16;
+#endif
+      inl = _mm_unpacklo_epi8(in, in);
+      inh = _mm_unpackhi_epi8(in, in);
+
+      VEC_ST(outptr0, inl);
+      VEC_ST(outptr1, inl);
+      VEC_ST(outptr0 + 16, inh);
+      VEC_ST(outptr1 + 16, inh);
+    }
+  }
+}
diff --git a/simd/e2k/jfdctflt-e2k.c b/simd/e2k/jfdctflt-e2k.c
new file mode 100644
index 0000000..e3c4d94
--- /dev/null
+++ b/simd/e2k/jfdctflt-e2k.c
@@ -0,0 +1,127 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FLOAT FORWARD DCT */
+
+#include "jsimd_e2k.h"
+
+#define DO_FDCT(in, out) { \
+  tmp0 = _mm_add_ps(in##0, in##7); \
+  tmp7 = _mm_sub_ps(in##0, in##7); \
+  tmp1 = _mm_add_ps(in##1, in##6); \
+  tmp6 = _mm_sub_ps(in##1, in##6); \
+  tmp2 = _mm_add_ps(in##2, in##5); \
+  tmp5 = _mm_sub_ps(in##2, in##5); \
+  tmp3 = _mm_add_ps(in##3, in##4); \
+  tmp4 = _mm_sub_ps(in##3, in##4); \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_ps(tmp0, tmp3); \
+  tmp13 = _mm_sub_ps(tmp0, tmp3); \
+  tmp11 = _mm_add_ps(tmp1, tmp2); \
+  tmp12 = _mm_sub_ps(tmp1, tmp2); \
+  \
+  out##0 = _mm_add_ps(tmp10, tmp11); \
+  out##4 = _mm_sub_ps(tmp10, tmp11); \
+  \
+  z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), pd_f0707); \
+  out##2 = _mm_add_ps(tmp13, z1); \
+  out##6 = _mm_sub_ps(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = _mm_add_ps(tmp4, tmp5); \
+  tmp11 = _mm_add_ps(tmp5, tmp6); \
+  tmp12 = _mm_add_ps(tmp6, tmp7); \
+  \
+  z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), pd_f0382); \
+  z2 = _mm_add_ps(_mm_mul_ps(tmp10, pd_f0541), z5); \
+  z4 = _mm_add_ps(_mm_mul_ps(tmp12, pd_f1306), z5); \
+  z3 = _mm_mul_ps(tmp11, pd_f0707); \
+  \
+  z11 = _mm_add_ps(tmp7, z3); \
+  z13 = _mm_sub_ps(tmp7, z3); \
+  \
+  out##5 = _mm_add_ps(z13, z2); \
+  out##3 = _mm_sub_ps(z13, z2); \
+  out##1 = _mm_add_ps(z11, z4); \
+  out##7 = _mm_sub_ps(z11, z4); \
+}
+
+#define LOAD_DATA(a, b, c, d, l, i) \
+  l##a = _mm_loadu_ps(data + a * 8 + i); \
+  l##b = _mm_loadu_ps(data + b * 8 + i); \
+  l##c = _mm_loadu_ps(data + c * 8 + i); \
+  l##d = _mm_loadu_ps(data + d * 8 + i);
+
+#define STORE_DATA(a, b, c, d, l, i) \
+  _mm_storeu_ps(data + a * 8 + i, l##a); \
+  _mm_storeu_ps(data + b * 8 + i, l##b); \
+  _mm_storeu_ps(data + c * 8 + i, l##c); \
+  _mm_storeu_ps(data + d * 8 + i, l##d);
+
+
+void jsimd_fdct_float_e2k(FAST_FLOAT *data)
+{
+  __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+    tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+  __m128 l0, l1, l2, l3, l4, l5, l6, l7;
+  __m128 h0, h1, h2, h3, h4, h5, h6, h7;
+  __m128 x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128 y0, y1, y2, y3, y4, y5, y6, y7;
+
+  /* Constants */
+  __m128 pd_f0382 = _mm_set1_ps(0.382683433f),
+    pd_f0541 = _mm_set1_ps(0.541196100f),
+    pd_f0707 = _mm_set1_ps(0.707106781f),
+    pd_f1306 = _mm_set1_ps(1.306562965f);
+
+  /* Pass 1: process columns */
+
+  LOAD_DATA(0, 1, 2, 3, x, 0)
+  LOAD_DATA(0, 1, 2, 3, y, 4)
+  TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
+  TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
+  DO_FDCT(l, l);
+
+  LOAD_DATA(4, 5, 6, 7, x, 0)
+  LOAD_DATA(4, 5, 6, 7, y, 4)
+  TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
+  TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
+  DO_FDCT(h, h);
+
+  /* Pass 2: process rows */
+
+  TRANSPOSE_FLOAT(l0, l1, l2, l3, x0, x1, x2, x3)
+  TRANSPOSE_FLOAT(h0, h1, h2, h3, x4, x5, x6, x7)
+  DO_FDCT(x, x);
+  STORE_DATA(0, 1, 2, 3, x, 0)
+  STORE_DATA(4, 5, 6, 7, x, 0)
+
+  TRANSPOSE_FLOAT(l4, l5, l6, l7, y0, y1, y2, y3)
+  TRANSPOSE_FLOAT(h4, h5, h6, h7, y4, y5, y6, y7)
+  DO_FDCT(y, y);
+  STORE_DATA(0, 1, 2, 3, y, 4)
+  STORE_DATA(4, 5, 6, 7, y, 4)
+}
diff --git a/simd/e2k/jfdctfst-e2k.c b/simd/e2k/jfdctfst-e2k.c
new file mode 100644
index 0000000..9e58f05
--- /dev/null
+++ b/simd/e2k/jfdctfst-e2k.c
@@ -0,0 +1,145 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_e2k.h"
+
+
+#define F_0_382  98   /* FIX(0.382683433) */
+#define F_0_541  139  /* FIX(0.541196100) */
+#define F_0_707  181  /* FIX(0.707106781) */
+#define F_1_306  334  /* FIX(1.306562965) */
+
+#define CONST_BITS  8
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+
+#define DO_FDCT() { \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_epi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_epi16(tmp0, tmp3); \
+  tmp11 = _mm_add_epi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_epi16(tmp1, tmp2); \
+  \
+  out0  = _mm_add_epi16(tmp10, tmp11); \
+  out4  = _mm_sub_epi16(tmp10, tmp11); \
+  \
+  z1 = _mm_add_epi16(tmp12, tmp13); \
+  z1 = _mm_slli_epi16(z1, PRE_MULTIPLY_SCALE_BITS); \
+  z1 = _mm_mulhi_epi16(z1, pw_0707); \
+  \
+  out2 = _mm_add_epi16(tmp13, z1); \
+  out6 = _mm_sub_epi16(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = _mm_add_epi16(tmp4, tmp5); \
+  tmp11 = _mm_add_epi16(tmp5, tmp6); \
+  tmp12 = _mm_add_epi16(tmp6, tmp7); \
+  \
+  tmp10 = _mm_slli_epi16(tmp10, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  z5 = _mm_sub_epi16(tmp10, tmp12); \
+  z5 = _mm_mulhi_epi16(z5, pw_0382); \
+  \
+  z2 = _mm_add_epi16(_mm_mulhi_epi16(tmp10, pw_0541), z5); \
+  z4 = _mm_add_epi16(_mm_mulhi_epi16(tmp12, pw_1306), z5); \
+  \
+  tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+  z3 = _mm_mulhi_epi16(tmp11, pw_0707); \
+  \
+  z11 = _mm_add_epi16(tmp7, z3); \
+  z13 = _mm_sub_epi16(tmp7, z3); \
+  \
+  out5 = _mm_add_epi16(z13, z2); \
+  out3 = _mm_sub_epi16(z13, z2); \
+  out1 = _mm_add_epi16(z11, z4); \
+  out7 = _mm_sub_epi16(z11, z4); \
+}
+
+
+void jsimd_fdct_ifast_e2k(DCTELEM *data)
+{
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z1, z2, z3, z4, z5, z11, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __m128i pw_0382 = _mm_set1_epi16(F_0_382 << CONST_SHIFT),
+    pw_0541 = _mm_set1_epi16(F_0_541 << CONST_SHIFT),
+    pw_0707 = _mm_set1_epi16(F_0_707 << CONST_SHIFT),
+    pw_1306 = _mm_set1_epi16(F_1_306 << CONST_SHIFT);
+
+  /* Pass 1: process rows */
+
+  row0 = VEC_LD(data + 0 * 8);
+  row1 = VEC_LD(data + 1 * 8);
+  row2 = VEC_LD(data + 2 * 8);
+  row3 = VEC_LD(data + 3 * 8);
+  row4 = VEC_LD(data + 4 * 8);
+  row5 = VEC_LD(data + 5 * 8);
+  row6 = VEC_LD(data + 6 * 8);
+  row7 = VEC_LD(data + 7 * 8);
+
+  TRANSPOSE(row, col);
+
+  tmp0 = _mm_add_epi16(col0, col7);
+  tmp7 = _mm_sub_epi16(col0, col7);
+  tmp1 = _mm_add_epi16(col1, col6);
+  tmp6 = _mm_sub_epi16(col1, col6);
+  tmp2 = _mm_add_epi16(col2, col5);
+  tmp5 = _mm_sub_epi16(col2, col5);
+  tmp3 = _mm_add_epi16(col3, col4);
+  tmp4 = _mm_sub_epi16(col3, col4);
+
+  DO_FDCT();
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = _mm_add_epi16(row0, row7);
+  tmp7 = _mm_sub_epi16(row0, row7);
+  tmp1 = _mm_add_epi16(row1, row6);
+  tmp6 = _mm_sub_epi16(row1, row6);
+  tmp2 = _mm_add_epi16(row2, row5);
+  tmp5 = _mm_sub_epi16(row2, row5);
+  tmp3 = _mm_add_epi16(row3, row4);
+  tmp4 = _mm_sub_epi16(row3, row4);
+
+  DO_FDCT();
+
+  VEC_ST(data + 0 * 8, out0);
+  VEC_ST(data + 1 * 8, out1);
+  VEC_ST(data + 2 * 8, out2);
+  VEC_ST(data + 3 * 8, out3);
+  VEC_ST(data + 4 * 8, out4);
+  VEC_ST(data + 5 * 8, out5);
+  VEC_ST(data + 6 * 8, out6);
+  VEC_ST(data + 7 * 8, out7);
+}
diff --git a/simd/e2k/jfdctint-e2k.c b/simd/e2k/jfdctint-e2k.c
new file mode 100644
index 0000000..2200852
--- /dev/null
+++ b/simd/e2k/jfdctint-e2k.c
@@ -0,0 +1,255 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_e2k.h"
+
+
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+  /* (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * data2 = z1 + tmp13 * 0.765366865; \
+   * data6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = _mm_unpacklo_epi16(tmp13, tmp12); \
+  tmp1312h = _mm_unpackhi_epi16(tmp13, tmp12); \
+  \
+  out2l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f130_f054), pd_descale_p##PASS); \
+  out2h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f130_f054), pd_descale_p##PASS); \
+  out6l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f054_mf130), pd_descale_p##PASS); \
+  out6h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f054_mf130), pd_descale_p##PASS); \
+  \
+  out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \
+  out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_epi32(out2l, out2h); \
+  out6 = _mm_packs_epi32(out6l, out6h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = _mm_add_epi16(tmp4, tmp6); \
+  z4 = _mm_add_epi16(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_epi16(z3, z4); \
+  z34h = _mm_unpackhi_epi16(z3, z4); \
+  \
+  z3l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_mf078_f117), pd_descale_p##PASS); \
+  z3h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_mf078_f117), pd_descale_p##PASS); \
+  z4l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_f117_f078), pd_descale_p##PASS); \
+  z4h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_f117_f078), pd_descale_p##PASS); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
+   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
+   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = _mm_unpacklo_epi16(tmp4, tmp7); \
+  tmp47h = _mm_unpackhi_epi16(tmp4, tmp7); \
+  \
+  out7l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf060_mf089), z3l); \
+  out7h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf060_mf089), z3h); \
+  out1l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf089_f060), z4l); \
+  out1h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf089_f060), z4h); \
+  \
+  out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \
+  out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \
+  \
+  out7 = _mm_packs_epi32(out7l, out7h); \
+  out1 = _mm_packs_epi32(out1l, out1h); \
+  \
+  tmp56l = _mm_unpacklo_epi16(tmp5, tmp6); \
+  tmp56h = _mm_unpackhi_epi16(tmp5, tmp6); \
+  \
+  out5l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf050_mf256), z4l); \
+  out5h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf050_mf256), z4h); \
+  out3l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf256_f050), z3l); \
+  out3h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf256_f050), z3h); \
+  \
+  out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \
+  out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \
+  \
+  out5 = _mm_packs_epi32(out5l, out5h); \
+  out3 = _mm_packs_epi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_epi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_epi16(tmp0, tmp3); \
+  tmp11 = _mm_add_epi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_epi16(tmp1, tmp2); \
+  \
+  out0  = _mm_add_epi16(tmp10, tmp11); \
+  out0  = _mm_slli_epi16(out0, PASS1_BITS); \
+  out4  = _mm_sub_epi16(tmp10, tmp11); \
+  out4  = _mm_slli_epi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_PASS2() { \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_epi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_epi16(tmp0, tmp3); \
+  tmp11 = _mm_add_epi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_epi16(tmp1, tmp2); \
+  \
+  out0  = _mm_add_epi16(tmp10, tmp11); \
+  out0  = _mm_add_epi16(out0, pw_descale_p2x); \
+  out0  = _mm_srai_epi16(out0, PASS1_BITS); \
+  out4  = _mm_sub_epi16(tmp10, tmp11); \
+  out4  = _mm_add_epi16(out4, pw_descale_p2x); \
+  out4  = _mm_srai_epi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(2); \
+}
+
+
+void jsimd_fdct_islow_e2k(DCTELEM *data)
+{
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  /* Constants */
+  __m128i pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)),
+    pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)),
+    pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)),
+    pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)),
+    pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)),
+    pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)),
+    pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)),
+    pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)),
+    pw_descale_p2x = _mm_set1_epi16(1 << (PASS1_BITS - 1)),
+    pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)),
+    pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1));
+
+  /* Pass 1: process rows */
+
+  row0 = VEC_LD(data + 0 * 8);
+  row1 = VEC_LD(data + 1 * 8);
+  row2 = VEC_LD(data + 2 * 8);
+  row3 = VEC_LD(data + 3 * 8);
+  row4 = VEC_LD(data + 4 * 8);
+  row5 = VEC_LD(data + 5 * 8);
+  row6 = VEC_LD(data + 6 * 8);
+  row7 = VEC_LD(data + 7 * 8);
+
+  TRANSPOSE(row, col);
+
+  tmp0 = _mm_add_epi16(col0, col7);
+  tmp7 = _mm_sub_epi16(col0, col7);
+  tmp1 = _mm_add_epi16(col1, col6);
+  tmp6 = _mm_sub_epi16(col1, col6);
+  tmp2 = _mm_add_epi16(col2, col5);
+  tmp5 = _mm_sub_epi16(col2, col5);
+  tmp3 = _mm_add_epi16(col3, col4);
+  tmp4 = _mm_sub_epi16(col3, col4);
+
+  DO_FDCT_PASS1();
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = _mm_add_epi16(row0, row7);
+  tmp7 = _mm_sub_epi16(row0, row7);
+  tmp1 = _mm_add_epi16(row1, row6);
+  tmp6 = _mm_sub_epi16(row1, row6);
+  tmp2 = _mm_add_epi16(row2, row5);
+  tmp5 = _mm_sub_epi16(row2, row5);
+  tmp3 = _mm_add_epi16(row3, row4);
+  tmp4 = _mm_sub_epi16(row3, row4);
+
+  DO_FDCT_PASS2();
+
+  VEC_ST(data + 0 * 8, out0);
+  VEC_ST(data + 1 * 8, out1);
+  VEC_ST(data + 2 * 8, out2);
+  VEC_ST(data + 3 * 8, out3);
+  VEC_ST(data + 4 * 8, out4);
+  VEC_ST(data + 5 * 8, out5);
+  VEC_ST(data + 6 * 8, out6);
+  VEC_ST(data + 7 * 8, out7);
+}
diff --git a/simd/e2k/jidctflt-e2k.c b/simd/e2k/jidctflt-e2k.c
new file mode 100644
index 0000000..7682965
--- /dev/null
+++ b/simd/e2k/jidctflt-e2k.c
@@ -0,0 +1,215 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FLOAT INVERSE DCT */
+
+#include "jsimd_e2k.h"
+
+#define DO_IDCT(in, out) { \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_ps(in##0, in##4); \
+  tmp11 = _mm_sub_ps(in##0, in##4); \
+  \
+  tmp13 = _mm_add_ps(in##2, in##6); \
+  tmp12 = _mm_sub_ps(in##2, in##6); \
+  tmp12 = _mm_sub_ps(_mm_mul_ps(tmp12, pd_f1414), tmp13); \
+  \
+  tmp0 = _mm_add_ps(tmp10, tmp13); \
+  tmp3 = _mm_sub_ps(tmp10, tmp13); \
+  tmp1 = _mm_add_ps(tmp11, tmp12); \
+  tmp2 = _mm_sub_ps(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = _mm_add_ps(in##5, in##3); \
+  z10 = _mm_sub_ps(in##5, in##3); \
+  z11 = _mm_add_ps(in##1, in##7); \
+  z12 = _mm_sub_ps(in##1, in##7); \
+  \
+  tmp7 = _mm_add_ps(z11, z13); \
+  tmp11 = _mm_sub_ps(z11, z13); \
+  tmp11 = _mm_mul_ps(tmp11, pd_f1414); \
+  \
+  z5 = _mm_mul_ps(_mm_add_ps(z10, z12), pd_f1847); \
+  tmp10 = _mm_sub_ps(z5, _mm_mul_ps(z12, pd_f1082)); \
+  tmp12 = _mm_sub_ps(z5, _mm_mul_ps(z10, pd_f2613)); \
+  \
+  tmp6 = _mm_sub_ps(tmp12, tmp7); \
+  tmp5 = _mm_sub_ps(tmp11, tmp6); \
+  tmp4 = _mm_sub_ps(tmp10, tmp5); \
+  \
+  out##0 = _mm_add_ps(tmp0, tmp7); \
+  out##7 = _mm_sub_ps(tmp0, tmp7); \
+  out##1 = _mm_add_ps(tmp1, tmp6); \
+  out##6 = _mm_sub_ps(tmp1, tmp6); \
+  out##2 = _mm_add_ps(tmp2, tmp5); \
+  out##5 = _mm_sub_ps(tmp2, tmp5); \
+  out##3 = _mm_add_ps(tmp3, tmp4); \
+  out##4 = _mm_sub_ps(tmp3, tmp4); \
+}
+
+#define QUANT_MUL(a, b, c, d, l, lo, i) \
+  out0 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##a, col##a), 16); \
+  out1 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##b, col##b), 16); \
+  out2 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##c, col##c), 16); \
+  out3 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##d, col##d), 16); \
+  l##a = _mm_cvtepi32_ps(out0); \
+  l##b = _mm_cvtepi32_ps(out1); \
+  l##c = _mm_cvtepi32_ps(out2); \
+  l##d = _mm_cvtepi32_ps(out3); \
+  l##a = _mm_mul_ps(l##a, _mm_load_ps(dct_table + a * 8 + i)); \
+  l##b = _mm_mul_ps(l##b, _mm_load_ps(dct_table + b * 8 + i)); \
+  l##c = _mm_mul_ps(l##c, _mm_load_ps(dct_table + c * 8 + i)); \
+  l##d = _mm_mul_ps(l##d, _mm_load_ps(dct_table + d * 8 + i));
+
+
+void jsimd_idct_float_e2k(void *dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  float *dct_table = (float *)dct_table_;
+
+  __m128i col0, col1, col2, col3, col4, col5, col6, col7,
+    out0, out1, out2, out3, out4, out5, out6, out7, row0, row1, row2, row3;
+  __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+    tmp10, tmp11, tmp12, tmp13, z5, z10, z11, z12, z13;
+  __m128 l0, l1, l2, l3, l4, l5, l6, l7;
+  __m128 h0, h1, h2, h3, h4, h5, h6, h7;
+  __m128 x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128 y0, y1, y2, y3, y4, y5, y6, y7;
+
+  /* Constants */
+  __m128 pd_f1414 = _mm_set1_ps(1.414213562f),
+    pd_f1847 = _mm_set1_ps(1.847759065f),
+    pd_f1082 = _mm_set1_ps(1.082392200f),
+    pd_f2613 = _mm_set1_ps(2.613125930f);
+
+  /* Pass 1: process columns */
+
+  col0 = VEC_LD(coef_block + 0 * 8);
+  col1 = VEC_LD(coef_block + 1 * 8);
+  col2 = VEC_LD(coef_block + 2 * 8);
+  col3 = VEC_LD(coef_block + 3 * 8);
+  col4 = VEC_LD(coef_block + 4 * 8);
+  col5 = VEC_LD(coef_block + 5 * 8);
+  col6 = VEC_LD(coef_block + 6 * 8);
+  col7 = VEC_LD(coef_block + 7 * 8);
+
+  out1 = _mm_or_si128(col1, col2);
+  out2 = _mm_or_si128(col3, col4);
+  out1 = _mm_or_si128(out1, out2);
+  out3 = _mm_or_si128(col5, col6);
+  out3 = _mm_or_si128(out3, col7);
+  out1 = _mm_or_si128(out1, out3);
+
+  if (VEC_ISZERO(out1)) {
+    /* AC terms all zero */
+
+    out0 = _mm_srai_epi32(_mm_unpacklo_epi16(col0, col0), 16);
+    out1 = _mm_srai_epi32(_mm_unpackhi_epi16(col0, col0), 16);
+    tmp0 = _mm_cvtepi32_ps(out0);
+    tmp1 = _mm_cvtepi32_ps(out1);
+    tmp0 = _mm_mul_ps(tmp0, _mm_load_ps(dct_table));
+    tmp1 = _mm_mul_ps(tmp1, _mm_load_ps(dct_table + 4));
+
+    l0 = h0 = _mm_shuffle_ps(tmp0, tmp0, 0x00);
+    l1 = h1 = _mm_shuffle_ps(tmp0, tmp0, 0x55);
+    l2 = h2 = _mm_shuffle_ps(tmp0, tmp0, 0xaa);
+    l3 = h3 = _mm_shuffle_ps(tmp0, tmp0, 0xff);
+    l4 = h4 = _mm_shuffle_ps(tmp1, tmp1, 0x00);
+    l5 = h5 = _mm_shuffle_ps(tmp1, tmp1, 0x55);
+    l6 = h6 = _mm_shuffle_ps(tmp1, tmp1, 0xaa);
+    l7 = h7 = _mm_shuffle_ps(tmp1, tmp1, 0xff);
+
+  } else {
+
+    QUANT_MUL(0, 2, 4, 6, l, lo, 0)
+    QUANT_MUL(1, 3, 5, 7, l, lo, 0)
+    DO_IDCT(l, x);
+
+    QUANT_MUL(0, 2, 4, 6, h, hi, 4)
+    QUANT_MUL(1, 3, 5, 7, h, hi, 4)
+    DO_IDCT(h, y);
+
+    TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
+    TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
+    TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
+    TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(l, x);
+  DO_IDCT(h, y);
+
+#ifdef JSIMD_SAME_ROUNDING
+#define OUT_ROUND(i) \
+  tmp0 = _mm_add_ps(_mm_mul_ps(x##i, pd_f0125), pd_cj_rnd); \
+  tmp1 = _mm_add_ps(_mm_mul_ps(y##i, pd_f0125), pd_cj_rnd); \
+  out##i = _mm_packs_epi32(_mm_cvttps_epi32(tmp0), _mm_cvttps_epi32(tmp1));
+
+  {
+    __m128 pd_cj_rnd = _mm_set1_ps(0.5f + CENTERJSAMPLE),
+      pd_f0125 = _mm_set1_ps(0.125f);
+
+    OUT_ROUND(0) OUT_ROUND(1)
+    OUT_ROUND(2) OUT_ROUND(3)
+    OUT_ROUND(4) OUT_ROUND(5)
+    OUT_ROUND(6) OUT_ROUND(7)
+  }
+  row0 = _mm_packus_epi16(out0, out1);
+  row1 = _mm_packus_epi16(out2, out3);
+  row2 = _mm_packus_epi16(out4, out5);
+  row3 = _mm_packus_epi16(out6, out7);
+
+  TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col)
+#else  /* faster, slightly differ in rounding */
+#define OUT_ROUND(i, a, b) out##i = _mm_blendv_epi8( \
+  _mm_slli_epi32(_mm_castps_si128(_mm_add_ps(b, pd_round)), 16), \
+  _mm_castps_si128(_mm_add_ps(a, pd_round)), pd_mask);
+
+  {
+    __m128i pd_mask = _mm_set1_epi32(0xffff);
+    __m128 pd_round = _mm_set1_ps((3 << 22 | CENTERJSAMPLE) * 8);
+
+    OUT_ROUND(0, x0, x4) OUT_ROUND(1, y0, y4)
+    OUT_ROUND(2, x1, x5) OUT_ROUND(3, y1, y5)
+    OUT_ROUND(4, x2, x6) OUT_ROUND(5, y2, y6)
+    OUT_ROUND(6, x3, x7) OUT_ROUND(7, y3, y7)
+  }
+  row0 = _mm_packus_epi16(out0, out1);
+  row1 = _mm_packus_epi16(out2, out3);
+  row2 = _mm_packus_epi16(out4, out5);
+  row3 = _mm_packus_epi16(out6, out7);
+
+  TRANSPOSE8(row, out) TRANSPOSE8(out, col)
+#endif
+  VEC_STL(output_buf[0] + output_col, col0);
+  VEC_STH(output_buf[1] + output_col, col0);
+  VEC_STL(output_buf[2] + output_col, col1);
+  VEC_STH(output_buf[3] + output_col, col1);
+  VEC_STL(output_buf[4] + output_col, col2);
+  VEC_STH(output_buf[5] + output_col, col2);
+  VEC_STL(output_buf[6] + output_col, col3);
+  VEC_STH(output_buf[7] + output_col, col3);
+}
diff --git a/simd/e2k/jidctfst-e2k.c b/simd/e2k/jidctfst-e2k.c
new file mode 100644
index 0000000..18bc425
--- /dev/null
+++ b/simd/e2k/jidctfst-e2k.c
@@ -0,0 +1,187 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_e2k.h"
+
+
+#define F_1_082  277              /* FIX(1.082392200) */
+#define F_1_414  362              /* FIX(1.414213562) */
+#define F_1_847  473              /* FIX(1.847759065) */
+#define F_2_613  669              /* FIX(2.613125930) */
+#define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+
+#define DO_IDCT(in) { \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_epi16(in##0, in##4); \
+  tmp11 = _mm_sub_epi16(in##0, in##4); \
+  tmp13 = _mm_add_epi16(in##2, in##6); \
+  \
+  tmp12 = _mm_sub_epi16(in##2, in##6); \
+  tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_epi16(tmp12, pw_F1414); \
+  tmp12 = _mm_sub_epi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_epi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_epi16(tmp10, tmp13); \
+  tmp1 = _mm_add_epi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_epi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = _mm_add_epi16(in##5, in##3); \
+  z10 = _mm_sub_epi16(in##5, in##3); \
+  z10s = _mm_slli_epi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+  z11 = _mm_add_epi16(in##1, in##7); \
+  z12s = _mm_sub_epi16(in##1, in##7); \
+  z12s = _mm_slli_epi16(z12s, PRE_MULTIPLY_SCALE_BITS); \
+  \
+  tmp11 = _mm_sub_epi16(z11, z13); \
+  tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+  tmp11 = _mm_mulhi_epi16(tmp11, pw_F1414); \
+  \
+  tmp7 = _mm_add_epi16(z11, z13); \
+  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+   *       = -1.613125930 * z10 - z10 + z5; \
+   */ \
+  \
+  z5 = _mm_add_epi16(z10s, z12s); \
+  z5 = _mm_mulhi_epi16(z5, pw_F1847); \
+  \
+  tmp10 = _mm_mulhi_epi16(z12s, pw_F1082); \
+  tmp10 = _mm_sub_epi16(tmp10, z5); \
+  tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10s, pw_MF1613), z5); \
+  tmp12 = _mm_sub_epi16(tmp12, z10); \
+  \
+  tmp6 = _mm_sub_epi16(tmp12, tmp7); \
+  tmp5 = _mm_sub_epi16(tmp11, tmp6); \
+  tmp4 = _mm_add_epi16(tmp10, tmp5); \
+  \
+  out0 = _mm_add_epi16(tmp0, tmp7); \
+  out1 = _mm_add_epi16(tmp1, tmp6); \
+  out2 = _mm_add_epi16(tmp2, tmp5); \
+  out3 = _mm_sub_epi16(tmp3, tmp4); \
+  out4 = _mm_add_epi16(tmp3, tmp4); \
+  out5 = _mm_sub_epi16(tmp2, tmp5); \
+  out6 = _mm_sub_epi16(tmp1, tmp6); \
+  out7 = _mm_sub_epi16(tmp0, tmp7); \
+}
+
+
+void jsimd_idct_ifast_e2k(void *dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    z5, z10, z10s, z11, z12s, z13,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __m128i pw_F1414 = _mm_set1_epi16(F_1_414 << CONST_SHIFT),
+    pw_F1847 = _mm_set1_epi16(F_1_847 << CONST_SHIFT),
+    pw_MF1613 = _mm_set1_epi16(-F_1_613 << CONST_SHIFT),
+    pw_F1082 = _mm_set1_epi16(F_1_082 << CONST_SHIFT);
+
+  /* Pass 1: process columns */
+
+  col0 = VEC_LD(coef_block + 0 * 8);
+  col1 = VEC_LD(coef_block + 1 * 8);
+  col2 = VEC_LD(coef_block + 2 * 8);
+  col3 = VEC_LD(coef_block + 3 * 8);
+  col4 = VEC_LD(coef_block + 4 * 8);
+  col5 = VEC_LD(coef_block + 5 * 8);
+  col6 = VEC_LD(coef_block + 6 * 8);
+  col7 = VEC_LD(coef_block + 7 * 8);
+
+  tmp1 = _mm_or_si128(col1, col2);
+  tmp2 = _mm_or_si128(col3, col4);
+  tmp1 = _mm_or_si128(tmp1, tmp2);
+  tmp3 = _mm_or_si128(col5, col6);
+  tmp3 = _mm_or_si128(tmp3, col7);
+  tmp1 = _mm_or_si128(tmp1, tmp3);
+
+  quant0 = VEC_LD(dct_table);
+  col0 = _mm_mullo_epi16(col0, quant0);
+
+  if (VEC_ISZERO(tmp1)) {
+    /* AC terms all zero */
+
+    IDCT_SPLAT8(col0);
+
+  } else {
+
+    quant1 = VEC_LD(dct_table + 1 * 8);
+    quant2 = VEC_LD(dct_table + 2 * 8);
+    quant3 = VEC_LD(dct_table + 3 * 8);
+    quant4 = VEC_LD(dct_table + 4 * 8);
+    quant5 = VEC_LD(dct_table + 5 * 8);
+    quant6 = VEC_LD(dct_table + 6 * 8);
+    quant7 = VEC_LD(dct_table + 7 * 8);
+
+    col1 = _mm_mullo_epi16(col1, quant1);
+    col2 = _mm_mullo_epi16(col2, quant2);
+    col3 = _mm_mullo_epi16(col3, quant3);
+    col4 = _mm_mullo_epi16(col4, quant4);
+    col5 = _mm_mullo_epi16(col5, quant5);
+    col6 = _mm_mullo_epi16(col6, quant6);
+    col7 = _mm_mullo_epi16(col7, quant7);
+
+    DO_IDCT(col);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row);
+
+  out0 = _mm_srai_epi16(out0, PASS1_BITS + 3);
+  out1 = _mm_srai_epi16(out1, PASS1_BITS + 3);
+  out2 = _mm_srai_epi16(out2, PASS1_BITS + 3);
+  out3 = _mm_srai_epi16(out3, PASS1_BITS + 3);
+  out4 = _mm_srai_epi16(out4, PASS1_BITS + 3);
+  out5 = _mm_srai_epi16(out5, PASS1_BITS + 3);
+  out6 = _mm_srai_epi16(out6, PASS1_BITS + 3);
+  out7 = _mm_srai_epi16(out7, PASS1_BITS + 3);
+
+  IDCT_SAVE();
+}
diff --git a/simd/e2k/jidctint-e2k.c b/simd/e2k/jidctint-e2k.c
new file mode 100644
index 0000000..7bb79c0
--- /dev/null
+++ b/simd/e2k/jidctint-e2k.c
@@ -0,0 +1,294 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER INVERSE DCT */
+
+#include "jsimd_e2k.h"
+
+
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+
+#define DO_IDCT(in, PASS) { \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  in##26l = _mm_unpacklo_epi16(in##2, in##6); \
+  in##26h = _mm_unpackhi_epi16(in##2, in##6); \
+  \
+  tmp3l = _mm_madd_epi16(in##26l, pw_f130_f054); \
+  tmp3h = _mm_madd_epi16(in##26h, pw_f130_f054); \
+  tmp2l = _mm_madd_epi16(in##26l, pw_f054_mf130); \
+  tmp2h = _mm_madd_epi16(in##26h, pw_f054_mf130); \
+  \
+  tmp0 = _mm_add_epi16(in##0, in##4); \
+  tmp1 = _mm_sub_epi16(in##0, in##4); \
+  \
+  tmp0l = _mm_unpacklo_epi16(pw_zero, tmp0); \
+  tmp0h = _mm_unpackhi_epi16(pw_zero, tmp0); \
+  tmp0l = _mm_srai_epi32(tmp0l, 16 - CONST_BITS); \
+  tmp0h = _mm_srai_epi32(tmp0h, 16 - CONST_BITS); \
+  tmp0l = _mm_add_epi32(tmp0l, pd_descale_p##PASS); \
+  tmp0h = _mm_add_epi32(tmp0h, pd_descale_p##PASS); \
+  \
+  tmp10l = _mm_add_epi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_epi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_epi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_epi32(tmp0h, tmp3h); \
+  \
+  tmp1l = _mm_unpacklo_epi16(pw_zero, tmp1); \
+  tmp1h = _mm_unpackhi_epi16(pw_zero, tmp1); \
+  tmp1l = _mm_srai_epi32(tmp1l, 16 - CONST_BITS); \
+  tmp1h = _mm_srai_epi32(tmp1h, 16 - CONST_BITS); \
+  tmp1l = _mm_add_epi32(tmp1l, pd_descale_p##PASS); \
+  tmp1h = _mm_add_epi32(tmp1h, pd_descale_p##PASS); \
+  \
+  tmp11l = _mm_add_epi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_epi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_epi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_epi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = _mm_add_epi16(in##3, in##7); \
+  z4 = _mm_add_epi16(in##1, in##5); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_epi16(z3, z4); \
+  z34h = _mm_unpackhi_epi16(z3, z4); \
+  \
+  z3l = _mm_madd_epi16(z34l, pw_mf078_f117); \
+  z3h = _mm_madd_epi16(z34h, pw_mf078_f117); \
+  z4l = _mm_madd_epi16(z34l, pw_f117_f078); \
+  z4h = _mm_madd_epi16(z34h, pw_f117_f078); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
+  \
+  in##71l = _mm_unpacklo_epi16(in##7, in##1); \
+  in##71h = _mm_unpackhi_epi16(in##7, in##1); \
+  \
+  tmp0l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf060_mf089), z3l); \
+  tmp0h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf060_mf089), z3h); \
+  tmp3l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf089_f060), z4l); \
+  tmp3h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf089_f060), z4h); \
+  \
+  in##53l = _mm_unpacklo_epi16(in##5, in##3); \
+  in##53h = _mm_unpackhi_epi16(in##5, in##3); \
+  \
+  tmp1l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf050_mf256), z4l); \
+  tmp1h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf050_mf256), z4h); \
+  tmp2l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf256_f050), z3l); \
+  tmp2h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf256_f050), z3h); \
+  \
+  /* Final output stage */ \
+  \
+  out0l = _mm_add_epi32(tmp10l, tmp3l); \
+  out0h = _mm_add_epi32(tmp10h, tmp3h); \
+  out7l = _mm_sub_epi32(tmp10l, tmp3l); \
+  out7h = _mm_sub_epi32(tmp10h, tmp3h); \
+  \
+  out0l = _mm_srai_epi32(out0l, DESCALE_P##PASS); \
+  out0h = _mm_srai_epi32(out0h, DESCALE_P##PASS); \
+  out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \
+  \
+  out0 = _mm_packs_epi32(out0l, out0h); \
+  out7 = _mm_packs_epi32(out7l, out7h); \
+  \
+  out1l = _mm_add_epi32(tmp11l, tmp2l); \
+  out1h = _mm_add_epi32(tmp11h, tmp2h); \
+  out6l = _mm_sub_epi32(tmp11l, tmp2l); \
+  out6h = _mm_sub_epi32(tmp11h, tmp2h); \
+  \
+  out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \
+  out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \
+  \
+  out1 = _mm_packs_epi32(out1l, out1h); \
+  out6 = _mm_packs_epi32(out6l, out6h); \
+  \
+  out2l = _mm_add_epi32(tmp12l, tmp1l); \
+  out2h = _mm_add_epi32(tmp12h, tmp1h); \
+  out5l = _mm_sub_epi32(tmp12l, tmp1l); \
+  out5h = _mm_sub_epi32(tmp12h, tmp1h); \
+  \
+  out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \
+  out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_epi32(out2l, out2h); \
+  out5 = _mm_packs_epi32(out5l, out5h); \
+  \
+  out3l = _mm_add_epi32(tmp13l, tmp0l); \
+  out3h = _mm_add_epi32(tmp13h, tmp0h); \
+  out4l = _mm_sub_epi32(tmp13l, tmp0l); \
+  out4h = _mm_sub_epi32(tmp13h, tmp0h); \
+  \
+  out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \
+  out4l = _mm_srai_epi32(out4l, DESCALE_P##PASS); \
+  out4h = _mm_srai_epi32(out4h, DESCALE_P##PASS); \
+  \
+  out3 = _mm_packs_epi32(out3l, out3h); \
+  out4 = _mm_packs_epi32(out4l, out4h); \
+}
+
+
+void jsimd_idct_islow_e2k(void *dct_table_, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, z3, z4,
+    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+    row71l, row71h, row26l, row26h, row53l, row53h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+    z3l, z3h, z4l, z4h,
+    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+    out5l, out5h, out6l, out6h, out7l, out7h;
+
+  /* Constants */
+  __m128i pw_zero = _mm_setzero_si128(),
+    pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)),
+    pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)),
+    pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)),
+    pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)),
+    pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)),
+    pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)),
+    pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)),
+    pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)),
+    pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)),
+    pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1));
+
+  /* Pass 1: process columns */
+
+  col0 = VEC_LD(coef_block + 0 * 8);
+  col1 = VEC_LD(coef_block + 1 * 8);
+  col2 = VEC_LD(coef_block + 2 * 8);
+  col3 = VEC_LD(coef_block + 3 * 8);
+  col4 = VEC_LD(coef_block + 4 * 8);
+  col5 = VEC_LD(coef_block + 5 * 8);
+  col6 = VEC_LD(coef_block + 6 * 8);
+  col7 = VEC_LD(coef_block + 7 * 8);
+
+  tmp1 = _mm_or_si128(col1, col2);
+  tmp2 = _mm_or_si128(col3, col4);
+  tmp1 = _mm_or_si128(tmp1, tmp2);
+  tmp3 = _mm_or_si128(col5, col6);
+  tmp3 = _mm_or_si128(tmp3, col7);
+  tmp1 = _mm_or_si128(tmp1, tmp3);
+
+  quant0 = VEC_LD(dct_table);
+  col0 = _mm_mullo_epi16(col0, quant0);
+
+  if (VEC_ISZERO(tmp1)) {
+    /* AC terms all zero */
+
+    col0 = _mm_slli_epi16(col0, PASS1_BITS);
+    IDCT_SPLAT8(col0);
+
+  } else {
+
+    quant1 = VEC_LD(dct_table + 1 * 8);
+    quant2 = VEC_LD(dct_table + 2 * 8);
+    quant3 = VEC_LD(dct_table + 3 * 8);
+    quant4 = VEC_LD(dct_table + 4 * 8);
+    quant5 = VEC_LD(dct_table + 5 * 8);
+    quant6 = VEC_LD(dct_table + 6 * 8);
+    quant7 = VEC_LD(dct_table + 7 * 8);
+
+    col1 = _mm_mullo_epi16(col1, quant1);
+    col2 = _mm_mullo_epi16(col2, quant2);
+    col3 = _mm_mullo_epi16(col3, quant3);
+    col4 = _mm_mullo_epi16(col4, quant4);
+    col5 = _mm_mullo_epi16(col5, quant5);
+    col6 = _mm_mullo_epi16(col6, quant6);
+    col7 = _mm_mullo_epi16(col7, quant7);
+
+    DO_IDCT(col, 1);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row, 2);
+
+  IDCT_SAVE();
+}
diff --git a/simd/e2k/jquantf-e2k.c b/simd/e2k/jquantf-e2k.c
new file mode 100644
index 0000000..106e99a
--- /dev/null
+++ b/simd/e2k/jquantf-e2k.c
@@ -0,0 +1,121 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FLOAT QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_e2k.h"
+
+#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col)
+#define STORE_ROW(i) \
+  in0 = _mm_unpacklo_epi16(out##i, pb_zero); \
+  in1 = _mm_unpackhi_epi16(out##i, pb_zero); \
+  in0 = _mm_sub_epi32(in0, pd_cj); \
+  in1 = _mm_sub_epi32(in1, pd_cj); \
+  _mm_storeu_ps(workspace + i * 8, _mm_cvtepi32_ps(in0)); \
+  _mm_storeu_ps(workspace + i * 8 + 4, _mm_cvtepi32_ps(in1));
+
+void jsimd_convsamp_float_e2k(JSAMPARRAY sample_data, JDIMENSION start_col,
+                              FAST_FLOAT *workspace)
+{
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __m128i pd_cj = _mm_set1_epi32(CENTERJSAMPLE),
+    pb_zero = _mm_setzero_si128();
+
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+
+  out0 = _mm_unpacklo_epi8(in0, pb_zero);
+  out1 = _mm_unpacklo_epi8(in1, pb_zero);
+  out2 = _mm_unpacklo_epi8(in2, pb_zero);
+  out3 = _mm_unpacklo_epi8(in3, pb_zero);
+  out4 = _mm_unpacklo_epi8(in4, pb_zero);
+  out5 = _mm_unpacklo_epi8(in5, pb_zero);
+  out6 = _mm_unpacklo_epi8(in6, pb_zero);
+  out7 = _mm_unpacklo_epi8(in7, pb_zero);
+
+  STORE_ROW(0)
+  STORE_ROW(1)
+  STORE_ROW(2)
+  STORE_ROW(3)
+  STORE_ROW(4)
+  STORE_ROW(5)
+  STORE_ROW(6)
+  STORE_ROW(7)
+}
+
+void jsimd_quantize_float_e2k(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                              FAST_FLOAT *workspace)
+{
+  int i = 0;
+  __m128 row0, row1, row2, row3, recip0, recip1, recip2, recip3;
+  __m128i out0, out1;
+#ifdef JSIMD_SAME_ROUNDING
+  __m128 pd_f16k5 = _mm_set1_ps(16384.5f);
+  __m128i pw_m16k = _mm_set1_epi16(-16384);
+#endif
+
+  PRAGMA_E2K("ivdep")
+  for (; i < 4; i++, workspace += 16, divisors += 16, coef_block += 16) {
+    row0 = _mm_loadu_ps(workspace + 0 * 4);
+    row1 = _mm_loadu_ps(workspace + 1 * 4);
+    row2 = _mm_loadu_ps(workspace + 2 * 4);
+    row3 = _mm_loadu_ps(workspace + 3 * 4);
+
+    recip0 = _mm_loadu_ps(divisors + 0 * 4);
+    recip1 = _mm_loadu_ps(divisors + 1 * 4);
+    recip2 = _mm_loadu_ps(divisors + 2 * 4);
+    recip3 = _mm_loadu_ps(divisors + 3 * 4);
+
+    row0 = _mm_mul_ps(row0, recip0);
+    row1 = _mm_mul_ps(row1, recip1);
+    row2 = _mm_mul_ps(row2, recip2);
+    row3 = _mm_mul_ps(row3, recip3);
+
+#ifdef JSIMD_SAME_ROUNDING
+    row0 = _mm_add_ps(row0, pd_f16k5);
+    row1 = _mm_add_ps(row1, pd_f16k5);
+    row2 = _mm_add_ps(row2, pd_f16k5);
+    row3 = _mm_add_ps(row3, pd_f16k5);
+
+    out0 = _mm_packs_epi32(_mm_cvttps_epi32(row0), _mm_cvttps_epi32(row1));
+    out1 = _mm_packs_epi32(_mm_cvttps_epi32(row2), _mm_cvttps_epi32(row3));
+
+    out0 = _mm_add_epi16(out0, pw_m16k);
+    out1 = _mm_add_epi16(out1, pw_m16k);
+#else
+    out0 = _mm_packs_epi32(_mm_cvtps_epi32(row0), _mm_cvtps_epi32(row1));
+    out1 = _mm_packs_epi32(_mm_cvtps_epi32(row2), _mm_cvtps_epi32(row3));
+#endif
+    VEC_ST(coef_block, out0);
+    VEC_ST(coef_block + 8, out1);
+  }
+}
diff --git a/simd/e2k/jquanti-e2k.c b/simd/e2k/jquanti-e2k.c
new file mode 100644
index 0000000..a3e1ff1
--- /dev/null
+++ b/simd/e2k/jquanti-e2k.c
@@ -0,0 +1,178 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_e2k.h"
+
+#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col)
+
+void jsimd_convsamp_e2k(JSAMPARRAY sample_data, JDIMENSION start_col,
+                        DCTELEM *workspace)
+{
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+  /* Constants */
+  __m128i pw_cj = _mm_set1_epi16(CENTERJSAMPLE),
+    pb_zero = _mm_setzero_si128();
+
+  LOAD_ROW(0);
+  LOAD_ROW(1);
+  LOAD_ROW(2);
+  LOAD_ROW(3);
+  LOAD_ROW(4);
+  LOAD_ROW(5);
+  LOAD_ROW(6);
+  LOAD_ROW(7);
+
+  out0 = _mm_unpacklo_epi8(in0, pb_zero);
+  out1 = _mm_unpacklo_epi8(in1, pb_zero);
+  out2 = _mm_unpacklo_epi8(in2, pb_zero);
+  out3 = _mm_unpacklo_epi8(in3, pb_zero);
+  out4 = _mm_unpacklo_epi8(in4, pb_zero);
+  out5 = _mm_unpacklo_epi8(in5, pb_zero);
+  out6 = _mm_unpacklo_epi8(in6, pb_zero);
+  out7 = _mm_unpacklo_epi8(in7, pb_zero);
+
+  out0 = _mm_sub_epi16(out0, pw_cj);
+  out1 = _mm_sub_epi16(out1, pw_cj);
+  out2 = _mm_sub_epi16(out2, pw_cj);
+  out3 = _mm_sub_epi16(out3, pw_cj);
+  out4 = _mm_sub_epi16(out4, pw_cj);
+  out5 = _mm_sub_epi16(out5, pw_cj);
+  out6 = _mm_sub_epi16(out6, pw_cj);
+  out7 = _mm_sub_epi16(out7, pw_cj);
+
+  VEC_ST(workspace + 0 * 8, out0);
+  VEC_ST(workspace + 1 * 8, out1);
+  VEC_ST(workspace + 2 * 8, out2);
+  VEC_ST(workspace + 3 * 8, out3);
+  VEC_ST(workspace + 4 * 8, out4);
+  VEC_ST(workspace + 5 * 8, out5);
+  VEC_ST(workspace + 6 * 8, out6);
+  VEC_ST(workspace + 7 * 8, out7);
+}
+
+
+#define WORD_BIT  16
+#define MULTIPLY(vs0, vs1, out) out = _mm_mulhi_epu16(vs0, vs1)
+
+void jsimd_quantize_e2k(JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+
+  row0s = VEC_LD(workspace + 0 * 8);
+  row1s = VEC_LD(workspace + 1 * 8);
+  row2s = VEC_LD(workspace + 2 * 8);
+  row3s = VEC_LD(workspace + 3 * 8);
+  row4s = VEC_LD(workspace + 4 * 8);
+  row5s = VEC_LD(workspace + 5 * 8);
+  row6s = VEC_LD(workspace + 6 * 8);
+  row7s = VEC_LD(workspace + 7 * 8);
+  row0 = _mm_abs_epi16(row0s);
+  row1 = _mm_abs_epi16(row1s);
+  row2 = _mm_abs_epi16(row2s);
+  row3 = _mm_abs_epi16(row3s);
+  row4 = _mm_abs_epi16(row4s);
+  row5 = _mm_abs_epi16(row5s);
+  row6 = _mm_abs_epi16(row6s);
+  row7 = _mm_abs_epi16(row7s);
+
+  corr0 = VEC_LD(divisors + DCTSIZE2 + 0 * 8);
+  corr1 = VEC_LD(divisors + DCTSIZE2 + 1 * 8);
+  corr2 = VEC_LD(divisors + DCTSIZE2 + 2 * 8);
+  corr3 = VEC_LD(divisors + DCTSIZE2 + 3 * 8);
+  corr4 = VEC_LD(divisors + DCTSIZE2 + 4 * 8);
+  corr5 = VEC_LD(divisors + DCTSIZE2 + 5 * 8);
+  corr6 = VEC_LD(divisors + DCTSIZE2 + 6 * 8);
+  corr7 = VEC_LD(divisors + DCTSIZE2 + 7 * 8);
+
+  row0 = _mm_add_epi16(row0, corr0);
+  row1 = _mm_add_epi16(row1, corr1);
+  row2 = _mm_add_epi16(row2, corr2);
+  row3 = _mm_add_epi16(row3, corr3);
+  row4 = _mm_add_epi16(row4, corr4);
+  row5 = _mm_add_epi16(row5, corr5);
+  row6 = _mm_add_epi16(row6, corr6);
+  row7 = _mm_add_epi16(row7, corr7);
+
+  recip0 = VEC_LD(divisors + 0 * 8);
+  recip1 = VEC_LD(divisors + 1 * 8);
+  recip2 = VEC_LD(divisors + 2 * 8);
+  recip3 = VEC_LD(divisors + 3 * 8);
+  recip4 = VEC_LD(divisors + 4 * 8);
+  recip5 = VEC_LD(divisors + 5 * 8);
+  recip6 = VEC_LD(divisors + 6 * 8);
+  recip7 = VEC_LD(divisors + 7 * 8);
+
+  MULTIPLY(row0, recip0, row0);
+  MULTIPLY(row1, recip1, row1);
+  MULTIPLY(row2, recip2, row2);
+  MULTIPLY(row3, recip3, row3);
+  MULTIPLY(row4, recip4, row4);
+  MULTIPLY(row5, recip5, row5);
+  MULTIPLY(row6, recip6, row6);
+  MULTIPLY(row7, recip7, row7);
+
+  scale0 = VEC_LD(divisors + DCTSIZE2 * 2 + 0 * 8);
+  scale1 = VEC_LD(divisors + DCTSIZE2 * 2 + 1 * 8);
+  scale2 = VEC_LD(divisors + DCTSIZE2 * 2 + 2 * 8);
+  scale3 = VEC_LD(divisors + DCTSIZE2 * 2 + 3 * 8);
+  scale4 = VEC_LD(divisors + DCTSIZE2 * 2 + 4 * 8);
+  scale5 = VEC_LD(divisors + DCTSIZE2 * 2 + 5 * 8);
+  scale6 = VEC_LD(divisors + DCTSIZE2 * 2 + 6 * 8);
+  scale7 = VEC_LD(divisors + DCTSIZE2 * 2 + 7 * 8);
+
+  MULTIPLY(row0, scale0, row0);
+  MULTIPLY(row1, scale1, row1);
+  MULTIPLY(row2, scale2, row2);
+  MULTIPLY(row3, scale3, row3);
+  MULTIPLY(row4, scale4, row4);
+  MULTIPLY(row5, scale5, row5);
+  MULTIPLY(row6, scale6, row6);
+  MULTIPLY(row7, scale7, row7);
+
+  row0 = _mm_sign_epi16(row0, row0s);
+  row1 = _mm_sign_epi16(row1, row1s);
+  row2 = _mm_sign_epi16(row2, row2s);
+  row3 = _mm_sign_epi16(row3, row3s);
+  row4 = _mm_sign_epi16(row4, row4s);
+  row5 = _mm_sign_epi16(row5, row5s);
+  row6 = _mm_sign_epi16(row6, row6s);
+  row7 = _mm_sign_epi16(row7, row7s);
+
+  VEC_ST(coef_block + 0 * 8, row0);
+  VEC_ST(coef_block + 1 * 8, row1);
+  VEC_ST(coef_block + 2 * 8, row2);
+  VEC_ST(coef_block + 3 * 8, row3);
+  VEC_ST(coef_block + 4 * 8, row4);
+  VEC_ST(coef_block + 5 * 8, row5);
+  VEC_ST(coef_block + 6 * 8, row6);
+  VEC_ST(coef_block + 7 * 8, row7);
+}
diff --git a/simd/e2k/jsimd.c b/simd/e2k/jsimd.c
new file mode 100644
index 0000000..f8c0465
--- /dev/null
+++ b/simd/e2k/jsimd.c
@@ -0,0 +1,761 @@
+/*
+ * jsimd_e2k.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jsimd_api_e2k.h"
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = JSIMD_SSE2;
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
+#endif
+}
+
+static inline int color_space_idx(J_COLOR_SPACE color_space) {
+  switch (color_space) {
+  case JCS_EXT_RGB:
+    return 1 + (EXT_RGB_PIXELSIZE != 3) * 16;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    return 2 + (EXT_RGBX_PIXELSIZE != 3) * 16;
+  case JCS_EXT_BGR:
+    return 3 + (EXT_BGR_PIXELSIZE != 3) * 16;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    return 4 + (EXT_BGRX_PIXELSIZE != 3) * 16;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    return 5 + (EXT_XBGR_PIXELSIZE != 3) * 16;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    return 6 + (EXT_XRGB_PIXELSIZE != 3) * 16;
+  default:
+    break;
+  }
+  return 0 + (RGB_PIXELSIZE != 3) * 16;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int);
+  int idx = color_space_idx(cinfo->in_color_space);
+
+  e2kfct = idx < 16 ? jsimd_rgb3_ycc_convert_e2k :
+                      jsimd_rgb4_ycc_convert_e2k;
+  idx &= 15;
+
+  e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int);
+  int idx = color_space_idx(cinfo->in_color_space);
+
+  e2kfct = idx < 16 ? jsimd_rgb3_gray_convert_e2k :
+                      jsimd_rgb4_gray_convert_e2k;
+  idx &= 15;
+
+  e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int, int);
+  int idx = color_space_idx(cinfo->out_color_space);
+
+  e2kfct = idx < 16 ? jsimd_ycc_rgb3_convert_e2k :
+                      jsimd_ycc_rgb4_convert_e2k;
+  idx &= 15;
+
+  e2kfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows, idx);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width,
+                          input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width,
+                          input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_e2k(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_e2k(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int);
+  int idx = color_space_idx(cinfo->out_color_space);
+
+  e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k :
+                      jsimd_ycc_rgb4_merged_upsample_e2k;
+  idx &= 15;
+
+  e2kfct(cinfo->output_width, input_buf, in_row_group_ctr,
+         in_row_group_ctr * 2, output_buf, idx);
+  e2kfct(cinfo->output_width, input_buf, in_row_group_ctr,
+         in_row_group_ctr * 2 + 1, output_buf + 1, idx);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int);
+  int idx = color_space_idx(cinfo->out_color_space);
+
+  e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k :
+                      jsimd_ycc_rgb4_merged_upsample_e2k;
+  idx &= 15;
+
+  e2kfct(cinfo->output_width, input_buf, in_row_group_ctr,
+         in_row_group_ctr, output_buf, idx);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_e2k(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  jsimd_convsamp_float_e2k(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_e2k(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_e2k(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  jsimd_fdct_float_e2k(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_e2k(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  jsimd_quantize_float_e2k(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_e2k(compptr->dct_table, coef_block, output_buf,
+                       output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_e2k(compptr->dct_table, coef_block, output_buf,
+                       output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_float_e2k(compptr->dct_table, coef_block, output_buf,
+                       output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_e2k(state, buffer, block, last_dc_val,
+                                         dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if ((simd_support & JSIMD_SSE2) && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, UJCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_e2k(block, jpeg_natural_order_start,
+                                        Sl, Al, (JCOEF*)values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if ((simd_support & JSIMD_SSE2) && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, UJCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_e2k(block,
+                                                jpeg_natural_order_start,
+                                                Sl, Al, (JCOEF*)absvalues, bits);
+}
diff --git a/simd/e2k/jsimd_api_e2k.h b/simd/e2k/jsimd_api_e2k.h
new file mode 100644
index 0000000..d857203
--- /dev/null
+++ b/simd/e2k/jsimd_api_e2k.h
@@ -0,0 +1,94 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* Function declarations */
+
+#define CONVERT_DECL(n) \
+EXTERN(void) jsimd_rgb##n##_ycc_convert_e2k(JDIMENSION img_width, \
+    JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \
+    JDIMENSION output_row, int num_rows, int shuf_idx); \
+EXTERN(void) jsimd_rgb##n##_gray_convert_e2k(JDIMENSION img_width, \
+    JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \
+    JDIMENSION output_row, int num_rows, int shuf_idx); \
+EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \
+    JSAMPIMAGE input_buf, JDIMENSION input_row, \
+    JSAMPARRAY output_buf, int num_rows, int shuf_idx); \
+EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \
+    JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, \
+    int num_rows, int shuf_idx); \
+EXTERN(void) jsimd_ycc_rgb##n##_merged_upsample_e2k(JDIMENSION out_width, \
+    JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, \
+    JDIMENSION in_row_group_ctr_y, JSAMPARRAY output_buf, int shuf_idx); \
+
+CONVERT_DECL(3)
+CONVERT_DECL(4)
+
+EXTERN(void) jsimd_h2v1_downsample_e2k
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample_e2k
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+#define UPSAMPLE_DECL(name) \
+EXTERN(void) jsimd_##name##_upsample_e2k \
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, \
+   JSAMPARRAY *output_data_ptr);
+
+UPSAMPLE_DECL(h2v1)
+UPSAMPLE_DECL(h2v2)
+UPSAMPLE_DECL(h2v1_fancy)
+UPSAMPLE_DECL(h2v2_fancy)
+
+EXTERN(void) jsimd_convsamp_e2k
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_float_e2k
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_fdct_islow_e2k(DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_e2k(DCTELEM *data);
+EXTERN(void) jsimd_fdct_float_e2k(FAST_FLOAT *data);
+EXTERN(void) jsimd_quantize_e2k
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_float_e2k
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_idct_islow_e2k
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_ifast_e2k
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_float_e2k
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_e2k
+  (void *state, JOCTET *buffer, JCOEFPTR block,
+   int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_e2k
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_e2k
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
diff --git a/simd/e2k/jsimd_e2k.h b/simd/e2k/jsimd_e2k.h
new file mode 100644
index 0000000..15d6262
--- /dev/null
+++ b/simd/e2k/jsimd_e2k.h
@@ -0,0 +1,207 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jsimd_api_e2k.h"
+#include <stdint.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+
+/* Common code */
+
+#define __4X2(a, b)  a, b, a, b, a, b, a, b
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
+
+#ifdef __e2k__
+#define PRAGMA_E2K _Pragma
+#define _mm_shuffle2_pi8(a, b, c) \
+    ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
+#define _mm_shuffle2_epi8(a, b, c) \
+    ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
+#define _mm_blendv_pi8(a, b, c) \
+    ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
+#else
+#define PRAGMA_E2K(x)
+#define _mm_shuffle2_pi8(a, b, c) \
+    _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
+        _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
+#define _mm_shuffle2_epi8(a, b, c) \
+    _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
+        _mm_slli_epi16(c, 3))
+#define _mm_blendv_pi8(a, b, c) \
+    _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
+        _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
+
+#define BITREV_ROUND(c, i) a = (a & c) << i | (a >> i & c);
+static ALWAYS_INLINE uint64_t __builtin_e2k_bitrevd(uint64_t a) {
+    BITREV_ROUND(0x5555555555555555ll, 1)
+    BITREV_ROUND(0x3333333333333333ll, 2)
+    BITREV_ROUND(0x0F0F0F0F0F0F0F0Fll, 4)
+    BITREV_ROUND(0x00FF00FF00FF00FFll, 8)
+    BITREV_ROUND(0x0000FFFF0000FFFFll, 16)
+    return a >> 32 | a << 32;
+}
+
+static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
+  int n = b & 63;
+  a = a >> n | a << (64 - n);
+  return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
+}
+#endif
+
+#if defined(__iset__) && __iset__ >= 5
+static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
+  __m128i index = _mm_setr_epi8(
+    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
+  return _mm_shuffle2_epi8(a, b, index);
+}
+
+#define VEC_ISZERO(a) !_mm_cvtsi128_si64(_mm_packs_epi16(a, a))
+#else
+static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
+  union { __m128i v; __m64 d[2]; } l = { a }, h = { b }, x;
+  __m64 index = _mm_setr_pi8(2, 3, 6, 7, 10, 11, 14, 15);
+  x.d[0] = _mm_shuffle2_pi8(l.d[0], l.d[1], index);
+  x.d[1] = _mm_shuffle2_pi8(h.d[0], h.d[1], index);
+  return x.v;
+}
+
+static ALWAYS_INLINE uint64_t vec_isnonzero(__m128i a) {
+  __v2di x = (__v2di)a;
+  return x[0] | x[1];
+}
+
+#define VEC_ISZERO(a) !vec_isnonzero(a)
+#endif
+
+#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1))
+
+#define TRANSPOSE_FLOAT(a, b, c, d, e, f, g, h) \
+  tmp0 = _mm_unpacklo_ps(a, b); \
+  tmp1 = _mm_unpackhi_ps(a, b); \
+  tmp2 = _mm_unpacklo_ps(c, d); \
+  tmp3 = _mm_unpackhi_ps(c, d); \
+  e = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \
+  f = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \
+  g = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); \
+  h = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+
+#define TRANSPOSE8(a, b) \
+  b##0 = _mm_unpacklo_epi8(a##0, a##2); \
+  b##1 = _mm_unpackhi_epi8(a##0, a##2); \
+  b##2 = _mm_unpacklo_epi8(a##1, a##3); \
+  b##3 = _mm_unpackhi_epi8(a##1, a##3);
+
+#define TRANSPOSE16(a, b) \
+  b##0 = _mm_unpacklo_epi16(a##0, a##2); \
+  b##1 = _mm_unpackhi_epi16(a##0, a##2); \
+  b##2 = _mm_unpacklo_epi16(a##1, a##3); \
+  b##3 = _mm_unpackhi_epi16(a##1, a##3); \
+  b##4 = _mm_unpacklo_epi16(a##4, a##6); \
+  b##5 = _mm_unpackhi_epi16(a##4, a##6); \
+  b##6 = _mm_unpacklo_epi16(a##5, a##7); \
+  b##7 = _mm_unpackhi_epi16(a##5, a##7);
+
+#define TRANSPOSE(a, b) \
+  TRANSPOSE16(a, b) TRANSPOSE16(b, a) \
+  b##0 = _mm_unpacklo_epi64(a##0, a##4); \
+  b##1 = _mm_unpackhi_epi64(a##0, a##4); \
+  b##2 = _mm_unpacklo_epi64(a##1, a##5); \
+  b##3 = _mm_unpackhi_epi64(a##1, a##5); \
+  b##4 = _mm_unpacklo_epi64(a##2, a##6); \
+  b##5 = _mm_unpackhi_epi64(a##2, a##6); \
+  b##6 = _mm_unpacklo_epi64(a##3, a##7); \
+  b##7 = _mm_unpackhi_epi64(a##3, a##7);
+
+#define IDCT_SAVE() { \
+  __m128i pb_cj = _mm_set1_epi8((int8_t)CENTERJSAMPLE); \
+  \
+  row0 = _mm_xor_si128(_mm_packs_epi16(out0, out1), pb_cj); \
+  row1 = _mm_xor_si128(_mm_packs_epi16(out2, out3), pb_cj); \
+  row2 = _mm_xor_si128(_mm_packs_epi16(out4, out5), pb_cj); \
+  row3 = _mm_xor_si128(_mm_packs_epi16(out6, out7), pb_cj); \
+  \
+  TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col) \
+  \
+  VEC_STL(output_buf[0] + output_col, col0); \
+  VEC_STH(output_buf[1] + output_col, col0); \
+  VEC_STL(output_buf[2] + output_col, col1); \
+  VEC_STH(output_buf[3] + output_col, col1); \
+  VEC_STL(output_buf[4] + output_col, col2); \
+  VEC_STH(output_buf[5] + output_col, col2); \
+  VEC_STL(output_buf[6] + output_col, col3); \
+  VEC_STH(output_buf[7] + output_col, col3); \
+}
+
+#define IDCT_SPLAT8(col0) { \
+  row3 = _mm_unpacklo_epi16(col0, col0); \
+  row7 = _mm_unpackhi_epi16(col0, col0); \
+  row1 = _mm_unpacklo_epi16(row3, row3); \
+  row3 = _mm_unpackhi_epi16(row3, row3); \
+  row5 = _mm_unpacklo_epi16(row7, row7); \
+  row7 = _mm_unpackhi_epi16(row7, row7); \
+  row0 = _mm_unpacklo_epi64(row1, row1); \
+  row1 = _mm_unpackhi_epi64(row1, row1); \
+  row2 = _mm_unpacklo_epi64(row3, row3); \
+  row3 = _mm_unpackhi_epi64(row3, row3); \
+  row4 = _mm_unpacklo_epi64(row5, row5); \
+  row5 = _mm_unpackhi_epi64(row5, row5); \
+  row6 = _mm_unpacklo_epi64(row7, row7); \
+  row7 = _mm_unpackhi_epi64(row7, row7); \
+}
+
+#ifndef min
+#define min(a, b)  ((a) < (b) ? (a) : (b))
+#endif
+
+#define VEC_LD(a)     _mm_loadu_si128((const __m128i*)(a))
+#define VEC_ST(a, b)  _mm_storeu_si128((__m128i*)(a), b)
+#define VEC_LD8(a)    _mm_loadl_epi64((const __m128i*)(a))
+#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b)
+#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b));
+#define VEC_SPLAT(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8))
+
+#if !defined(__iset__) || __iset__ < 5
+#define NEED_ALIGN8
+#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1;
+#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index;
+#define ALIGN8_START(ptr, src) \
+  src_shr = (intptr_t)(ptr - 1) & 7; \
+  src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \
+  src##_next = src##_ptr[src_shr == 7]; \
+  src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \
+                            _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8));
+#define ALIGN8_READ16(v0, src, i) \
+  src_tmp1 = src##_ptr[i * 2 + 1]; \
+  src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \
+  src##_next = src##_ptr[i * 2 + 2]; \
+  src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \
+  v0 = _mm_setr_epi64(src_tmp0, src_tmp1);
+#endif
+
-- 
2.34.1