diff --git a/module/amd64/Makefile.am b/module/amd64/Makefile.am index be4d928..ed2d7c6 100644 --- a/module/amd64/Makefile.am +++ b/module/amd64/Makefile.am @@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_AMD64 ASMSOURCES = \ a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \ a8r8g8b8_to_nv12_box_amd64_sse2.asm \ + a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm \ a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \ cpuid_amd64.asm \ i420_to_rgb32_amd64_sse2.asm \ diff --git a/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm new file mode 100644 index 0000000..c18e9d6 --- /dev/null +++ b/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm @@ -0,0 +1,304 @@ +; +;Copyright 2015 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+;
+;ARGB to NV12 709 full range
+;amd64 SSE2, only xmm registers are used so no emms is needed on return
+;
+; notes
+;   address s8 should be aligned on 16 bytes, will be slower if not
+;   width should be multiple of 8 and > 0
+;   height should be even and > 0
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255       ; dword mask, keeps one 8 bit channel
+
+    cw255  times 8 dw 255       ; word mask, keeps the low byte of a word
+    cw128  times 8 dw 128       ; U blue / V red weight, also the +128 bias
+    cw54   times 8 dw 54        ; Y red weight
+    cw183  times 8 dw 183       ; Y green weight
+    cw18   times 8 dw 18        ; Y blue weight
+    cw29   times 8 dw 29        ; U red weight (subtracted)
+    cw99   times 8 dw 99        ; U green weight (subtracted)
+    cw116  times 8 dw 116       ; V green weight (subtracted)
+    cw12   times 8 dw 12        ; V blue weight (subtracted)
+    cw2    times 8 dw 2         ; rounding term for the divide by 4
+
+%define LS8            [rsp + 0]   ; s8
+%define LSRC_STRIDE    [rsp + 8]   ; src_stride
+%define LD8_Y          [rsp + 16]  ; d8_y
+%define LDST_Y_STRIDE  [rsp + 24]  ; dst_stride_y
+%define LD8_UV         [rsp + 32]  ; d8_uv
+%define LDST_UV_STRIDE [rsp + 40]  ; dst_stride_uv
+%define LU1            [rsp + 48]  ; first line U, 8 bytes
+%define LV1            [rsp + 56]  ; first line V, 8 bytes
+%define LU2            [rsp + 64]  ; second line U, 8 bytes
+%define LV2            [rsp + 72]  ; second line V, 8 bytes
+
+%define LWIDTH         [rsp + 104] ; width, 80 locals + rbp + rbx + return
+%define LHEIGHT        [rsp + 112] ; height, next stack argument
+
+;The first six integer or pointer arguments are passed in registers
+; RDI, RSI, RDX, RCX, R8, and R9; width and height are read from the stack
+; always returns 0 in rax, rbx and rbp are saved and restored
+;int
+;a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const char *s8, int src_stride,
+;                                      char *d8_y, int dst_stride_y,
+;                                      char *d8_uv, int dst_stride_uv,
+;                                      int width, int height);
+PROC a8r8g8b8_to_nv12_709fr_box_amd64_sse2
+    push rbx
+    push rbp                    ; not used below, part of the 104 offset
+    sub rsp, 80                 ; local vars, 80 bytes
+
+    mov LS8, rdi                ; s8
+    mov LSRC_STRIDE, rsi        ; src_stride, all 64 bits are stored
+    mov LD8_Y, rdx              ; d8_y
+    mov LDST_Y_STRIDE, rcx      ; dst_stride_y, all 64 bits are stored
+    mov LD8_UV, r8              ; d8_uv
+    mov LDST_UV_STRIDE, r9      ; dst_stride_uv, all 64 bits are stored
+
+    pxor xmm7, xmm7             ; xmm7 = 0 for the whole function
+
+    mov ebx, LHEIGHT            ; ebx = height
+    shr ebx, 1                  ; doing 2 lines at a time
+
+row_loop1:
+    mov rsi, LS8                ; s8
+    mov rdi, LD8_Y              ; d8_y
+    mov rdx, LD8_UV             ; d8_uv
+
+    mov ecx, LWIDTH             ; ecx = width
+    shr ecx, 3                  ; doing 8 pixels at a time
+
+loop1:
+    ; first line
+    movdqu xmm0, [rsi]          ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0           ; blue
+    pand xmm1, [lsym(cd255)]    ; blue
+    movdqa xmm2, xmm0           ; green
+    psrld xmm2, 8               ; green
+    pand xmm2, [lsym(cd255)]    ; green
+    movdqa xmm3, xmm0           ; red
+    psrld xmm3, 16              ; red
+    pand xmm3, [lsym(cd255)]    ; red
+
+    movdqu xmm0, [rsi + 16]     ; 4 pixels, 16 bytes
+    movdqa xmm4, xmm0           ; blue
+    pand xmm4, [lsym(cd255)]    ; blue
+    movdqa xmm5, xmm0           ; green
+    psrld xmm5, 8               ; green
+    pand xmm5, [lsym(cd255)]    ; green
+    movdqa xmm6, xmm0           ; red
+    psrld xmm6, 16              ; red
+    pand xmm6, [lsym(cd255)]    ; red
+
+    packssdw xmm1, xmm4         ; xmm1 = 8 blues
+    packssdw xmm2, xmm5         ; xmm2 = 8 greens
+    packssdw xmm3, xmm6         ; xmm3 = 8 reds
+
+    ; _Y = (( 54 * _R + 183 * _G +  18 * _B) >> 8);
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw18)]
+    pmullw xmm5, [lsym(cw183)]
+    pmullw xmm6, [lsym(cw54)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8               ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7
+    movq [rdi], xmm4            ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-29 * _R -  99 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw99)]
+    pmullw xmm6, [lsym(cw29)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8               ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LU1, xmm4              ; save for later
+
+    ; _V = ((128 * _R - 116 * _G -  12 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm4, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw116)]
+    pmullw xmm6, [lsym(cw12)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8               ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LV1, xmm4              ; save for later
+
+    ; go down to second line, NOTE(review): 64 bit adds of int strides
+    add rsi, LSRC_STRIDE
+    add rdi, LDST_Y_STRIDE
+
+    ; second line
+    movdqu xmm0, [rsi]          ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0           ; blue
+    pand xmm1, [lsym(cd255)]    ; blue
+    movdqa xmm2, xmm0           ; green
+    psrld xmm2, 8               ; green
+    pand xmm2, [lsym(cd255)]    ; green
+    movdqa xmm3, xmm0           ; red
+    psrld xmm3, 16              ; red
+    pand xmm3, [lsym(cd255)]    ; red
+
+    movdqu xmm0, [rsi + 16]     ; 4 pixels, 16 bytes
+    movdqa xmm4, xmm0           ; blue
+    pand xmm4, [lsym(cd255)]    ; blue
+    movdqa xmm5, xmm0           ; green
+    psrld xmm5, 8               ; green
+    pand xmm5, [lsym(cd255)]    ; green
+    movdqa xmm6, xmm0           ; red
+    psrld xmm6, 16              ; red
+    pand xmm6, [lsym(cd255)]    ; red
+
+    packssdw xmm1, xmm4         ; xmm1 = 8 blues
+    packssdw xmm2, xmm5         ; xmm2 = 8 greens
+    packssdw xmm3, xmm6         ; xmm3 = 8 reds
+
+    ; _Y = (( 54 * _R + 183 * _G +  18 * _B) >> 8);
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw18)]
+    pmullw xmm5, [lsym(cw183)]
+    pmullw xmm6, [lsym(cw54)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8               ; logical shift, the sum is unsigned
+    packuswb xmm4, xmm7
+    movq [rdi], xmm4            ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-29 * _R -  99 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm6, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw99)]
+    pmullw xmm6, [lsym(cw29)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8               ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LU2, xmm4              ; save for later
+
+    ; _V = ((128 * _R - 116 * _G -  12 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1           ; blue
+    movdqa xmm5, xmm2           ; green
+    movdqa xmm4, xmm3           ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw116)]
+    pmullw xmm6, [lsym(cw12)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8               ; arithmetic shift, the sum is signed
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LV2, xmm4              ; save for later
+
+    ; uv add and divide(average), xmm1-xmm4 are free here, xmm7 stays 0
+    movq xmm1, LU1              ; u from first line, upper 8 bytes zero
+    movdqa xmm3, xmm1
+    pand xmm1, [lsym(cw255)]    ; even bytes as words
+    psrlw xmm3, 8               ; odd bytes as words
+    pand xmm3, [lsym(cw255)]
+    paddw xmm1, xmm3            ; add
+    movq xmm2, LU2              ; u from second line
+    movdqa xmm3, xmm2
+    pand xmm2, [lsym(cw255)]
+    paddw xmm1, xmm2            ; add
+    psrlw xmm3, 8
+    pand xmm3, [lsym(cw255)]
+    paddw xmm1, xmm3            ; add
+    paddw xmm1, [lsym(cw2)]     ; add 2
+    psrlw xmm1, 2               ; div 4
+
+    movq xmm2, LV1              ; v from first line, upper 8 bytes zero
+    movdqa xmm4, xmm2
+    pand xmm2, [lsym(cw255)]    ; even bytes as words
+    psrlw xmm4, 8               ; odd bytes as words
+    pand xmm4, [lsym(cw255)]
+    paddw xmm2, xmm4            ; add
+    movq xmm3, LV2              ; v from second line
+    movdqa xmm4, xmm3
+    pand xmm3, [lsym(cw255)]
+    paddw xmm2, xmm3            ; add
+    psrlw xmm4, 8
+    pand xmm4, [lsym(cw255)]
+    paddw xmm2, xmm4            ; add
+    paddw xmm2, [lsym(cw2)]     ; add 2
+    psrlw xmm2, 2               ; div 4
+
+    packuswb xmm1, xmm1         ; 4 averaged u in the low 4 bytes
+    packuswb xmm2, xmm2         ; 4 averaged v in the low 4 bytes
+
+    punpcklbw xmm1, xmm2        ; uv
+    movq [rdx], xmm1            ; out 8 bytes uvuvuvuv
+
+    ; go up to first line
+    sub rsi, LSRC_STRIDE
+    sub rdi, LDST_Y_STRIDE
+
+    ; move right
+    lea rsi, [rsi + 32]
+    lea rdi, [rdi + 8]
+    lea rdx, [rdx + 8]
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov rax, LS8                ; s8
+    add rax, LSRC_STRIDE        ; s8 += src_stride
+    add rax, LSRC_STRIDE        ; s8 += src_stride
+    mov LS8, rax
+
+    ; update d8_y
+    mov rax, LD8_Y              ; d8_y
+    add rax, LDST_Y_STRIDE      ; d8_y += dst_stride_y
+    add rax, LDST_Y_STRIDE      ; d8_y += dst_stride_y
+    mov LD8_Y, rax
+
+    ; update d8_uv
+    mov rax, LD8_UV             ; d8_uv
+    add rax, LDST_UV_STRIDE     ; d8_uv += dst_stride_uv
+    mov LD8_UV, rax
+
+    dec ebx
+    jnz row_loop1
+
+    xor eax, eax                ; return value 0
+    add rsp, 80                 ; local vars, 80 bytes
+    pop rbp
+    pop rbx
+    ret
+END_OF_FILE
diff --git a/module/amd64/funcs_amd64.h b/module/amd64/funcs_amd64.h index 3b54e2b..9d746fd 100644 --- a/module/amd64/funcs_amd64.h +++ b/module/amd64/funcs_amd64.h @@ -44,6 +44,11 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride, uint8_t *d8_uv, int dst_stride_uv, int width, int height); int +a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height); +int a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride, uint8_t *d8, int dst_stride, int width, int height); diff --git a/module/rdp.h b/module/rdp.h index f9e4ac6..9122bc9 100644 --- a/module/rdp.h +++ b/module/rdp.h @@ -297,6 +297,7 @@ struct _rdpRec copy_box_proc a8r8g8b8_to_a8b8g8r8_box; copy_box_dst2_proc a8r8g8b8_to_nv12_box; + copy_box_dst2_proc a8r8g8b8_to_nv12_709fr_box; copy_box_proc a8r8g8b8_to_yuvalp_box; /* multimon */ diff --git a/module/rdpCapture.c b/module/rdpCapture.c index
893917c..7591af4 100644 --- a/module/rdpCapture.c +++ b/module/rdpCapture.c @@ -553,6 +553,103 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride, return 0; }
+/******************************************************************************/
+/* convert one a8r8g8b8 pixel with the 709 full range integer formulas
+   the clamped Y is stored in y8[0]
+   the clamped U and V are added to u_sum[0] and v_sum[0] so the caller
+   can average them over a 2x2 block */
+static void
+a8r8g8b8_to_yuv_709fr_pixel(uint32_t pixel, uint8_t *y8,
+                            int *u_sum, int *v_sum)
+{
+    int R;
+    int G;
+    int B;
+    int Y;
+    int U;
+    int V;
+
+    /* pixel is unsigned so a set alpha byte never shifts in sign bits */
+    R = (pixel >> 16) & 0xff;
+    G = (pixel >> 8) & 0xff;
+    B = (pixel >> 0) & 0xff;
+
+    /* coefficients are scaled by 256 and >> 8 scales the result back,
+       the U and V sums can be negative before 128 is added */
+    Y = ( 54 * R + 183 * G +  18 * B) >> 8;
+    U = ((-29 * R -  99 * G + 128 * B) >> 8) + 128;
+    V = ((128 * R - 116 * G -  12 * B) >> 8) + 128;
+
+    y8[0] = RDPCLAMP(Y, 0, 255);
+    u_sum[0] += RDPCLAMP(U, 0, 255);
+    v_sum[0] += RDPCLAMP(V, 0, 255);
+}
+
+/******************************************************************************/
+/* convert a box of a8r8g8b8 pixels to NV12, 709 full range
+   s8 - first source pixel of the box, 4 bytes per pixel
+   src_stride - bytes from one source line to the next
+   d8_y - first Y byte of the box, one byte per pixel
+   dst_stride_y - bytes from one Y line to the next
+   d8_uv - first U byte of the box, one U, V byte pair per 2x2 pixels
+   dst_stride_uv - bytes from one UV line to the next
+   width, height - size of the box in pixels
+   the box is walked in 2x2 steps, an odd width or height reads and
+   writes one pixel past the box
+   one Y byte is written for every pixel
+   each U and V byte is the rounded average of the four clamped values
+   of its 2x2 block, the formulas are in a8r8g8b8_to_yuv_709fr_pixel
+   always returns 0 */
+int
+a8r8g8b8_to_nv12_709fr_box(const uint8_t *s8, int src_stride,
+                           uint8_t *d8_y, int dst_stride_y,
+                           uint8_t *d8_uv, int dst_stride_uv,
+                           int width, int height)
+{
+    int index;
+    int jndex;
+    int U_sum;
+    int V_sum;
+    const uint32_t *s32a;
+    const uint32_t *s32b;
+    uint8_t *d8ya;
+    uint8_t *d8yb;
+    uint8_t *d8uv;
+
+    for (jndex = 0; jndex < height; jndex += 2)
+    {
+        /* a is the upper line of the pair, b is the line below it */
+        s32a = (const uint32_t *) (s8 + src_stride * jndex);
+        s32b = (const uint32_t *) (s8 + src_stride * (jndex + 1));
+        d8ya = d8_y + dst_stride_y * jndex;
+        d8yb = d8_y + dst_stride_y * (jndex + 1);
+        d8uv = d8_uv + dst_stride_uv * (jndex / 2);
+        for (index = 0; index < width; index += 2)
+        {
+            U_sum = 0;
+            V_sum = 0;
+
+            /* two pixels of the upper line, then two of the lower line */
+            a8r8g8b8_to_yuv_709fr_pixel(s32a[0], d8ya + 0, &U_sum, &V_sum);
+            a8r8g8b8_to_yuv_709fr_pixel(s32a[1], d8ya + 1, &U_sum, &V_sum);
+            a8r8g8b8_to_yuv_709fr_pixel(s32b[0], d8yb + 0, &U_sum, &V_sum);
+            a8r8g8b8_to_yuv_709fr_pixel(s32b[1], d8yb + 1, &U_sum, &V_sum);
+
+            /* step every cursor over the block just converted */
+            s32a += 2;
+            s32b += 2;
+            d8ya += 2;
+            d8yb += 2;
+
+            /* add 2 before the divide so the average is rounded */
+            d8uv[0] = (U_sum + 2) / 4;
+            d8uv[1] = (V_sum + 2) / 4;
+            d8uv += 2;
+        }
+    }
+    return 0;
+}
+
/******************************************************************************/ /* copy rects with no error checking */ static int @@ -590,6 +687,44 @@ rdpCopyBox_a8r8g8b8_to_nv12(rdpClientCon *clientCon, return 0; }
+/******************************************************************************/
+/* copy rects with no error checking, dst is NV12 709 full range */
+static int
+rdpCopyBox_a8r8g8b8_to_nv12_709fr(rdpClientCon *clientCon,
+                                  const uint8_t *src, int src_stride,
+                                  int srcx, int srcy,
+                                  uint8_t *dst_y, int dst_stride_y,
+                                  uint8_t *dst_uv, int dst_stride_uv,
+                                  int dstx, int dsty,
+                                  BoxPtr rects, int num_rects)
+{
+    BoxPtr box;
+    int index;
+    int dx;
+    int dy;
+    const uint8_t *s8;
+    uint8_t *d8_y;
+    uint8_t *d8_uv;
+
+    for (index = 0; index < num_rects; index++)
+    {
+        box = rects + index;
+        dx = box->x1 - dstx;
+        dy = box->y1 - dsty;
+        /* 4 source bytes per pixel, 1 Y byte per pixel, the UV plane has
+           a 2 byte U, V pair per 2 pixels so dx is its byte offset too */
+        s8 = src + (box->y1 - srcy) * src_stride + (box->x1 - srcx) * 4;
+        d8_y = dst_y + dy * dst_stride_y + dx;
+        d8_uv = dst_uv + (dy / 2) * dst_stride_uv + dx;
+        clientCon->dev->a8r8g8b8_to_nv12_709fr_box(s8, src_stride,
+                                                   d8_y, dst_stride_y,
+                                                   d8_uv, dst_stride_uv,
+                                                   box->x2 - box->x1,
+                                                   box->y2 - box->y1);
+    }
+    return 0;
+}
+
/******************************************************************************/ static Bool isShmStatusActive(enum shared_memory_status status) { @@ -623,8 +758,8 @@ wyhash_rfx_tile(const uint8_t *src, int
src_stride, int x, int y, uint64_t seed) /******************************************************************************/ static Bool -rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSimple(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -637,10 +772,10 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture0:")); + LLOGLN(10, ("rdpCaptureSimple:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture0: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureSimple: WARNING -- Shared memory is not configured." " Aborting capture!")); return FALSE; } @@ -708,7 +843,7 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture0: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSimple: unimplemented color conversion")); } return rv; } @@ -716,8 +851,8 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /******************************************************************************/ /* make out_rects always multiple of 16 width and height */ static Bool -rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSufA16(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -748,10 +883,10 @@ rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture1:")); + LLOGLN(10, ("rdpCaptureSufA16:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture1: WARNING -- Shared memory is not configured." 
+ LLOGLN(0, ("rdpCaptureSufA16: WARNING -- Shared memory is not configured." " Aborting capture!")); return FALSE; } @@ -857,15 +992,15 @@ rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture1: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSufA16: unimplemented color conversion")); } return rv; } /******************************************************************************/ static Bool -rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureGfxPro(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { int x; int y; @@ -887,11 +1022,11 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int num_crcs; int mon_index; - LLOGLN(10, ("rdpCapture2:")); + LLOGLN(10, ("rdpCaptureGfxPro:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture2: WARNING -- Shared memory is not configured" + LLOGLN(0, ("rdpCaptureGfxPro: WARNING -- Shared memory is not configured" " for RFX. 
Aborting capture!")); return FALSE; } @@ -917,7 +1052,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, num_crcs = crc_stride * ((id->height + 63) / 64); if (num_crcs != clientCon->num_rfx_crcs_alloc[mon_index]) { - LLOGLN(0, ("rdpCapture2: resize the crc list was %d now %d", + LLOGLN(0, ("rdpCaptureGfxPro: resize the crc list was %d now %d", clientCon->num_rfx_crcs_alloc[mon_index], num_crcs)); /* resize the crc list */ clientCon->num_rfx_crcs_alloc[mon_index] = num_crcs; @@ -937,11 +1072,11 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, rect.x2 = rect.x1 + XRDP_RFX_ALIGN; rect.y2 = rect.y1 + XRDP_RFX_ALIGN; rcode = rdpRegionContainsRect(in_reg, &rect); - LLOGLN(10, ("rdpCapture2: rcode %d", rcode)); + LLOGLN(10, ("rdpCaptureGfxPro: rcode %d", rcode)); if (rcode == rgnOUT) { - LLOGLN(10, ("rdpCapture2: rgnOUT")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnOUT")); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionSubtract(in_reg, in_reg, &tile_reg); rdpRegionUninit(&tile_reg); @@ -952,7 +1087,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, crc = WYHASH_SEED; if (rcode == rgnPART) { - LLOGLN(10, ("rdpCapture2: rgnPART")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnPART")); rdpFillBox_yuvalp(x, y, dst, dst_stride); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionIntersect(&tile_reg, in_reg, &tile_reg); @@ -969,16 +1104,16 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else /* rgnIN */ { - LLOGLN(10, ("rdpCapture2: rgnIN")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnIN")); crc = wyhash_rfx_tile(src, src_stride, x, y, crc); } crc_offset = (y / XRDP_RFX_ALIGN) * crc_stride + (x / XRDP_RFX_ALIGN); - LLOGLN(10, ("rdpCapture2: crc 0x%" PRIx64 " 0x%" PRIx64, + LLOGLN(10, ("rdpCaptureGfxPro: crc 0x%" PRIx64 " 0x%" PRIx64, crc, clientCon->rfx_crcs[mon_index][crc_offset])); if (crc == clientCon->rfx_crcs[mon_index][crc_offset]) { - LLOGLN(10, ("rdpCapture2: crc skip at x %d y %d", x, 
y)); + LLOGLN(10, ("rdpCaptureGfxPro: crc skip at x %d y %d", x, y)); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionSubtract(in_reg, in_reg, &tile_reg); rdpRegionUninit(&tile_reg); @@ -1015,8 +1150,8 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /******************************************************************************/ /* make out_rects always multiple of 2 width and height */ static Bool -rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSufA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -1030,11 +1165,11 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture3:")); + LLOGLN(10, ("rdpCaptureSufA2:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture3: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureSufA2: WARNING -- Shared memory is not configured." 
" Aborting capture!")); return FALSE; } @@ -1094,7 +1229,100 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture3: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSufA2: unimplemented color conversion")); + } + + return rv; +} + +/******************************************************************************/ +/* make out_rects always multiple of 2 width and height */ +static Bool +rdpCaptureGfxA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) +{ + BoxPtr psrc_rects; + BoxRec rect; + int num_rects; + int index; + uint8_t *dst_uv; + Bool rv; + const uint8_t *src; + uint8_t *dst; + int src_stride; + int dst_stride; + int dst_format; + + LLOGLN(10, ("rdpCaptureGfxA2:")); + + if (!isShmStatusActive(clientCon->shmemstatus)) + { + LLOGLN(0, ("rdpCaptureGfxA2: WARNING -- Shared memory is not configured." + " Aborting capture!")); + return FALSE; + } + + rv = TRUE; + + rdpRegionTranslate(in_reg, -id->left, -id->top); + + num_rects = REGION_NUM_RECTS(in_reg); + psrc_rects = REGION_RECTS(in_reg); + + if (num_rects < 1) + { + return FALSE; + } + + *num_out_rects = num_rects; + + *out_rects = g_new(BoxRec, num_rects * 4); + index = 0; + while (index < num_rects) + { + rect = psrc_rects[index]; + LLOGLN(10, ("old x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.y1, + rect.x2, rect.y2)); + rect.x1 -= rect.x1 & 1; + rect.y1 -= rect.y1 & 1; + rect.x2 += rect.x2 & 1; + rect.y2 += rect.y2 & 1; + if (rect.x2 > id->width) + { + rect.x2 = id->width & ~1; + } + if (rect.y2 > id->height) + { + rect.y2 = id->height & ~1; + } + LLOGLN(10, ("new x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.y1, + rect.x2, rect.y2)); + (*out_rects)[index] = rect; + index++; + } + + src = id->pixels; + dst = id->shmem_pixels; + dst_format = clientCon->rdp_format; + src_stride = id->lineBytes; + dst_stride = id->width; + + src = src + src_stride * id->top + id->left * 4; + + if (dst_format == 
XRDP_nv12_709fr) + { + dst_uv = dst; + dst_uv += id->width * id->height; + rdpCopyBox_a8r8g8b8_to_nv12_709fr(clientCon, + src, src_stride, 0, 0, + dst, dst_stride, + dst_uv, dst_stride, + 0, 0, + *out_rects, num_rects); + } + else + { + LLOGLN(0, ("rdpCaptureGfxA2: unimplemented color conversion")); } return rv; @@ -1166,7 +1394,7 @@ Bool rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int *num_out_rects, struct image_data *id) { - int mode; + enum xrdp_capture_code mode; LLOGLN(10, ("rdpCapture:")); mode = clientCon->client_info.capture_code; @@ -1183,18 +1411,20 @@ rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } switch (mode) { - case 0: - return rdpCapture0(clientCon, in_reg, out_rects, num_out_rects, id); - case 1: - return rdpCapture1(clientCon, in_reg, out_rects, num_out_rects, id); - case 2: - case 4: - /* used for remotefx capture */ - return rdpCapture2(clientCon, in_reg, out_rects, num_out_rects, id); - case 3: - case 5: + case CC_SIMPLE: + return rdpCaptureSimple(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_A16: + return rdpCaptureSufA16(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_RFX: /* surface command RFX */ + /* FALLTHROUGH */ + case CC_GFX_PRO: /* GFX progressive */ + return rdpCaptureGfxPro(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_A2: /* surface command h264 */ + /* used for even align capture */ + return rdpCaptureSufA2(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_GFX_A2: /* GFX h264 */ /* used for even align capture */ - return rdpCapture3(clientCon, in_reg, out_rects, num_out_rects, id); + return rdpCaptureGfxA2(clientCon, in_reg, out_rects, num_out_rects, id); default: LLOGLN(0, ("rdpCapture: mode %d not implemented", mode)); break; diff --git a/module/rdpCapture.h b/module/rdpCapture.h index 5bf3fc7..eadfe24 100644 --- a/module/rdpCapture.h +++ b/module/rdpCapture.h @@ -50,6 +50,11 @@ a8r8g8b8_to_nv12_box(const 
uint8_t *s8, int src_stride, uint8_t *d8_uv, int dst_stride_uv, int width, int height); extern _X_EXPORT int +a8r8g8b8_to_nv12_709fr_box(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height); +extern _X_EXPORT int a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride, uint8_t *d8, int dst_stride, int width, int height); diff --git a/module/rdpClientCon.c b/module/rdpClientCon.c index 4a6faec..85f72c8 100644 --- a/module/rdpClientCon.c +++ b/module/rdpClientCon.c @@ -776,8 +776,8 @@ rdpClientConResizeAllMemoryAreas(rdpPtr dev, rdpClientCon *clientCon) clientCon->rdp_height = height; /* Set the capture parameters */ - if ((clientCon->client_info.capture_code == 2) || /* RFX */ - (clientCon->client_info.capture_code == 4)) + if ((clientCon->client_info.capture_code == CC_SUF_RFX) || /* RFX */ + (clientCon->client_info.capture_code == CC_GFX_PRO)) { LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got RFX capture")); /* RFX capture needs fixed-size rectangles */ @@ -793,8 +793,8 @@ rdpClientConResizeAllMemoryAreas(rdpPtr dev, rdpClientCon *clientCon) clientCon->cap_stride_bytes = clientCon->cap_width * 4; shmemstatus = SHM_RFX_ACTIVE_PENDING; } - else if ((clientCon->client_info.capture_code == 3) || /* H264 */ - (clientCon->client_info.capture_code == 5)) + else if ((clientCon->client_info.capture_code == CC_SUF_A2) || /* H264 */ + (clientCon->client_info.capture_code == CC_GFX_A2)) { LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got H264 capture")); clientCon->cap_width = width; @@ -1011,12 +1011,12 @@ rdpSendMemoryAllocationComplete(rdpPtr dev, rdpClientCon *clientCon) switch (clientCon->client_info.capture_code) { - case 2: - case 4: + case CC_SUF_RFX: + case CC_GFX_PRO: alignment = XRDP_RFX_ALIGN; break; - case 3: - case 5: + case CC_SUF_A2: + case CC_GFX_A2: alignment = XRDP_H264_ALIGN; break; default: @@ -2608,7 +2608,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon 
*clientCon, int num_rects_d; int num_rects_c; struct stream *s; - int capture_code; + enum xrdp_capture_code capture_code; int start_frame_bytes; int wiretosurface1_bytes; int wiretosurface2_bytes; @@ -2625,6 +2625,8 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, id->flags, id->left, id->top, id->width, id->height)); capture_code = clientCon->client_info.capture_code; + LLOGLN(10, ("rdpClientConSendPaintRectShmFd: capture_code %d", + capture_code)); num_rects_d = REGION_NUM_RECTS(dirtyReg); num_rects_c = numCopyRects; @@ -2636,7 +2638,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, rdpClientConBeginUpdate(dev, clientCon); - if (capture_code < 4) + if (capture_code < CC_GFX_PRO) { /* non gfx */ size = 2 + 2 + 2 + num_rects_d * 8 + 2 + num_rects_c * 8; @@ -2656,7 +2658,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, out_uint32_le(s, clientCon->rect_id); out_uint32_le(s, id->shmem_bytes); out_uint32_le(s, id->shmem_offset); - if (capture_code == 2) /* rfx */ + if (capture_code == CC_SUF_RFX) /* rfx */ { out_uint16_le(s, id->left); out_uint16_le(s, id->top); @@ -2673,7 +2675,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, rdpClientConSendPending(clientCon->dev, clientCon); g_sck_send_fd_set(clientCon->sck, "int", 4, &(id->shmem_fd), 1); } - else if (capture_code == 4) /* gfx pro rfx */ + else if (capture_code == CC_GFX_PRO) /* gfx pro rfx */ { start_frame_bytes = 8 + 8; wiretosurface2_bytes = 8 + 13 + @@ -2745,7 +2747,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, out_uint32_le(s, 0); /* shmem_bytes */ } } - else if (capture_code == 5) /* gfx h264 */ + else if (capture_code == CC_GFX_A2) /* gfx h264 */ { start_frame_bytes = 8 + 8; wiretosurface1_bytes = 8 + 9 + diff --git a/module/rdpSimd.c b/module/rdpSimd.c index 05dd6a0..59feb9d 100644 --- a/module/rdpSimd.c +++ b/module/rdpSimd.c @@ -65,6 +65,92 @@ int g_simd_use_accel = 1; #if SIMD_USE_ACCEL 
#if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64)
+/******************************************************************************/
+/* ARGB to NV12, amd64: run the SSE2 routine on the leading columns, a
+   multiple of 8 pixels wide, then the portable C routine on the columns
+   that are left over.
+   Returns 0 when height <= 0 or when every routine called returned 0,
+   otherwise the first non zero value returned. */
+static int
+a8r8g8b8_to_nv12_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
+                                     uint8_t *d8_y, int dst_stride_y,
+                                     uint8_t *d8_uv, int dst_stride_uv,
+                                     int width, int height)
+{
+    int simd_width; /* width rounded down to a multiple of 8 */
+    int tail_width; /* remaining columns, converted in C */
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0;
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_nv12_box_amd64_sse2(s8, src_stride,
+                                             d8_y, dst_stride_y,
+                                             d8_uv, dst_stride_uv,
+                                             simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* skip the columns already converted: 4 source bytes per pixel,
+           simd_width bytes in both destination planes */
+        rv = a8r8g8b8_to_nv12_box(s8 + simd_width * 4, src_stride,
+                                  d8_y + simd_width, dst_stride_y,
+                                  d8_uv + simd_width, dst_stride_uv,
+                                  tail_width, height);
+    }
+    return rv;
+}
+
+/******************************************************************************/
+/* ARGB to NV12 709 full range, amd64: run the SSE2 routine on the leading
+   columns, a multiple of 8 pixels wide, then the portable C routine on the
+   columns that are left over.
+   Returns 0 when height <= 0 or when every routine called returned 0,
+   otherwise the first non zero value returned. */
+static int
+a8r8g8b8_to_nv12_709fr_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
+                                           uint8_t *d8_y, int dst_stride_y,
+                                           uint8_t *d8_uv, int dst_stride_uv,
+                                           int width, int height)
+{
+    int simd_width; /* width rounded down to a multiple of 8 */
+    int tail_width; /* remaining columns, converted in C */
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0;
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_nv12_709fr_box_amd64_sse2(s8, src_stride,
+                                                   d8_y, dst_stride_y,
+                                                   d8_uv, dst_stride_uv,
+                                                   simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* skip the columns already converted: 4 source bytes per pixel,
+           simd_width bytes in both destination planes */
+        rv = a8r8g8b8_to_nv12_709fr_box(s8 + simd_width * 4, src_stride,
+                                        d8_y + simd_width, dst_stride_y,
+                                        d8_uv + simd_width, dst_stride_uv,
+                                        tail_width, height);
+    }
+    return rv;
+}
+
 /*****************************************************************************/
 int
a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
@@ -105,6 +191,91 @@ a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride,
 #endif
 
 #if defined(__x86__) || defined(_M_IX86) || defined(__i386__)
+/******************************************************************************/
+/* ARGB to NV12, x86: run the SSE2 routine on the leading columns, a multiple
+   of 8 pixels wide, then the portable C routine on the columns left over.
+   Returns 0 when height <= 0 or when every routine called returned 0,
+   otherwise the first non zero value returned. */
+static int
+a8r8g8b8_to_nv12_box_x86_sse2_wrap(const uint8_t *s8, int src_stride,
+                                   uint8_t *d8_y, int dst_stride_y,
+                                   uint8_t *d8_uv, int dst_stride_uv,
+                                   int width, int height)
+{
+    int simd_width; /* width rounded down to a multiple of 8 */
+    int tail_width; /* remaining columns, converted in C */
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0;
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_nv12_box_x86_sse2(s8, src_stride,
+                                           d8_y, dst_stride_y,
+                                           d8_uv, dst_stride_uv,
+                                           simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* skip the columns already converted: 4 source bytes per pixel,
+           simd_width bytes in both destination planes */
+        rv = a8r8g8b8_to_nv12_box(s8 + simd_width * 4, src_stride,
+                                  d8_y + simd_width, dst_stride_y,
+                                  d8_uv + simd_width, dst_stride_uv,
+                                  tail_width, height);
+    }
+    return rv;
+}
+
+/******************************************************************************/
+/* ARGB to NV12 709 full range, x86: run the SSE2 routine on the leading
+   columns, a multiple of 8 pixels wide, then the portable C routine on the
+   columns that are left over.
+   Returns 0 when height <= 0 or when every routine called returned 0,
+   otherwise the first non zero value returned. */
+static int
+a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap(const uint8_t *s8, int src_stride,
+                                         uint8_t *d8_y, int dst_stride_y,
+                                         uint8_t *d8_uv, int dst_stride_uv,
+                                         int width, int height)
+{
+    int simd_width; /* width rounded down to a multiple of 8 */
+    int tail_width; /* remaining columns, converted in C */
+    int rv;
+
+    if (height <= 0)
+    {
+        return 0;
+    }
+
+    simd_width = width & ~7;
+    tail_width = width - simd_width;
+    rv = 0;
+    if (simd_width > 0)
+    {
+        rv = a8r8g8b8_to_nv12_709fr_box_x86_sse2(s8, src_stride,
+                                                 d8_y, dst_stride_y,
+                                                 d8_uv, dst_stride_uv,
+                                                 simd_width, height);
+    }
+    if ((rv == 0) && (tail_width > 0))
+    {
+        /* skip the columns already converted: 4 source bytes per pixel,
+           simd_width bytes in both destination planes */
+        rv = a8r8g8b8_to_nv12_709fr_box(s8 + simd_width * 4, src_stride,
+                                        d8_y + simd_width, dst_stride_y,
+                                        d8_uv + simd_width, dst_stride_uv,
+                                        tail_width, height);
+    }
+    return rv;
+}
+
 /*****************************************************************************/
 int
 a8r8g8b8_to_yuvalp_box_x86_sse2_wrap(const uint8_t *s8, int src_stride,
@@ -161,6 +332,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
     dev->uyvy_to_rgb32 = UYVY_to_RGB32;
     dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box;
     dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box;
+    dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box;
     dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box;
 #if SIMD_USE_ACCEL
     if (g_simd_use_accel)
@@ -177,7 +349,8 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
             dev->yuy2_to_rgb32 = yuy2_to_rgb32_amd64_sse2;
             dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2;
             dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_amd64_sse2;
-            dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2;
+            dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2_wrap;
+            dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box_amd64_sse2_wrap;
             dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap;
             LLOGLN(0, ("rdpSimdInit: sse2 amd64 yuv functions assigned"));
         }
@@ -193,7 +366,8 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn)
             dev->yuy2_to_rgb32 = yuy2_to_rgb32_x86_sse2;
             dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2;
             dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2;
-            dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2;
+            dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2_wrap;
+            dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap;
             dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_x86_sse2_wrap;
             LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned"));
         }
diff --git a/module/x86/Makefile.am b/module/x86/Makefile.am
index 9539f8c..92acda6 100644
--- a/module/x86/Makefile.am
+++ b/module/x86/Makefile.am
@@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_I386
 ASMSOURCES = \
   a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm \
a8r8g8b8_to_nv12_box_x86_sse2.asm \
+  a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm \
   a8r8g8b8_to_yuvalp_box_x86_sse2.asm \
   cpuid_x86.asm \
   i420_to_rgb32_x86_sse2.asm \
diff --git a/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm
new file mode 100644
index 0000000..262f1af
--- /dev/null
+++ b/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm
@@ -0,0 +1,315 @@
+;
+;Copyright 2015 Jay Sorg
+;Copyright 2017 mirabilos
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;ARGB to NV12 709 full range
+;x86 SSE2
+;
+; notes
+;  address s8 should be aligned on 16 bytes, will be slower if not
+;  width should be multiple of 8 and > 0
+;  height should be even and > 0
+;  mm1-mm4 are used for the chroma averaging, emms is executed before
+;    returning so that the caller can use the x87 FPU again
+;  the multipliers used are
+;    Y = ( 54 * R + 183 * G +  18 * B) >> 8
+;    U = ((-29 * R -  99 * G + 128 * B) >> 8) + 128
+;    V = ((128 * R - 116 * G -  12 * B) >> 8) + 128
+
+%include "common.asm"
+
+PREPARE_RODATA
+    cd255  times 4 dd 255
+
+    cw255  times 8 dw 255
+    cw128  times 8 dw 128
+    cw54   times 8 dw 54
+    cw183  times 8 dw 183
+    cw18   times 8 dw 18
+    cw29   times 8 dw 29
+    cw99   times 8 dw 99
+    cw116  times 8 dw 116
+    cw12   times 8 dw 12
+    cw2    times 8 dw 2
+
+; locals, inside the 32 bytes reserved by "sub esp, 32" in the prologue
+%define LU1             [esp + 0]  ; first line U, 8 bytes
+%define LV1             [esp + 8]  ; first line V, 8 bytes
+%define LU2             [esp + 16] ; second line U, 8 bytes
+%define LV2             [esp + 24] ; second line V, 8 bytes
+
+; stack arguments: 32 bytes locals + 4 pushed registers + return address = 52
+%define LS8             [esp + 52] ; s8
+%define LSRC_STRIDE     [esp + 56] ; src_stride
+%define LD8_Y           [esp + 60] ; d8_y
+%define LDST_Y_STRIDE   [esp + 64] ; dst_stride_y
+%define LD8_UV          [esp + 68] ; d8_uv
+%define LDST_UV_STRIDE  [esp + 72] ; dst_stride_uv
+%define LWIDTH          [esp + 76] ; width
+%define LHEIGHT         [esp + 80] ; height
+
+;int
+;a8r8g8b8_to_nv12_709fr_box_x86_sse2(const char *s8, int src_stride,
+;                                    char *d8_y, int dst_stride_y,
+;                                    char *d8_uv, int dst_stride_uv,
+;                                    int width, int height);
+; in:        all arguments on the stack, see the defines above
+; out:       eax = 0
+; preserves: ebx, esi, edi, ebp (pushed in the prologue, popped at exit)
+; clobbers:  ecx, edx, xmm0-xmm7, mm1-mm4, flags
+PROC a8r8g8b8_to_nv12_709fr_box_x86_sse2
+    push ebx
+    ; RETRIEVE_RODATA and lsym() come from common.asm (not shown here)
+    RETRIEVE_RODATA
+    push esi
+    push edi
+    push ebp
+    sub esp, 32                   ; local vars, 32 bytes
+
+    pxor xmm7, xmm7               ; xmm7 = 0, used by packuswb below
+
+    mov ebp, LHEIGHT              ; ebp = height
+    shr ebp, 1                    ; doing 2 lines at a time
+
+row_loop1:
+    mov esi, LS8                  ; s8
+    mov edi, LD8_Y                ; d8_y
+    mov edx, LD8_UV               ; d8_uv
+
+    mov ecx, LWIDTH               ; ecx = width
+    shr ecx, 3                    ; doing 8 pixels at a time
+
+loop1:
+    ; first line
+    movdqu xmm0, [esi]            ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0             ; blue
+    pand xmm1, [lsym(cd255)]      ; blue
+    movdqa xmm2, xmm0             ; green
+    psrld xmm2, 8                 ; green
+    pand xmm2, [lsym(cd255)]      ; green
+    movdqa xmm3, xmm0             ; red
+    psrld xmm3, 16                ; red
+    pand xmm3, [lsym(cd255)]      ; red
+
+    movdqu xmm0, [esi + 16]       ; 4 pixels, 16 bytes
+    movdqa xmm4, xmm0             ; blue
+    pand xmm4, [lsym(cd255)]      ; blue
+    movdqa xmm5, xmm0             ; green
+    psrld xmm5, 8                 ; green
+    pand xmm5, [lsym(cd255)]      ; green
+    movdqa xmm6, xmm0             ; red
+    psrld xmm6, 16                ; red
+    pand xmm6, [lsym(cd255)]      ; red
+
+    packssdw xmm1, xmm4           ; xmm1 = 8 blues
+    packssdw xmm2, xmm5           ; xmm2 = 8 greens
+    packssdw xmm3, xmm6           ; xmm3 = 8 reds
+
+    ; _Y = ( 54 * _R + 183 * _G + 18 * _B) >> 8;
+    movdqa xmm4, xmm1             ; blue
+    movdqa xmm5, xmm2             ; green
+    movdqa xmm6, xmm3             ; red
+    pmullw xmm4, [lsym(cw18)]
+    pmullw xmm5, [lsym(cw183)]
+    pmullw xmm6, [lsym(cw54)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8
+    packuswb xmm4, xmm7
+    movq [edi], xmm4              ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1             ; blue
+    movdqa xmm5, xmm2             ; green
+    movdqa xmm6, xmm3             ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw99)]
+    pmullw xmm6, [lsym(cw29)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LU1, xmm4                ; save for later
+
+    ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1             ; blue
+    movdqa xmm5, xmm2             ; green
+    movdqa xmm4, xmm3             ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw116)]
+    pmullw xmm6, [lsym(cw12)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LV1, xmm4                ; save for later
+
+    ; go down to second line
+    add esi, LSRC_STRIDE
+    add edi, LDST_Y_STRIDE
+
+    ; second line
+    movdqu xmm0, [esi]            ; 4 pixels, 16 bytes
+    movdqa xmm1, xmm0             ; blue
+    pand xmm1, [lsym(cd255)]      ; blue
+    movdqa xmm2, xmm0             ; green
+    psrld xmm2, 8                 ; green
+    pand xmm2, [lsym(cd255)]      ; green
+    movdqa xmm3, xmm0             ; red
+    psrld xmm3, 16                ; red
+    pand xmm3, [lsym(cd255)]      ; red
+
+    movdqu xmm0, [esi + 16]       ; 4 pixels, 16 bytes
+    movdqa xmm4, xmm0             ; blue
+    pand xmm4, [lsym(cd255)]      ; blue
+    movdqa xmm5, xmm0             ; green
+    psrld xmm5, 8                 ; green
+    pand xmm5, [lsym(cd255)]      ; green
+    movdqa xmm6, xmm0             ; red
+    psrld xmm6, 16                ; red
+    pand xmm6, [lsym(cd255)]      ; red
+
+    packssdw xmm1, xmm4           ; xmm1 = 8 blues
+    packssdw xmm2, xmm5           ; xmm2 = 8 greens
+    packssdw xmm3, xmm6           ; xmm3 = 8 reds
+
+    ; _Y = ( 54 * _R + 183 * _G + 18 * _B) >> 8;
+    movdqa xmm4, xmm1             ; blue
+    movdqa xmm5, xmm2             ; green
+    movdqa xmm6, xmm3             ; red
+    pmullw xmm4, [lsym(cw18)]
+    pmullw xmm5, [lsym(cw183)]
+    pmullw xmm6, [lsym(cw54)]
+    paddw xmm4, xmm5
+    paddw xmm4, xmm6
+    psrlw xmm4, 8
+    packuswb xmm4, xmm7
+    movq [edi], xmm4              ; out 8 bytes yyyyyyyy
+
+    ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128;
+    movdqa xmm4, xmm1             ; blue
+    movdqa xmm5, xmm2             ; green
+    movdqa xmm6, xmm3             ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw99)]
+    pmullw xmm6, [lsym(cw29)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LU2, xmm4                ; save for later
+
+    ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128;
+    movdqa xmm6, xmm1             ; blue
+    movdqa xmm5, xmm2             ; green
+    movdqa xmm4, xmm3             ; red
+    pmullw xmm4, [lsym(cw128)]
+    pmullw xmm5, [lsym(cw116)]
+    pmullw xmm6, [lsym(cw12)]
+    psubw xmm4, xmm5
+    psubw xmm4, xmm6
+    psraw xmm4, 8
+    paddw xmm4, [lsym(cw128)]
+    packuswb xmm4, xmm7
+    movq LV2, xmm4                ; save for later
+
+    ; uv add and divide(average)
+    ; each output byte is (4 samples of a 2x2 pixel block + 2) >> 2
+    movq mm1, LU1                 ; u from first line
+    movq mm3, mm1
+    pand mm1, [lsym(cw255)]
+    psrlw mm3, 8
+    pand mm3, [lsym(cw255)]
+    paddw mm1, mm3                ; add
+    movq mm2, LU2                 ; u from second line
+    movq mm3, mm2
+    pand mm2, [lsym(cw255)]
+    paddw mm1, mm2                ; add
+    psrlw mm3, 8
+    pand mm3, [lsym(cw255)]
+    paddw mm1, mm3                ; add
+    paddw mm1, [lsym(cw2)]        ; add 2
+    psrlw mm1, 2                  ; div 4
+
+    movq mm2, LV1                 ; v from first line
+    movq mm4, mm2
+    pand mm2, [lsym(cw255)]
+    psrlw mm4, 8
+    pand mm4, [lsym(cw255)]
+    paddw mm2, mm4                ; add
+    movq mm3, LV2                 ; v from second line
+    movq mm4, mm3
+    pand mm3, [lsym(cw255)]
+    paddw mm2, mm3                ; add
+    psrlw mm4, 8
+    pand mm4, [lsym(cw255)]
+    paddw mm2, mm4                ; add
+    paddw mm2, [lsym(cw2)]        ; add 2
+    psrlw mm2, 2                  ; div 4
+
+    packuswb mm1, mm1
+    packuswb mm2, mm2
+
+    punpcklbw mm1, mm2            ; uv
+    movq [edx], mm1               ; out 8 bytes uvuvuvuv
+
+    ; go up to first line
+    sub esi, LSRC_STRIDE
+    sub edi, LDST_Y_STRIDE
+
+    ; move right
+    lea esi, [esi + 32]
+    lea edi, [edi + 8]
+    lea edx, [edx + 8]
+
+    dec ecx
+    jnz loop1
+
+    ; update s8
+    mov eax, LS8                  ; s8
+    add eax, LSRC_STRIDE          ; s8 += src_stride
+    add eax, LSRC_STRIDE          ; s8 += src_stride
+    mov LS8, eax
+
+    ; update d8_y
+    mov eax, LD8_Y                ; d8_y
+    add eax, LDST_Y_STRIDE        ; d8_y += dst_stride_y
+    add eax, LDST_Y_STRIDE        ; d8_y += dst_stride_y
+    mov LD8_Y, eax
+
+    ; update d8_uv
+    mov eax, LD8_UV               ; d8_uv
+    add eax, LDST_UV_STRIDE       ; d8_uv += dst_stride_uv
+    mov LD8_UV, eax
+
+    dec ebp
+    jnz row_loop1
+
+    emms                          ; mm1-mm4 were written, free the x87 tags
+    xor eax, eax                  ; return value
+    add esp, 32                   ; local vars, 32 bytes
+    pop ebp
+    pop edi
+    pop esi
+    pop ebx
+    ret
+END_OF_FILE
diff --git a/module/x86/funcs_x86.h b/module/x86/funcs_x86.h
index d1f3357..a08834f 100644
--- a/module/x86/funcs_x86.h
+++ b/module/x86/funcs_x86.h
@@ -44,6 +44,11 @@ a8r8g8b8_to_nv12_box_x86_sse2(const uint8_t *s8, int src_stride,
                               uint8_t *d8_uv, int dst_stride_uv,
                               int width, int height);
 int
+a8r8g8b8_to_nv12_709fr_box_x86_sse2(const uint8_t *s8, int src_stride,
+                                    uint8_t *d8_y, int dst_stride_y,
+                                    uint8_t *d8_uv, int dst_stride_uv,
+                                    int width, int height);
+int
 a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride,
                                 uint8_t *d8, int dst_stride,
                                 int width, int height);