diff -r 0bd29c6e965b -r 51f39d0f1466 src/xine-engine/alphablend.c --- a/src/xine-engine/alphablend.c Mon Apr 23 13:13:04 2007 +0300 +++ b/src/xine-engine/alphablend.c Mon Apr 30 14:04:50 2007 +0300 @@ -42,6 +42,138 @@ #define BLEND_BYTE(dst, src, o) (((((src)-(dst))*(o*0x1111+1))>>16)+(dst)) +static void mem_blend8(uint8_t *mem, uint8_t val, uint8_t o, size_t sz) +{ + uint8_t *limit = mem + sz; + while (mem < limit) { + *mem = BLEND_BYTE(*mem, val, o); + mem++; + } +} + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void mem_blend8_mmx(uint8_t *mem, int val, int o, size_t sz) +{ + uint8_t *limit = mem + sz; + + if (sz > 3) { + /* + * MMX register allocation: + * mm7: zero + * mm6: division approximation constant (4x 0x1112) + * mm5: alpha for destination (4x 0xf-A) + * mm4: overlay * overlay alpha + */ + static const mmx_t max_o = {uq:0x000f000f000f000fULL}; + static const mmx_t div_0f = {uq:0x1112111211121112ULL}; + + uint8_t *limit4 = mem + (sz & (~3)); + + movd_a2r (o, mm0); /* mm0 = 0 0 0 A */ + pxor_r2r (mm7, mm7); /* mm7 = 0 0 0 0 */ + + movd_a2r (val, mm4); /* mm4 = 0 0 0 V */ +# if 0 /*defined(__SSE__)*/ + pshufw_r2r (mm0, mm0, 0); /* mm0 = A A A A */ + + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + pshufw_r2r (mm4, mm4, 0); /* mm4 = V V V V */ + + psubw_r2r (mm0, mm5); /* mm5 = 0xf-A 0xf-A 0xf-A 0xf-A */ +# else + punpcklwd_r2r (mm0, mm0); /* mm0 = 0 0 A A */ + + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + punpcklwd_r2r (mm0, mm0); /* mm0 = A A A A */ + + punpcklwd_r2r (mm4, mm4); /* mm4 = 0 0 V V */ + psubw_r2r (mm0, mm5); /* mm5 = 0xf-A 0xf-A 0xf-A 0xf-A */ + + punpcklwd_r2r (mm4, mm4); /* mm4 = V V V V */ +# endif + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + pmullw_r2r (mm0, mm4); /* mm4 = V*A V*A V*A V*A */ + + for ( ; mem < limit4 ; mem += 4) { + movd_m2r (*mem, mm0); /* mm0 = D3 D2 D1 D0 */ + punpcklbw_r2r (mm7, mm0); /* mm0 = D3 D2 D1 D0 */ + pmullw_r2r (mm5, mm0); /* mm0 = D3*iA D2*iA D1*iA D0*iA 
*/ + paddw_r2r (mm4, mm0); /* mm0 = 4x (V*A + D*iA) */ + pmulhw_r2r (mm6, mm0); /* mm0 = R3 R2 R1 R0 */ + packuswb_r2r (mm0, mm0); /* mm0 = R3 R2 R1 R0 */ + movd_r2m (mm0, *mem); /* store */ + } + sz &= 3; + } + + for (; mem < limit; mem++) + *mem = BLEND_BYTE(*mem, val, o); +} +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void mem_blend8_sse2(uint8_t *mem, int val, int o, size_t sz) +{ + uint8_t *limit = mem + sz; + + if (sz > 3) { + /* + * SSE register allocation: + * xmm7: zero + * xmm6: division approximation constant (8x 0x1112) + * xmm5: alpha for destination (8x 0xf-A) + * xmm4: overlay * overlay alpha + */ + static const sse_t max_o = {uq:{0x000f000f000f000fULL,0x000f000f000f000fULL}}; + static const sse_t div_0f = {uq:{0x1112111211121112ULL,0x1112111211121112ULL}}; + + uint8_t *limit8 = mem + (sz & (~7)); + + movd_a2r (o, xmm0); /* xmm0 = [ 0 0 0 0 0 0 0 A ] */ + pxor_r2r (xmm7, xmm7); /* xmm7 = [ 0 0 0 0 0 0 0 0 ] */ + + movd_a2r (val, xmm4); /* xmm4 = [ 0 0 0 0 0 0 0 V ] */ + pshuflw_r2r (xmm0, xmm0, 0); /* xmm0 = [ 0 0 0 0 A A A A ] */ + + movdqa_m2r (max_o, xmm5); /* xmm5 = [ f f f f f f f f ] */ + pshufd_r2r (xmm0, xmm0, 0); /* xmm0 = [ A A A A A A A A ] */ + + pshuflw_r2r (xmm4, xmm4, 0); /* xmm4 = [ 0 0 0 0 V V V V ] */ + psubw_r2r (xmm0, xmm5); /* xmm5 = [ 0xf-A 0xf-A 0xf-A ... ] */ + + pshufd_r2r (xmm4, xmm4, 0); /* xmm4 = [ V V V V V V V V ] */ + + movdqa_m2r (div_0f, xmm6); /* xmm6 = [ x1112 x1112 x1112 ... ] */ + pmullw_r2r (xmm0, xmm4); /* xmm4 = [ V*A V*A V*A ... ] */ + + for ( ; mem < limit8 ; mem += 8) { + movq_m2r (*mem, xmm0); /* xmm0 = [ D7 ... D0 ] */ + punpcklbw_r2r (xmm7, xmm0); /* xmm0 = [ D7 D6 D5 D4 D3 D2 D1 D0 ] */ + pmullw_r2r (xmm5, xmm0); /* xmm0 = [ D7*iA D6*iA ... D0*iA ] */ + paddw_r2r (xmm4, xmm0); /* xmm0 = 8x (V*A + D*iA) */ + pmulhw_r2r (xmm6, xmm0); /* xmm0 = [ R7 R6 R5 R4 R3 R2 R1 R0 ] */ + packuswb_r2r (xmm0, xmm0); /* xmm0 = [ R7 ... 
R0 ] */ + movq_r2m (xmm0, *mem); /* store */ + } + sz &= 7; + + if (sz > 3) { + movd_m2r (*mem, xmm0); /* mm0 = [ D3...D0 ] */ + punpcklbw_r2r (xmm7, xmm0); /* mm0 = [ D3 D2 D1 D0 ] */ + pmullw_r2r (xmm5, xmm0); /* mm0 = [ D3*iA ... D0*iA ] */ + paddw_r2r (xmm4, xmm0); /* mm0 = 4x (V*A + D*iA) */ + pmulhw_r2r (xmm6, xmm0); /* mm0 = [ R3 R2 R1 R0 ] */ + packuswb_r2r (xmm0, xmm0); /* mm0 = [ R3....R0 ] */ + movd_r2m (xmm0, *mem); /* store */ + sz &= 3; + } + } + + for (; mem < limit; mem++) + *mem = BLEND_BYTE(*mem, val, o); +} +#endif + static void mem_blend16(uint16_t *mem, uint16_t clr, uint8_t o, int len) { uint16_t *limit = mem + len; while (mem < limit) { @@ -79,6 +211,775 @@ static void mem_blend32(uint8_t *mem, co mem++; } } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void mem_blend32_mmx(uint8_t *mem, const uint8_t *src, int o, int len) { + /* + * MMX register allocation: + * mm7: zero + * mm6: division approximation constant (4x 0x1112) + * mm5: alpha for destination (4x 0xf-A) + * mm4: overlay * overlay alpha + */ + static const mmx_t max_o = {uq:0x000f000f000f000fULL}; + static const mmx_t div_0f = {uq:0x1112111211121112ULL}; + + uint8_t *limit = mem + len*4; + + movd_a2r (o, mm0); /* mm0 = 0 0 0 A */ + pxor_r2r (mm7, mm7); /* mm7 = 0 0 0 0 */ + +# if 0 /*defined(__SSE__)*/ + movd_m2r (*src, mm4); /* mm4 = V3 V2 V1 V0 */ + pshufw_r2r (mm0, mm0, 0); /* mm0 = A A A A */ + + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + punpcklbw_r2r (mm7, mm4); /* mm4 = V3 V2 V1 V0 */ +# else + punpcklwd_r2r (mm0, mm0); /* mm0 = 0 0 A A */ + movd_m2r (*src, mm4); /* mm4 = V3 V2 V1 V0 */ + + punpcklbw_r2r (mm7, mm4); /* mm4 = V3 V2 V1 V0 */ + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + + punpcklwd_r2r (mm0, mm0); /* mm0 = A A A A */ +# endif + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + + psubw_r2r (mm0, mm5); /* mm5 = f-A f-A f-A f-A */ + pmullw_r2r (mm0, mm4); /* mm4 = V3*A V2*A V1*A V0*A */ + + for ( ; mem < 
limit ; mem += 4) { + movd_m2r (*mem, mm0); /* mm0 = D3 D2 D1 D0 */ + punpcklbw_r2r (mm7, mm0); /* mm0 = D3 D2 D1 D0 */ + pmullw_r2r (mm5, mm0); /* mm0 = D3*iA D2*iA D1*iA D0*iA */ + paddw_r2r (mm4, mm0); /* mm0 = 4x (V*A + D*iA) */ + pmulhw_r2r (mm6, mm0); /* div by 0xf */ + /* mm0 = R3 R2 R1 R0 */ + packuswb_r2r (mm0, mm0); /* mm0 = R3 R2 R1 R0 */ + movd_r2m (mm0, *mem); /* store */ + } +} +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void mem_blend32_sse2(uint8_t *mem, const uint8_t *src, int o, int len) { + /* + * SSE register allocation: + * xmm7: zero + * xmm6: division approximation constant (4x 0x1112) + * xmm5: alpha for destination (4x 0xf-A) + * xmm4: overlay * overlay alpha + */ + static const sse_t max_o = {uq:{0x000f000f000f000fULL,0x000f000f000f000fULL}}; + static const sse_t div_0f = {uq:{0x1112111211121112ULL,0x1112111211121112ULL}}; + uint8_t *limit = mem + (len/2)*8; + + movd_a2r (o, xmm0); /* mm0 = 0 0 0 0 0 0 0 A */ + pxor_r2r (xmm7, xmm7); /* mm7 = 0 0 0 0 0 0 0 0 */ + + movd_m2r (*src, xmm4); /* xmm4 = V3...V0; 32-bit load: only 4 source bytes are used, movq would read 8 */ + pshuflw_r2r (xmm0, xmm0, 0); /* mm0 = 0 0 0 0 A A A A */ + + pshufd_r2r (xmm4, xmm4, 0); /* mm4 = V3...V0 V3...V0 */ + pshufd_r2r (xmm0, xmm0, 0); /* mm0 = A A A A A A A A */ + + movdqa_m2r (max_o, xmm5); /* mm5 = f f f f f f f f */ + punpcklbw_r2r (xmm7, xmm4); /* mm4 = V3 V2 V1 V0 V3 V2 V1 V0 */ + + movdqa_m2r (div_0f, xmm6); /* mm6 = x1112 ... ... x1112 */ + psubw_r2r (xmm0, xmm5); /* mm5 = f-A ... ... f-A */ + + pmullw_r2r (xmm0, xmm4); /* mm4 = V3*A V2*A V1*A V0*A ... */ + + for ( ; mem < limit ; mem += 8) { + movq_m2r (*mem, xmm0); /* mm0 = D7D6D5D4D3D2D1D0 */ + punpcklbw_r2r (xmm7, xmm0); /* mm0 = D7 D6 D5 D4 D3 D2 D1 D0 */ + pmullw_r2r (xmm5, xmm0); /* mm0 = D7*iA D6*iA D5*iA ... 
D0*iA */ + paddw_r2r (xmm4, xmm0); /* mm0 = [4x (V*A + D*iA), 4x (V*A + D*iA) */ + pmulhw_r2r (xmm6, xmm0); /* div by 0xf */ + /* mm0 = R7 R6 R5 R4 R3 R2 R1 R0 */ + packuswb_r2r (xmm0, xmm0); /* mm0 = R7R6R5R4R3R2R1R0 */ + movq_r2m (xmm0, *mem); /* store */ + } + if(len & 1) { + movd_m2r (*mem, xmm0); /* mm0 = D3D2D1D0 */ + punpcklbw_r2r (xmm7, xmm0); /* mm0 = D3 D2 D1 D0 */ + pmullw_r2r (xmm5, xmm0); /* mm0 = D3*iA ... D0*iA */ + paddw_r2r (xmm4, xmm0); /* mm0 = 4x (V*A + D*iA) */ + pmulhw_r2r (xmm6, xmm0); /* div by 0xf */ + /* mm0 = R3 R2 R1 R0 */ + packuswb_r2r (xmm0, xmm0); /* mm0 = R3R2R1R0 */ + movd_r2m (xmm0, *mem); /* store */ + } +} +#endif + +static void blend_yuy2_y(uint8_t *mem, uint8_t val, uint8_t o, int len) { + uint8_t *limit = mem + len*2; + while (mem < limit) { + *mem = BLEND_BYTE(*mem, val, o); + mem += 2; + } +} + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void blend_yuy2_y_mmx(uint8_t *dst, int y, int o, size_t sz) +{ + int i = sz / 4; + + if (i) { + /* + * MMX register allocation: + * mm7: Y byte mask + * mm6: division constant + * mm5: dst alpha (0xf-A) + * mm4: U/V bytes mask + * mm3: y*alpha + * mm2: (stored U/V values) + * mm0: (calculation) + */ + static const mmx_t y_mask = {uq:0x00ff00ff00ff00ff}; + static const mmx_t uv_mask = {uq:0xff00ff00ff00ff00}; + static const mmx_t div_0f = {uq:0x1112111211121112}; + static const mmx_t max_o = {uq:0x000f000f000f000f}; + + movd_a2r (o, mm1); /* mm1 = 0 0 0 o */ + + movd_a2r (y, mm3); /* mm3 = 0 0 0 y */ +# if 0 /*defined(__SSE__)*/ + pshufw_r2r (mm1, mm1, 0); /* mm1 = o o o o */ + + movq_m2r (uv_mask, mm4); /* mm4 = ff 00 ff 00 ff 00 ff 00 */ + pshufw_r2r (mm3, mm3, 0); /* mm3 = y y y y */ + + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + pmullw_r2r (mm1, mm3); /* mm3 = y*o y*o y*o y*o */ + + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + psubw_r2r (mm1, mm5); /* mm5 = 0xf-o 0xf-o 0xf-o 0xf-o */ + + movq_m2r (y_mask, mm7); /* mm7 = 00 ff 00 ff 00 ff 00 
ff */ +# else + punpcklwd_r2r (mm1, mm1); /* mm1 = 0 0 o o */ + + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + punpcklwd_r2r (mm1, mm1); /* mm1 = o o o o */ + + movq_m2r (uv_mask, mm4); /* mm4 = ff 00 ff 00 ff 00 ff 00 */ + punpcklwd_r2r (mm3, mm3); /* mm3 = 0 0 y y */ + + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + punpcklwd_r2r (mm3, mm3); /* mm3 = y y y y */ + + movq_m2r (y_mask, mm7); /* mm7 = 00 ff 00 ff 00 ff 00 ff */ + pmullw_r2r (mm1, mm3); /* mm3 = y*o y*o y*o y*o */ + + psubw_r2r (mm1, mm5); /* mm5 = 0xf-o 0xf-o 0xf-o 0xf-o */ +# endif + + while (i--) { + movq_m2r (*dst, mm0); /* mm0 = cb y3 cr y2 cb y1 cr y0 */ + movq_r2r (mm0, mm2); /* save cr/cb */ + pand_r2r (mm7, mm0); /* mm0 = 0 y3 0 y2 0 y1 0 y0 */ + pmullw_r2r (mm5, mm0); /* y[] *= (0xf-o) */ + pand_r2r (mm4, mm2); /* drop Y from saved cr/cb vector */ + paddw_r2r (mm3, mm0); /* blend */ + pmulhw_r2r (mm6, mm0); /* div by 0xf (= (X*0x1112)>>16) */ + por_r2r (mm2, mm0); /* interleave with Y */ + movq_r2m (mm0, *dst); /* store */ + dst += 8; + } + sz &= 3; + } + + while(sz--) { + *dst = BLEND_BYTE(*dst, y, o); + dst += 2; + } +} +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void blend_yuy2_y_sse2(uint8_t *dst, int y, int o, size_t sz) +{ + if (sz > 3) { + /* + * SSE register allocation: + * xmm7: Y byte mask + * xmm6: division constant + * xmm5: dst alpha (0xf-A) + * xmm4: U/V bytes mask + * xmm3: y*alpha + * xmm2: (stored U/V values) + * xmm0: (calculation) + */ + static const sse_t y_mask = {uq:{0x00ff00ff00ff00ff,0x00ff00ff00ff00ff}}; + static const sse_t uv_mask = {uq:{0xff00ff00ff00ff00,0xff00ff00ff00ff00}}; + static const sse_t div_0f = {uq:{0x1112111211121112,0x1112111211121112}}; + static const sse_t max_o = {uq:{0x000f000f000f000f,0x000f000f000f000f}}; + int i = sz / 8; + + movd_a2r (o, xmm1); /* xmm1 = 0 0 0 0 0 0 0 o */ + movd_a2r (y, xmm3); /* xmm3 = 0 0 0 0 0 0 0 y */ + + movdqa_m2r (max_o, xmm5); /* xmm5 = f f f f f f f f */ + 
pshuflw_r2r (xmm1, xmm1, 0);/* xmm1 = 0 0 0 0 o o o o */ + + movdqa_m2r (uv_mask, xmm4); /* xmm4 = ff00ff ... ff00ff00 */ + pshuflw_r2r (xmm3, xmm3, 0);/* xmm3 = 0 0 0 0 y y y y */ + + movdqa_m2r (div_0f, xmm6); /* xmm6 = x1112 ... x1112 x1112 */ + pshufd_r2r (xmm1, xmm1, 0); /* xmm1 = o o o o o o o o */ + + movdqa_m2r (y_mask, xmm7); /* xmm7 = 00ff00 ... 00ff00ff */ + pshufd_r2r (xmm3, xmm3, 0); /* xmm3 = y y y y y y y y */ + + psubw_r2r (xmm1, xmm5); /* xmm5 = 0xf-o ... 0xf-o 0xf-o */ + pmullw_r2r (xmm1, xmm3); /* xmm3 = y*o ... ... y*o y*o */ + + while(i--) { + movdqu_m2r (*dst, xmm0); /* xmm0 = cb y7 cr y6 ... cr y0 */ + movdqa_r2r (xmm0, xmm2); /* xmm2 = cb y7 cr y6 ... cr y0 */ + pand_r2r (xmm7, xmm0); /* xmm0 = 0 y7 0 y6 ... 0 y0 */ + pmullw_r2r (xmm5, xmm0); /* xmm0 = dst_y[] * (0xf-o) */ + pand_r2r (xmm4, xmm2); /* xmm2 = cb 0 cr 0 ... cr 0 */ + paddw_r2r (xmm3, xmm0); /* xmm0 = blended Y */ + pmulhw_r2r (xmm6, xmm0); /* div xmm0 by 0xf */ + por_r2r (xmm2, xmm0); /* interleave with saved cr/cb */ + movdqu_r2m (xmm0, *dst); /* store */ + dst += 16; + } + sz &= 7; + + if (sz > 3) { + movq_m2r (*dst, xmm0); /* xmm0 = cb y3 cr y2 cb y1 cr y0 */ + movdqa_r2r (xmm0, xmm2); /* xmm2 = cb y3 cr y3 cb y1 cr y0 */ + pand_r2r (xmm7, xmm0); /* xmm0 = 0 y3 0 y2 0 y1 0 y0 */ + pmullw_r2r (xmm5, xmm0); /* xmm0 = dst_y[] * (0xf-o) */ + pand_r2r (xmm4, xmm2); /* xmm2 = cb 0 cr 0 cb 0 cr 0 */ + paddw_r2r (xmm3, xmm0); /* xmm0 = blended Y */ + pmulhw_r2r (xmm6, xmm0); /* div xmm0 by 0xf */ + por_r2r (xmm2, xmm0); /* interleave with saved cr/cb */ + movq_r2m (xmm0, *dst); /* store */ + dst += 8; + sz &= 3; + } + } + + while(sz--) { + *dst = BLEND_BYTE(*dst, y, o); + dst += 2; + } +} +#endif + +static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, + uint8_t *(*blend_yuv_data)[ 3 ][ 2 ]) +{ + int x; + + for (x = 0; x < src_width; x += 2) { + /* get opacity of the 4 pixels that share chroma */ + int o00 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 0 ]; + int o01 = 
(*blend_yuv_data)[ 0 ][ 0 ][ x + 1 ]; + int o = o00 + o01; + int o10 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 0 ]; + o += o10; + int o11 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 1 ]; + o += o11; + + /* are there any pixels a little bit opaque? */ + if (o) { + /* get the chroma components of the 4 pixels */ + int cr00 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 0 ]; + int cr01 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 1 ]; + int cr10 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 0 ]; + int cr11 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 1 ]; + + int cb00 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 0 ]; + int cb01 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 1 ]; + int cb10 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 0 ]; + int cb11 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 1 ]; + + /* are all pixels completely opaque? */ + if (o >= 4*0xf) { + /* set the output chroma to the average of the four pixels */ + *dst_cr = (cr00 + cr01 + cr10 + cr11) / 4; + *dst_cb = (cb00 + cb01 + cb10 + cb11) / 4; + } else { + /* calculate transparency of background over the four pixels */ + int t4 = 4*0xf - o; + + /* blend the output chroma to the average of the four pixels */ + /* for explanation of the used equation, see blend_yuy2_exact() */ + *dst_cr = ((*dst_cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) * (0x1111+1)) >> 18; + *dst_cb = ((*dst_cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) * (0x1111+1)) >> 18; + } + } + + /* next chroma sample */ + dst_cr++; + dst_cb++; + } +} + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void blend_yuv_exact_mmx(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, + uint8_t *(*blend_yuv_data)[ 3 ][ 2 ], int safe) +{ + static const mmx_t lobytes = {uq:0x00ff00ff00ff00ff}; + static const mmx_t div_3c = {uq:0x0445044504450445}; + static const mmx_t max4a = {uq:0x003c003c003c003c}; + int xovl = 0, xdst = 0; + + /* + Code blends pixels outside of overlay right border + if safe flag is set (and if overlay width is not + multiplicant of 4). 
+ But, we don't care, as it has no effect on result: + the few extra pixels in blend_yuv_data are already marked + as transparent and those pixels in frame stay untouched. + Of course, rest of pixels could be blended one by one, + but that is slower and not necessary. + */ + + if (!safe) { + safe = src_width & 7; + src_width -= safe; + } + + while (xovl < src_width) { + + if(0 == (*(uint32_t*)&(*blend_yuv_data)[ 0 ][ 0 ][ xovl ] | + *(uint32_t*)&(*blend_yuv_data)[ 0 ][ 0 ][ xovl+4 ] | + *(uint32_t*)&(*blend_yuv_data)[ 0 ][ 1 ][ xovl ] | + *(uint32_t*)&(*blend_yuv_data)[ 0 ][ 1 ][ xovl+4 ])) { + /* all 16 alphas (8 pixels on both lines) are 0: fully transparent, skip */ + xovl += 8; + xdst += 4; + continue; + } + + /* 1. lines */ + + movq_m2r (((*blend_yuv_data)[ 0 ][ 0 ][ xovl ]), mm0); /* load 8x alpha, line 1 */ + movq_m2r (lobytes, mm7); + movq_r2r (mm0, mm4); + pand_r2r (mm7, mm0); /* mm0: a06 a04 a02 a00 */ + psrlw_i2r (8, mm4); /* mm4: a07 a05 a03 a01 */ + /* used mm: 0, 4 */ + + movq_m2r (((*blend_yuv_data)[ 1 ][ 0 ][ xovl ]), mm1); /* load 8x cr, line 1 */ + movq_r2r (mm1, mm5); + pand_r2r (mm7, mm1) ; /* mm1: [ r06 r04 r02 r00 ] */ + psrlw_i2r (8, mm5); /* mm5: [ r07 r05 r03 r01 ] */ + /* used mm: 0, 1, 4, 5 */ + + movq_m2r (((*blend_yuv_data)[ 2 ][ 0 ][ xovl ]), mm2); /* load 8x cb, line 1 */ + movq_r2r (mm2, mm6); + pand_r2r (mm7, mm2); /* mm2: [ b06 b04 b02 b00 ] */ + psrlw_i2r (8, mm6); /* mm6: [ b07 b05 b03 b01 ] */ + /* used mm: 0, 1, 2, 4, 5, 6 */ + + pmullw_r2r (mm0, mm1); /* mm1: [ r06*a06 r04*a04 r02*a02 r00*a00 ] */ + pmullw_r2r (mm4, mm5); /* mm5: [ r07*a07 r05*a05 r03*a03 r01*a01 ] */ + pmullw_r2r (mm0, mm2); /* mm2: [ b06*a06 b04*a04 b02*a02 b00*a00 ] */ + pmullw_r2r (mm4, mm6); /* mm6: [ b07*a07 b05*a05 b03*a03 b01*a01 ] */ + + paddw_r2r (mm4, mm0); /* mm0: [ a07+a06 a05+a04 a03+a02 a01+a00 ] */ + paddw_r2r (mm5, mm1); /* mm1: [ r07*a07+r06*a06 r05*a05+r04*a04 ... ] */ + paddw_r2r (mm6, mm2); /* mm2: [ b07*a07+b06*a06 b05*a05+b04*a04 ... ] */ + /* used mm: 0, 1, 2 */ + + /* 2. 
lines */ + + movq_m2r (((*blend_yuv_data)[ 0 ][ 1 ][ xovl ]), mm3); /* load 8x alpha, line 2 */ + movq_r2r (mm3, mm4); + pand_r2r (mm7, mm3); /* mm3: A16 A14 A12 A10 */ + psrlw_i2r (8, mm4); /* mm4: A17 A15 A13 A11 */ + + movq_m2r (((*blend_yuv_data)[ 1 ][ 1 ][ xovl ]), mm5); /* load 8x cr, line 2 */ + movq_r2r (mm5, mm6); + pand_r2r (mm7, mm5); /* mm5: r16 r14 r12 r10 */ + psrlw_i2r (8, mm6); /* mm6: r17 r15 r13 r11 */ + + pmullw_r2r (mm3, mm5); /* mm5: [ r16*a16 r14*a14 r12*a12 r10*a10 ] */ + pmullw_r2r (mm4, mm6); /* mm6: [ r17*a17 r15*a15 r13*a13 r11*a11 ] */ + paddw_r2r (mm6, mm5); /* mm5: r*A pairs, line 2 */ + + movq_m2r (((*blend_yuv_data)[ 2 ][ 1 ][ xovl ]), mm6); /* load 8x cb, line 2 */ + movq_r2r (mm6, mm7); + pand_m2r (lobytes, mm6); /* mm6: r06 r04 r02 r00 */ + psrlw_i2r (8, mm7); /* mm7: r07 r05 r03 r01 */ + + pmullw_r2r (mm3, mm6); /* mm6: [ b16*a16 b14*a14 b12*a12 b10*a10 ] */ + pmullw_r2r (mm4, mm7); /* mm7: [ b17*a17 b15*a15 b13*a13 b11*a11 ] */ + paddw_r2r (mm7, mm6); /* mm6: b*A pairs, line 2 */ + + paddw_r2r (mm4, mm3); /* mm3: [ a17+a16 a15+a14 a13+a12 a11+a10 ] */ + + paddw_r2r (mm3, mm0); /* mm0: A qubits */ + paddw_r2r (mm5, mm1); /* mm1: r*A qubits */ + paddw_r2r (mm6, mm2); /* mm2: b*A qubits */ + + movq_m2r (max4a, mm5); /* a qubits - 4*0xf */ + psubw_r2r (mm0, mm5); /* a qubits - 4*0xf */ + + /* dst */ + + movd_m2r (dst_cr[xdst], mm3); + movd_m2r (dst_cb[xdst], mm4); + pxor_r2r (mm7, mm7); + punpcklbw_r2r (mm7, mm3); /* bytes -> words */ + punpcklbw_r2r (mm7, mm4); + + pmullw_r2r (mm5, mm3); /* *= (1-a) */ + xovl += 8; + pmullw_r2r (mm5, mm4); + + movq_m2r (div_3c, mm6); + + paddw_r2r (mm3, mm1); /* blend */ + paddw_r2r (mm4, mm2); + + pmulhw_r2r (mm6, mm1); /* div by 4*0xf */ + pmulhw_r2r (mm6, mm2); + + packuswb_r2r (mm1, mm1); /* words -> bytes */ + packuswb_r2r (mm2, mm2); + + movd_r2m (mm1, dst_cr[xdst]); /* store */ + movd_r2m (mm2, dst_cb[xdst]); + xdst += 4; + } + + if (safe) { + /* near frame corner, so do last bytes 
individually */ + uint8_t *tmp[ 3 ][ 2 ] = { + { (*blend_yuv_data)[ 0 ][ 0 ] + xovl, (*blend_yuv_data)[ 0 ][ 1 ] + xovl }, + { (*blend_yuv_data)[ 1 ][ 0 ] + xovl, (*blend_yuv_data)[ 1 ][ 1 ] + xovl }, + { (*blend_yuv_data)[ 2 ][ 0 ] + xovl, (*blend_yuv_data)[ 2 ][ 1 ] + xovl }}; + blend_yuv_exact(dst_cr + xdst, dst_cb + xdst, safe, &tmp); + } +} +#endif + +#if defined(ARCH_X86_64) +/* + * SSE2: Enabled only for 64-bit x86 to be sure assembler + * understands SSE2 instructions. + * (algorithm uses also x86_64 additional SSE + * registers xmm8...xmm15) + * + * In some older processors SSE2 is implemented as MMX uops + * -> Every SSE2 op creates two MMX uops + * -> SSE2 op takes double the time of corresponding MMX op + * -> it is usually faster to use MMX in those archs + */ +static inline void blend_yuv_exact_sse2_x64(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, + uint8_t *(*blend_yuv_data)[ 3 ][ 2 ], int safe) +{ + /* + * Almost direct copy of mmx version. + * Use 128bit registers, xmm 8...15 for constants and SSE2 comparsions. + * + * NOTE: blend_yuv_data buffers must be 16-byte aligned or movdqa raises an exception ! 
+ */ + static const sse_t lobytes = {uq:{0x00ff00ff00ff00ff,0x00ff00ff00ff00ff}}; + static const sse_t max4a = {uq:{0x003c003c003c003c,0x003c003c003c003c}}; + static const sse_t div_3c = {uq:{0x0445044504450445,0x0445044504450445}}; + register unsigned int mask; + int xovl = 0, xdst = 0; + + if (!safe) { + safe = src_width & 15; + src_width -= safe; + } + + movdqa_m2r (lobytes, xmm10); + movdqa_m2r (max4a, xmm11); + movdqa_m2r (div_3c, xmm12); + + while (xovl < src_width) { + + /* Alpha */ + pxor_r2r (xmm1, xmm1); /* xmm1: 0 */ + movdqa_m2r (((*blend_yuv_data)[ 0 ][ 0 ][ xovl ]), + xmm0); /* xmm0: alpha, line 1 */ + pxor_r2r (xmm2, xmm2); /* xmm2: 0 */ + movdqa_m2r (((*blend_yuv_data)[ 0 ][ 1 ][ xovl ]), + xmm3); /* xmm3: alpha, line 2 */ + + pcmpeqb_r2r (xmm0, xmm1); /* xmm1: compare 0 and alphas (line 1) */ + movdqa_r2r (xmm0, xmm4); /* xmm4: (copy of) alpha, line 1 */ + pcmpeqb_r2r (xmm3, xmm2); /* xmm2: compare 0 and alphas (line 2) */ + pand_r2r (xmm10, xmm0); /* xmm0: A0e A0c ... A06 A04 A02 A00 */ + pand_r2r (xmm2, xmm1); /* xmm1: compare result = 16 x ff if all pixels transparent */ + psrlw_i2r (8, xmm4); /* xmm4: A0f A0d ... A07 A05 A03 A01 */ + pmovmskb_r2a (xmm1, mask); /* mask: one bit per byte of xmm1 (16 bits) moved to GP register */ + if(mask == 0xffff) { + /* all 32 alphas (16 pixels, both lines) are 0, skip transparent area */ + xovl += 16; + xdst += 8; + continue; + } + + /* 1. 
line Cr and Cb */ + movdqa_m2r (((*blend_yuv_data)[ 1 ][ 0 ][ xovl ]), + xmm1); /* xmm1: cr, line 1 */ + movdqa_m2r (((*blend_yuv_data)[ 2 ][ 0 ][ xovl ]), + xmm2); /* xmm2: cb, line 1 */ + movdqa_r2r (xmm1, xmm5); /* xmm5: (copy of) cr, line 1 */ + movdqa_r2r (xmm2, xmm6); /* xmm6: (copy of) cb, line 1 */ + pand_r2r (xmm10, xmm1); /* xmm1: r0e r0c r0a r08 r06 r04 r02 r00 */ + pand_r2r (xmm10, xmm2); /* xmm2: b0e b0c b0a b08 b06 b04 b02 b00 */ + pmullw_r2r (xmm0, xmm1); /* xmm2: r*A even */ + pmullw_r2r (xmm0, xmm2); /* xmm4: b*A even */ + psrlw_i2r (8, xmm5); /* xmm5: r0f r0d r0b r09 r07 r05 r03 r01 */ + psrlw_i2r (8, xmm6); /* xmm6: b0f b0d b0b b09 b07 b05 b03 b01 */ + pmullw_r2r (xmm4, xmm5); /* xmm3: r*A odd pixels */ + pmullw_r2r (xmm4, xmm6); /* xmm5: b*A odd */ + + paddw_r2r (xmm4, xmm0); /* xmm0: A pairs, line 1 */ + paddw_r2r (xmm5, xmm1); /* xmm1: r*A pairs, line 1 */ + paddw_r2r (xmm6, xmm2); /* xmm2: b*A pairs, line 1 : b0*A0+b1*A1, b2*A2+b3*A3, ... */ + + /* 2. line Cr and Cb */ + movdqa_r2r (xmm3, xmm4); /* xmm4: (copy of) alpha line 2 */ + movdqa_m2r (((*blend_yuv_data)[ 1 ][ 1 ][ xovl ]), + xmm5); /* xmm5: cr, line 2 */ + pand_r2r (xmm10, xmm3); /* xmm3: A16 A14 A12 A10 */ + movdqa_m2r (((*blend_yuv_data)[ 2 ][ 1 ][ xovl ]), + xmm7); /* xmm7: cb, line 2 */ + movdqa_r2r (xmm5, xmm6); /* xmm6: (copy of) cr, line 2 */ + movdqa_r2r (xmm7, xmm8); /* xmm8: (copy of) cb, line 2 */ + psrlw_i2r (8, xmm4); /* xmm4: A17 A15 A13 A11 */ + pand_r2r (xmm10, xmm5); /* xmm5: r1e ... r14 r12 r10 */ + psrlw_i2r (8, xmm6); /* xmm6: r1f ... r15 r13 r11 */ + pmullw_r2r (xmm3, xmm5); /* xmm5: r*A even, line 2 */ + pand_r2r (xmm10, xmm7); /* xmm7: b1e ... b14 b12 b10 */ + pmullw_r2r (xmm4, xmm6); /* xmm6: r*A odd, line 2 */ + psrlw_i2r (8, xmm8); /* xmm8: b1f ... 
b15 b13 b11 */ + pmullw_r2r (xmm3, xmm7); /* xmm7: b*A even, line 2 */ + pmullw_r2r (xmm4, xmm8); /* xmm8: b*A odd, line 2 */ + paddw_r2r (xmm6, xmm5); /* xmm5: r*A pairs, line 2 */ + paddw_r2r (xmm8, xmm7); /* xmm7: b*A pairs, line 2 */ + paddw_r2r (xmm4, xmm3); /* xmm3: A pairs, line 2 */ + + paddw_r2r (xmm5, xmm1); /* xmm1: r*A qubits */ + paddw_r2r (xmm7, xmm2); /* xmm2: b*A qubits */ + movdqa_r2r (xmm11, xmm5); /* xmm5: 8x (4*0f) */ + paddw_r2r (xmm3, xmm0); /* xmm0: A qubits */ + + /* dst and blending */ + movq_m2r (dst_cr[xdst], xmm3); /* xmm3: load 4x dest cr */ + psubw_r2r (xmm0, xmm5); /* xmm5: 8x (4*0xf - a qubits) = 4* alpha of destination */ + movq_m2r (dst_cb[xdst], xmm4); /* xmm4: load 4x dest cb */ + pxor_r2r (xmm7, xmm7); + punpcklbw_r2r (xmm7, xmm3); /* xmm3: bytes -> words */ + punpcklbw_r2r (xmm7, xmm4); /* xmm4: bytes -> words */ + + pmullw_r2r (xmm5, xmm3); /* xmm3: dest cr *= (1-a) */ + xovl += 16; + pmullw_r2r (xmm5, xmm4); /* xmm4: dest cr *= (1-a) */ + + paddw_r2r (xmm3, xmm1); /* xmm1: blend cr */ + paddw_r2r (xmm4, xmm2); /* xmm2: blend cb */ + + pmulhw_r2r (xmm12, xmm1); /* xmm1: div cr by 4*0xf -> results */ + pmulhw_r2r (xmm12, xmm2); /* xmm2: div cb by 4*0xf -> results */ + + packuswb_r2r (xmm1, xmm1); /* xmm1: cr words -> bytes */ + packuswb_r2r (xmm2, xmm2); /* xmm2: cb words -> bytes */ + + movq_r2m (xmm1, dst_cr[xdst]); /* store cr */ + movq_r2m (xmm2, dst_cb[xdst]); /* store cb */ + xdst += 8; + } + + if (safe) { + /* near frame corner, so do last bytes individually */ + uint8_t *tmp[ 3 ][ 2 ] = { + { (*blend_yuv_data)[ 0 ][ 0 ] + xovl, (*blend_yuv_data)[ 0 ][ 1 ] + xovl }, + { (*blend_yuv_data)[ 1 ][ 0 ] + xovl, (*blend_yuv_data)[ 1 ][ 1 ] + xovl }, + { (*blend_yuv_data)[ 2 ][ 0 ] + xovl, (*blend_yuv_data)[ 2 ][ 1 ] + xovl }}; + blend_yuv_exact(dst_cr + xdst, dst_cb + xdst, safe, &tmp); + } +} +#endif + +#if defined(ARCH_X86_64) +/* + * SSSE3 (Supplemental Streaming SIMD Extension 3) + * + * There's one very nice instruction in 
SSSE3: + * pmaddusb [af ae ad ac ab aa a9 a8 a7 a6 a5 a4 a3 a2 a1 a0], + * [rf re rd rc rb ra r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] + * = [af*rf+ae*re, ad*rd+ac*rc, ..., a3*r3+a2*r2, a1*r1+a0*r0] + * NOTE: a vector is _signed_ bytes, r vector unsigned. + * -> result is signed (sign of a-vector elements) + * + * pmaddusb replaces 2x pmullw, paddw, 2x movdqa, 2x pand and 2x psrlw + * + * GNU assembler 2.16.1 didn't regonize ssse3 instructions, so those are + * encoded in binary: + * PMADDUBSW .byte 66 0f 38 04 /r + * + */ +static inline void blend_yuv_exact_ssse3_x64(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, + uint8_t *(*blend_yuv_data)[ 3 ][ 2 ], int safe) +{ + /* + * NOTE: blend_yuv_data buffers must be 16-byte aligned or movdqa raises an exception ! + */ + static const sse_t lobytes = {uq:{0x00ff00ff00ff00ff,0x00ff00ff00ff00ff}}; + static const sse_t max4a = {uq:{0x003c003c003c003c,0x003c003c003c003c}}; + static const sse_t div_3c = {uq:{0x0445044504450445,0x0445044504450445}}; + int xovl = 0, xdst = 0; + + if (!safe) { + safe = src_width & 15; + src_width -= safe; + } + + movdqa_m2r (lobytes, xmm8); + movdqa_m2r (max4a, xmm9); + movdqa_m2r (div_3c, xmm10); + + while (xovl < src_width) { + /* line numbers show original instruction order before scheduling */ + + /* Overlay 1. line (16 pixels A, Cb, Cr) */ + + /* 01 */ movdqa_m2r (((*blend_yuv_data)[ 0 ][ 0 ][ xovl ]), + xmm0); /* xmm0: alpha, line 1 */ + /* 02 */ movdqa_m2r (((*blend_yuv_data)[ 1 ][ 0 ][ xovl ]), + xmm1); /* xmm1: cr, line 1 */ + /* 06 */ movdqa_r2r (xmm0, xmm3); /* xmm3: (copy of) alpha, line 1 */ + /* 03 */ movdqa_m2r (((*blend_yuv_data)[ 2 ][ 0 ][ xovl ]), + xmm2); /* xmm2: cb, line 1 */ + /* 04 pmaddubsw_r2r (xmm0, xmm1); xmm1: a0f*r0f+a03*r0e, ... */ + /* */ __asm__ __volatile__ (".byte 0x66, 0x0f, 0x38, 0x04, 0xc8"); + /* 07 */ psrlw_i2r (8, xmm3); /* xmm3: a0f ... a07 a05 a03 a01 */ + /* 05 pmaddubsw_r2r (xmm0, xmm2); xmm2: a0f*b0f+a0e*b0e ... 
*/ + /* */ __asm__ __volatile__ (".byte 0x66, 0x0f, 0x38, 0x04, 0xd0"); + + /* Overlay 2. line (16 pixels A, Cb, Cr) */ + + /* 08 */ pand_r2r (xmm8, xmm0); /* xmm0: a0e ... a06 a04 a02 a00 */ + /* 10 */ movdqa_m2r (((*blend_yuv_data)[ 0 ][ 1 ][ xovl ]), + xmm4); /* xmm4: alpha, line 2 */ + /* 11 */ movdqa_m2r (((*blend_yuv_data)[ 1 ][ 1 ][ xovl ]), + xmm5); /* xmm5: cr, line 2 */ + /* 15 */ movdqa_r2r (xmm4, xmm7); /* xmm7: (copy) alphas line 2 */ + /* 09 */ paddw_r2r (xmm3, xmm0); /* xmm0: a0f+a0e ... a01+a00 */ + /* 13 pmaddubsw_r2r (xmm4, xmm5); /* xmm5: a1f*r1e+a1e*r1e ... */ + /* */ __asm__ __volatile__ (".byte 0x66, 0x0f, 0x38, 0x04, 0xec"); + /* 12 */ movdqa_m2r (((*blend_yuv_data)[ 2 ][ 1 ][ xovl ]), + xmm6); /* xmm6: cb, line 2 */ + /* 16 */ pand_r2r (xmm8, xmm7); /* xmm7: a1f ... a16 a14 a12 a10 */ + /* 14 pmaddubsw_r2r (xmm4, xmm6); xmm6: a1f*b1f+a1e*b1e ... */ + /* */ __asm__ __volatile__ (".byte 0x66, 0x0f, 0x38, 0x04, 0xf4"); + + /* Combine overlay lines */ + + /* 19 */ paddw_r2r (xmm5, xmm1); /* xmm1: r*A qubits */ + /* 17 */ psrlw_i2r (8, xmm4); /* xmm4: a1e ... a17 a15 a13 a11 */ + /* 20 */ paddw_r2r (xmm6, xmm2); /* xmm2: b*A qubits */ + /* 18 */ paddw_r2r (xmm7, xmm4); /* xmm4: a1f+a1e ... a11+a10 */ + + /* Prepare destination - 8 pixels */ + + /* 22 */ movdqa_r2r (xmm9, xmm5); /* xmm5: 0x003c 0x003c ... 
(4*0xf) */ + /* 26 */ pxor_r2r (xmm7, xmm7); /* xmm7: 0 */ + /* 21 */ paddw_r2r (xmm4, xmm0); /* xmm0: A qubits */ + /* 24 */ movq_m2r (dst_cr[xdst], xmm3);/* xmm3: load 8x dest cr */ + /* 23 */ psubw_r2r (xmm0, xmm5); /* xmm5: 4*0f-(a qubits) = dst alpha*/ + /* 25 */ movq_m2r (dst_cb[xdst], xmm4);/* xmm4: load 8x dest cb */ + /* 27 */ punpcklbw_r2r (xmm7, xmm3); /* xmm3: cb bytes -> words */ + /* 28 */ punpcklbw_r2r (xmm7, xmm4); /* xmm3: cr bytes -> words */ + /* 29 */ pmullw_r2r (xmm5, xmm3); /* xmm3: apply alpha cb *= (1-a) */ + /* 41 */ xovl += 16; + /* 30 */ pmullw_r2r (xmm5, xmm4); /* xmm4: apply alpha cr *= (1-a) */ + + /* Blend */ + + /* 31 */ paddw_r2r (xmm3, xmm1); /* xmm1: blend, cb */ + /* 32 */ paddw_r2r (xmm4, xmm2); /* xmm2: blend, cr */ + /* 33 */ pmulhw_r2r (xmm10, xmm1); /* xmm1: div xmm1 by 4*0xf (cr) */ + /* 39 */ xdst += 8; + /* 34 */ pmulhw_r2r (xmm10, xmm2); /* xmm2: div xmm2 by 4*0xf (cb) */ + /* TODO: load next line while waiting multiplications to complete ? */ + /* 35 */ packuswb_r2r (xmm1, xmm1); /* xmm1: words -> bytes, cr */ + /* 36 */ packuswb_r2r (xmm2, xmm2); /* xmm2: words -> bytes, cb */ + /* 37 */ movq_r2m (xmm1, dst_cr[xdst-8]); /* store cr */ + /* 38 */ movq_r2m (xmm2, dst_cb[xdst-8]); /* store cb */ + } + + if (safe) { + /* near frame corner, so do last bytes individually */ + uint8_t *tmp[ 3 ][ 2 ] = { + { (*blend_yuv_data)[ 0 ][ 0 ] + xovl, (*blend_yuv_data)[ 0 ][ 1 ] + xovl }, + { (*blend_yuv_data)[ 1 ][ 0 ] + xovl, (*blend_yuv_data)[ 1 ][ 1 ] + xovl }, + { (*blend_yuv_data)[ 2 ][ 0 ] + xovl, (*blend_yuv_data)[ 2 ][ 1 ] + xovl }}; + blend_yuv_exact(dst_cr + xdst, dst_cb + xdst, safe, &tmp); + } +} +#endif + +/* + * Compiling for x86_64 ? 
+ * - Use SSE2 accelerated functions (statement-like macros: caller supplies the ';') + */ +#if defined(ARCH_X86_64) +# define SSSE3_RUNTIME_DETECT 1 +# define mem_blend8(m,v,o,s) mem_blend8_sse2(m,v,o,s) +# define mem_blend32(m,v,o,s) mem_blend32_sse2(m,v,o,s) +# define blend_yuy2_y(m,v,o,s) blend_yuy2_y_sse2(m,v,o,s) +# define blend_yuv_exact(r,b,w,s) do { \ + if(ssse3) \ + blend_yuv_exact_ssse3_x64(r,b,w,s, \ + ((y < src_height-2) || ((src_width+x_off) < dst_width-8))); \ + else \ + blend_yuv_exact_sse2_x64(r,b,w,s, \ + ((y < src_height-2) || ((src_width+x_off) < dst_width-8))); \ + } while(0) +# define MMX_EXIT() + +#elif defined(ARCH_X86) +# define MMX_RUNTIME_DETECT 1 +# define mem_blend8(m,v,o,s) \ + do { \ + if (mmx) mem_blend8_mmx(m,v,o,s); \ + else mem_blend8(m,v,o,s); \ + } while(0) +# define mem_blend32(m,v,o,s) \ + do { \ + if (mmx) mem_blend32_mmx(m,v,o,s); \ + else mem_blend32(m,v,o,s); \ + } while(0) +# define blend_yuy2_y(m,v,o,s) \ + do { \ + if (mmx) blend_yuy2_y_mmx(m,v,o,s); \ + else blend_yuy2_y(m,v,o,s); \ + } while(0) +# define blend_yuv_exact(r,b,w,s) \ + do { \ + if (mmx) blend_yuv_exact_mmx(r,b,w,s, \ + ((y < src_height-2) || ((src_width+x_off) < dst_width-4))); \ + else blend_yuv_exact(r,b,w,s); \ + } while(0) +# define MMX_EXIT() \ + do { \ + if (mmx) emms(); \ + } while (0) + +#else +# define MMX_EXIT() + +#endif + /* * Some macros for fixed point arithmetic. 
@@ -894,6 +1795,9 @@ void _x_blend_rgb32 (uint8_t * img, vo_o int hili_right, hili_left; int clip_right, clip_left, clip_top; uint8_t *img_pix; +#if defined(MMX_RUNTIME_DETECT) + int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX; +#endif dy_step = INT_TO_SCALED(dst_height) / img_height; x_scale = INT_TO_SCALED(img_width) / dst_width; @@ -1041,65 +1945,8 @@ void _x_blend_rgb32 (uint8_t * img, vo_o rle = rle_start; /* y-scaling, reuse the last rle encoded line */ } } -} - -static void mem_blend8(uint8_t *mem, uint8_t val, uint8_t o, size_t sz) -{ - uint8_t *limit = mem + sz; - while (mem < limit) { - *mem = BLEND_BYTE(*mem, val, o); - mem++; - } -} - -static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, - uint8_t *(*blend_yuv_data)[ 3 ][ 2 ]) -{ - int x; - - for (x = 0; x < src_width; x += 2) { - /* get opacity of the 4 pixels that share chroma */ - int o00 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 0 ]; - int o01 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 1 ]; - int o = o00 + o01; - int o10 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 0 ]; - o += o10; - int o11 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 1 ]; - o += o11; - - /* are there any pixels a little bit opaque? */ - if (o) { - /* get the chroma components of the 4 pixels */ - int cr00 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 0 ]; - int cr01 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 1 ]; - int cr10 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 0 ]; - int cr11 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 1 ]; - - int cb00 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 0 ]; - int cb01 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 1 ]; - int cb10 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 0 ]; - int cb11 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 1 ]; - - /* are all pixels completely opaque? 
*/ - if (o >= 4*0xf) { - /* set the output chroma to the average of the four pixels */ - *dst_cr = (cr00 + cr01 + cr10 + cr11) / 4; - *dst_cb = (cb00 + cb01 + cb10 + cb11) / 4; - } else { - /* calculate transparency of background over the four pixels */ - int t4 = 4*0xf - o; - - /* blend the output chroma to the average of the four pixels */ - /* for explanation of the used equation, see blend_yuy2_exact() */ - *dst_cr = ((*dst_cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) * (0x1111+1)) >> 18; - *dst_cb = ((*dst_cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) * (0x1111+1)) >> 18; - } - } - - /* next chroma sample */ - dst_cr++; - dst_cb++; - } + + MMX_EXIT(); } static uint8_t *(*blend_yuv_grow_extra_data(alphablend_t *extra_data, int osd_width))[ 3 ][ 2 ] @@ -1109,8 +1956,20 @@ static uint8_t *(*blend_yuv_grow_extra_d int max_width; uint8_t *data[ 3 ][ 2 ]; } *header = (struct header_s *)extra_data->buffer; - - int needed_buffer_size = sizeof (*header) + osd_width * sizeof (uint8_t[ 3 ][ 2 ]); + + int needed_buffer_size; + int header_size = sizeof(*header); + int alloc_width = osd_width; +#if defined(ARCH_X86_64) + /* align buffers to dqword (16 bytes). It speeds up SSE2 blending. */ + header_size = (header_size + 15 + 16) & (~15); + alloc_width = (alloc_width + 15) & (~15); +#elif defined(ARCH_X86) + /* align buffers to qword (8 bytes). It speeds up MMX blending. 
*/ + header_size = (header_size + 7 + 8) & (~7); + alloc_width = (alloc_width + 7) & (~7); +#endif + needed_buffer_size = header_size + alloc_width * sizeof (uint8_t[ 3 ][ 2 ]); if (extra_data->buffer_size < needed_buffer_size) { @@ -1129,12 +1988,20 @@ static uint8_t *(*blend_yuv_grow_extra_d header->id = ME_FOURCC('y', 'u', 'v', 0); header->max_width = osd_width; - header->data[ 0 ][ 0 ] = ((uint8_t *)extra_data->buffer) + sizeof (*header); - header->data[ 0 ][ 1 ] = header->data[ 0 ][ 0 ] + osd_width; - header->data[ 1 ][ 0 ] = header->data[ 0 ][ 1 ] + osd_width; - header->data[ 1 ][ 1 ] = header->data[ 1 ][ 0 ] + osd_width; - header->data[ 2 ][ 0 ] = header->data[ 1 ][ 1 ] + osd_width; - header->data[ 2 ][ 1 ] = header->data[ 2 ][ 0 ] + osd_width; +#if defined(ARCH_X86_64) + header->data[ 0 ][ 0 ] = + (uint8_t*)((((unsigned long int)extra_data->buffer) + sizeof(*header) + 15) & (~15)); +#elif defined(ARCH_X86) + header->data[ 0 ][ 0 ] = + (uint8_t*)((((unsigned long int)extra_data->buffer) + sizeof(*header) + 7) & (~7)); +#else + header->data[ 0 ][ 0 ] = ((uint8_t *)extra_data->buffer) + sizeof(*header); +#endif + header->data[ 0 ][ 1 ] = header->data[ 0 ][ 0 ] + alloc_width; + header->data[ 1 ][ 0 ] = header->data[ 0 ][ 1 ] + alloc_width; + header->data[ 1 ][ 1 ] = header->data[ 1 ][ 0 ] + alloc_width; + header->data[ 2 ][ 0 ] = header->data[ 1 ][ 1 ] + alloc_width; + header->data[ 2 ][ 1 ] = header->data[ 2 ][ 0 ] + alloc_width; } return &(header->data); @@ -1164,6 +2031,11 @@ void _x_blend_yuv (uint8_t *dst_base[3], int hili_right, hili_left; int clip_right, clip_left, clip_top; uint8_t clr=0; +#if defined(MMX_RUNTIME_DETECT) + int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX; +#elif defined(SSSE3_RUNTIME_DETECT) + int ssse3 = xine_mm_accel() & MM_ACCEL_X86_SSSE3; +#endif int any_line_buffered = 0; int exact_blend_width = ((src_width <= (dst_width - x_off)) ? 
src_width : (dst_width - x_off)); @@ -1216,13 +2088,13 @@ void _x_blend_yuv (uint8_t *dst_base[3], if (exact_blend_width <= 0) return; - blend_yuv_data = blend_yuv_grow_extra_data(extra_data, exact_blend_width_m2); + blend_yuv_data = blend_yuv_grow_extra_data(extra_data, exact_blend_width_m2 + 15); if (!blend_yuv_data) return; /* make linebuffer transparent */ - memset(&(*blend_yuv_data)[ 0 ][ 0 ][ 0 ], 0, exact_blend_width_m2); - memset(&(*blend_yuv_data)[ 0 ][ 1 ][ 0 ], 0, exact_blend_width_m2); + memset(&(*blend_yuv_data)[ 0 ][ 0 ][ 0 ], 0, exact_blend_width_m2 + 15); + memset(&(*blend_yuv_data)[ 0 ][ 1 ][ 0 ], 0, exact_blend_width_m2 + 15); } rlelen=rle_remainder=0; @@ -1455,6 +2327,8 @@ void _x_blend_yuv (uint8_t *dst_base[3], blend_yuv_exact(dst_cr, dst_cb, exact_blend_width, blend_yuv_data); } } + + MMX_EXIT(); #ifdef LOG_BLEND_YUV printf("overlay_blend ended\n"); @@ -1602,6 +2476,9 @@ void _x_blend_yuy2 (uint8_t * dst_img, v int l = 0; int hili_right, hili_left; int clip_right, clip_left, clip_top; +#if defined(MMX_RUNTIME_DETECT) + int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX; +#endif union { uint32_t value; @@ -1854,11 +2731,8 @@ void _x_blend_yuy2 (uint8_t * dst_img, v dst++; } } else { - l = rle_this_bite; - while (l--) { - *dst = BLEND_BYTE(*dst, my_clut[clr].y, o); - dst += 2; - } + blend_yuy2_y(dst, my_clut[clr].y, o, rle_this_bite); + dst += rle_this_bite * 2; } } @@ -1886,6 +2760,8 @@ void _x_blend_yuy2 (uint8_t * dst_img, v dst_y += dst_pitch; } + + MMX_EXIT(); } void _x_clear_xx44_palette(xx44_palette_t *p)