# HG changeset patch # User Petri Hintukainen # Date 1177788403 -10800 # Node ID 9193085ea70a9e8642e2287be606dffce7e3db1e # Parent 77f0f0afae6b06c1144d7f48e95675e28e4972b3 Move helper functions to beginning of file (no functional changes) diff -r 77f0f0afae6b -r 9193085ea70a src/xine-engine/alphablend.c --- a/src/xine-engine/alphablend.c Sat Apr 28 20:16:53 2007 +0300 +++ b/src/xine-engine/alphablend.c Sat Apr 28 22:26:43 2007 +0300 @@ -42,6 +42,15 @@ #define BLEND_BYTE(dst, src, o) (((((src)-(dst))*(o*0x1111+1))>>16)+(dst)) +static void mem_blend8(uint8_t *mem, uint8_t val, uint8_t o, size_t sz) +{ + uint8_t *limit = mem + sz; + while (mem < limit) { + *mem = BLEND_BYTE(*mem, val, o); + mem++; + } +} + static void mem_blend16(uint16_t *mem, uint16_t clr, uint8_t o, int len) { uint16_t *limit = mem + len; while (mem < limit) { @@ -77,6 +86,56 @@ static void mem_blend32(uint8_t *mem, co mem++; *mem = BLEND_BYTE(*mem, src[3], o); mem++; + } +} + +static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, + uint8_t *(*blend_yuv_data)[ 3 ][ 2 ]) +{ + int x; + + for (x = 0; x < src_width; x += 2) { + /* get opacity of the 4 pixels that share chroma */ + int o00 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 0 ]; + int o01 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 1 ]; + int o = o00 + o01; + int o10 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 0 ]; + o += o10; + int o11 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 1 ]; + o += o11; + + /* are there any pixels a little bit opaque? */ + if (o) { + /* get the chroma components of the 4 pixels */ + int cr00 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 0 ]; + int cr01 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 1 ]; + int cr10 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 0 ]; + int cr11 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 1 ]; + + int cb00 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 0 ]; + int cb01 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 1 ]; + int cb10 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 0 ]; + int cb11 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 1 ]; + + /* are all pixels completely opaque? 
*/ + if (o >= 4*0xf) { + /* set the output chroma to the average of the four pixels */ + *dst_cr = (cr00 + cr01 + cr10 + cr11) / 4; + *dst_cb = (cb00 + cb01 + cb10 + cb11) / 4; + } else { + /* calculate transparency of background over the four pixels */ + int t4 = 4*0xf - o; + + /* blend the output chroma to the average of the four pixels */ + /* for explanation of the used equation, see blend_yuy2_exact() */ + *dst_cr = ((*dst_cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) * (0x1111+1)) >> 18; + *dst_cb = ((*dst_cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) * (0x1111+1)) >> 18; + } + } + + /* next chroma sample */ + dst_cr++; + dst_cb++; } } @@ -1040,65 +1099,6 @@ void _x_blend_rgb32 (uint8_t * img, vo_o } else { rle = rle_start; /* y-scaling, reuse the last rle encoded line */ } - } -} - -static void mem_blend8(uint8_t *mem, uint8_t val, uint8_t o, size_t sz) -{ - uint8_t *limit = mem + sz; - while (mem < limit) { - *mem = BLEND_BYTE(*mem, val, o); - mem++; - } -} - -static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, - uint8_t *(*blend_yuv_data)[ 3 ][ 2 ]) -{ - int x; - - for (x = 0; x < src_width; x += 2) { - /* get opacity of the 4 pixels that share chroma */ - int o00 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 0 ]; - int o01 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 1 ]; - int o = o00 + o01; - int o10 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 0 ]; - o += o10; - int o11 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 1 ]; - o += o11; - - /* are there any pixels a little bit opaque? 
*/ - if (o) { - /* get the chroma components of the 4 pixels */ - int cr00 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 0 ]; - int cr01 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 1 ]; - int cr10 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 0 ]; - int cr11 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 1 ]; - - int cb00 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 0 ]; - int cb01 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 1 ]; - int cb10 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 0 ]; - int cb11 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 1 ]; - - /* are all pixels completely opaque? */ - if (o >= 4*0xf) { - /* set the output chroma to the average of the four pixels */ - *dst_cr = (cr00 + cr01 + cr10 + cr11) / 4; - *dst_cb = (cb00 + cb01 + cb10 + cb11) / 4; - } else { - /* calculate transparency of background over the four pixels */ - int t4 = 4*0xf - o; - - /* blend the output chroma to the average of the four pixels */ - /* for explanation of the used equation, see blend_yuy2_exact() */ - *dst_cr = ((*dst_cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) * (0x1111+1)) >> 18; - *dst_cb = ((*dst_cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) * (0x1111+1)) >> 18; - } - } - - /* next chroma sample */ - dst_cr++; - dst_cb++; } } # HG changeset patch # User Petri Hintukainen # Date 1177788625 -10800 # Node ID fed19a7cf1ed70a50e953f8e690f7a31627413b9 # Parent 9193085ea70a9e8642e2287be606dffce7e3db1e MMX implementation of mem_blend8 function diff -r 9193085ea70a -r fed19a7cf1ed src/xine-engine/alphablend.c --- a/src/xine-engine/alphablend.c Sat Apr 28 22:26:43 2007 +0300 +++ b/src/xine-engine/alphablend.c Sat Apr 28 22:30:25 2007 +0300 @@ -50,6 +50,66 @@ static void mem_blend8(uin mem++; } } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void mem_blend8_mmx(uint8_t *mem, int val, int o, size_t sz) /* blends val into the sz bytes at mem; the MMX loop computes (val*o + byte*(0xf-o)) / 0xf with an approximated division, leftover bytes use BLEND_BYTE */ +{ + uint8_t *limit = mem + sz; + + if (sz > 3) { + /* + * MMX 
register allocation: + * mm7: zero + * mm6: division approximation constant (4x 0x1112) + * mm5: alpha for destination (4x 0xf-A) + * 
mm4: overlay value * overlay alpha (4x V*A) + */ + static const mmx_t max_o = {uq:0x000f000f000f000fULL}; + static const mmx_t div_0f = {uq:0x1112111211121112ULL}; + + uint8_t *limit4 = mem + (sz & (~3)); + + movd_a2r (o, mm0); /* mm0 = 0 0 0 A */ + pxor_r2r (mm7, mm7); /* mm7 = 0 0 0 0 */ + + movd_a2r (val, mm4); /* mm4 = 0 0 0 V */ +# if defined(__SSE__) + pshufw_r2r (mm0, mm0, 0); /* mm0 = A A A A */ + + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + pshufw_r2r (mm4, mm4, 0); /* mm4 = V V V V */ + + psubw_r2r (mm0, mm5); /* mm5 = 0xf-A 0xf-A 0xf-A 0xf-A */ +# else + punpcklwd_r2r (mm0, mm0); /* mm0 = 0 0 A A */ + + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + punpcklwd_r2r (mm0, mm0); /* mm0 = A A A A */ + + punpcklwd_r2r (mm4, mm4); /* mm4 = 0 0 V V */ + psubw_r2r (mm0, mm5); /* mm5 = 0xf-A 0xf-A 0xf-A 0xf-A */ + + punpcklwd_r2r (mm4, mm4); /* mm4 = V V V V */ +# endif + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + pmullw_r2r (mm0, mm4); /* mm4 = V*A V*A V*A V*A */ + + for ( ; mem < limit4 ; mem += 4) { + movd_m2r (*mem, mm0); /* mm0 = D3 D2 D1 D0 */ + punpcklbw_r2r (mm7, mm0); /* mm0 = D3 D2 D1 D0 (bytes -> words) */ + pmullw_r2r (mm5, mm0); /* mm0 = D3*iA D2*iA D1*iA D0*iA */ + paddw_r2r (mm4, mm0); /* mm0 = 4x (V*A + D*iA) */ + pmulhw_r2r (mm6, mm0); /* mm0 = R3 R2 R1 R0 (div by 0xf = (X*0x1112)>>16) */ + packuswb_r2r (mm0, mm0); /* mm0 = R3 R2 R1 R0 (words -> bytes) */ + movd_r2m (mm0, *mem); /* store */ + } + sz &= 3; /* sz is not read again: the tail loop below is bounded by limit */ + } + + for (; mem < limit; mem++) + *mem = BLEND_BYTE(*mem, val, o); +} +#endif static void mem_blend16(uint16_t *mem, uint16_t clr, uint8_t o, int len) { uint16_t *limit = mem + len; # HG changeset patch # User Petri Hintukainen # Date 1177788675 -10800 # Node ID 303a8ebe44c45eb5979271915b6ce3133ee1fdb8 # Parent fed19a7cf1ed70a50e953f8e690f7a31627413b9 MMX implementation of mem_blend32 function diff -r fed19a7cf1ed -r 303a8ebe44c4 
src/xine-engine/alphablend.c --- a/src/xine-engine/alphablend.c Sat Apr 28 22:30:25 2007 +0300 +++ b/src/xine-engine/alphablend.c Sat Apr 28 22:31:15 2007 
+0300 @@ -148,6 +148,56 @@ static void mem_blend32(uint8_t *mem, co mem++; } } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void mem_blend32_mmx(uint8_t *mem, const uint8_t *src, int o, int len) { /* blends the 4 bytes at src into each of the len 4-byte groups at mem */ + /* + * MMX register allocation: + * mm7: zero + * mm6: division approximation constant (4x 0x1112) + * mm5: alpha for destination (4x 0xf-A) + * mm4: overlay * overlay alpha + */ + static const mmx_t max_o = {uq:0x000f000f000f000fULL}; + static const mmx_t div_0f = {uq:0x1112111211121112ULL}; + + uint8_t *limit = mem + len*4; + + movd_a2r (o, mm0); /* mm0 = 0 0 0 A */ + pxor_r2r (mm7, mm7); /* mm7 = 0 0 0 0 */ + +# if defined(__SSE__) + movd_m2r (*src, mm4); /* mm4 = V3 V2 V1 V0 */ + pshufw_r2r (mm0, mm0, 0); /* mm0 = A A A A */ + + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + punpcklbw_r2r (mm7, mm4); /* mm4 = V3 V2 V1 V0 (bytes -> words) */ +# else + punpcklwd_r2r (mm0, mm0); /* mm0 = 0 0 A A */ + movd_m2r (*src, mm4); /* mm4 = V3 V2 V1 V0 */ + + punpcklbw_r2r (mm7, mm4); /* mm4 = V3 V2 V1 V0 (bytes -> words) */ + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + + punpcklwd_r2r (mm0, mm0); /* mm0 = A A A A */ +# endif + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + + psubw_r2r (mm0, mm5); /* mm5 = f-A f-A f-A f-A */ + pmullw_r2r (mm0, mm4); /* mm4 = V3*A V2*A V1*A V0*A */ + + for ( ; mem < limit ; mem += 4) { + movd_m2r (*mem, mm0); /* mm0 = D3 D2 D1 D0 */ + punpcklbw_r2r (mm7, mm0); /* mm0 = D3 D2 D1 D0 (bytes -> words) */ + pmullw_r2r (mm5, mm0); /* mm0 = D3*iA D2*iA D1*iA D0*iA */ + paddw_r2r (mm4, mm0); /* mm0 = 4x (V*A + D*iA) */ + pmulhw_r2r (mm6, mm0); /* div by 0xf */ + /* mm0 = R3 R2 R1 R0 */ + packuswb_r2r (mm0, mm0); /* mm0 = R3 R2 R1 R0 */ + movd_r2m (mm0, *mem); /* store */ + } +} +#endif static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, uint8_t *(*blend_yuv_data)[ 3 ][ 2 ]) # HG changeset patch # User Petri 
Hintukainen # Date 1177788755 -10800 # Node ID fc3162d00e5a4daa79378041dce26f12669f4be7 # Parent 
303a8ebe44c45eb5979271915b6ce3133ee1fdb8 MMX implementation of blend_yuv_exact function diff -r 303a8ebe44c4 -r fc3162d00e5a src/xine-engine/alphablend.c --- a/src/xine-engine/alphablend.c Sat Apr 28 22:31:15 2007 +0300 +++ b/src/xine-engine/alphablend.c Sat Apr 28 22:32:35 2007 +0300 @@ -248,6 +248,137 @@ static void blend_yuv_exact(uint8_t *dst dst_cb++; } } + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void blend_yuv_exact_mmx(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, + uint8_t *(*blend_yuv_data)[ 3 ][ 2 ], int safe) +{ + static const mmx_t lobytes = {uq:0x00ff00ff00ff00ff}; + static const mmx_t div_3c = {uq:0x0445044504450445}; + static const mmx_t max4a = {uq:0x003c003c003c003c}; + int xovl = 0, xdst = 0; + + /* + Code blends pixels outside of the overlay's right border + if the safe flag is set (and the overlay width is not a + multiple of 8). + We don't care, as it has no effect on the result: + the few extra pixels in blend_yuv_data are already marked + as transparent and those pixels in the frame stay untouched. + Of course, the rest of the pixels could be blended one by one, + but that is slower and not necessary. + */ + + if (!safe) { + safe = src_width & 7; /* from here on, safe holds the width left over for the scalar tail */ + src_width -= safe; + } + + while (xovl < src_width) { + + /* 1. 
lines */ + + movq_m2r (((*blend_yuv_data)[ 0 ][ 0 ][ xovl ]), mm0); /* load 8x alpha, line 1 */ + movq_m2r (lobytes, mm7); + movq_r2r (mm0, mm4); + pand_r2r (mm7, mm0); /* mm0: a06 a04 a02 a00 */ + psrlw_i2r (8, mm4); /* mm4: a07 a05 a03 a01 */ + /* used mm: 0, 4 */ + + movq_m2r (((*blend_yuv_data)[ 1 ][ 0 ][ xovl ]), mm1); /* load 8x cr, line 1 */ + movq_r2r (mm1, mm5); + pand_r2r (mm7, mm1) ; /* mm1: [ r06 r04 r02 r00 ] */ + psrlw_i2r (8, mm5); /* mm5: [ r07 r05 r03 r01 ] */ + /* used mm: 0, 1, 4, 5 */ + + movq_m2r (((*blend_yuv_data)[ 2 ][ 0 ][ xovl ]), mm2); /* load 8x cb, line 1 */ + movq_r2r (mm2, mm6); + pand_r2r (mm7, mm2); /* mm2: [ b06 b04 b02 b00 ] */ + psrlw_i2r (8, mm6); /* mm6: [ b07 b05 b03 b01 ] */ + /* used mm: 0, 1, 2, 4, 5, 6 */ + + pmullw_r2r (mm0, mm1); /* mm1: [ r06*a06 r04*a04 r02*a02 r00*a00 ] */ + pmullw_r2r (mm4, mm5); /* mm5: [ r07*a07 r05*a05 r03*a03 r01*a01 ] */ + pmullw_r2r (mm0, mm2); /* mm2: [ b06*a06 b04*a04 b02*a02 b00*a00 ] */ + pmullw_r2r (mm4, mm6); /* mm6: [ b07*a07 b05*a05 b03*a03 b01*a01 ] */ + + paddw_r2r (mm4, mm0); /* mm0: [ a07+a06 a05+a04 a03+a02 a01+a00 ] */ + paddw_r2r (mm5, mm1); /* mm1: [ r07*a07+r06*a06 r05*a05+r04*a04 ... ] */ + paddw_r2r (mm6, mm2); /* mm2: [ b07*a07+b06*a06 b05*a05+b04*a04 ... ] */ + /* used mm: 0, 1, 2 */ + + /* 2. 
lines */ + + movq_m2r (((*blend_yuv_data)[ 0 ][ 1 ][ xovl ]), mm3); /* load 8x alpha, line 2 */ + movq_r2r (mm3, mm4); + pand_r2r (mm7, mm3); /* mm3: A16 A14 A12 A10 */ + psrlw_i2r (8, mm4); /* mm4: A17 A15 A13 A11 */ + + movq_m2r (((*blend_yuv_data)[ 1 ][ 1 ][ xovl ]), mm5); /* load 8x cr, line 2 */ + movq_r2r (mm5, mm6); + pand_r2r (mm7, mm5); /* mm5: r16 r14 r12 r10 */ + psrlw_i2r (8, mm6); /* mm6: r17 r15 r13 r11 */ + + pmullw_r2r (mm3, mm5); /* mm5: [ r16*a16 r14*a14 r12*a12 r10*a10 ] */ + pmullw_r2r (mm4, mm6); /* mm6: [ r17*a17 r15*a15 r13*a13 r11*a11 ] */ + paddw_r2r (mm6, mm5); /* mm5: r*A pairs, line 2 */ + + movq_m2r (((*blend_yuv_data)[ 2 ][ 1 ][ xovl ]), mm6); /* load 8x cb, line 2 */ + movq_r2r (mm6, mm7); + pand_m2r (lobytes, mm6); /* mm6: b16 b14 b12 b10 */ + psrlw_i2r (8, mm7); /* mm7: b17 b15 b13 b11 */ + + pmullw_r2r (mm3, mm6); /* mm6: [ b16*a16 b14*a14 b12*a12 b10*a10 ] */ + pmullw_r2r (mm4, mm7); /* mm7: [ b17*a17 b15*a15 b13*a13 b11*a11 ] */ + paddw_r2r (mm7, mm6); /* mm6: b*A pairs, line 2 */ + + paddw_r2r (mm4, mm3); /* mm3: [ a17+a16 a15+a14 a13+a12 a11+a10 ] */ + + paddw_r2r (mm3, mm0); /* mm0: A quads */ + paddw_r2r (mm5, mm1); /* mm1: r*A quads */ + paddw_r2r (mm6, mm2); /* mm2: b*A quads */ + + movq_m2r (max4a, mm5); /* mm5: 4*0xf */ + psubw_r2r (mm0, mm5); /* mm5: 4*0xf - A quads */ + + /* dst */ + + movd_m2r (dst_cr[xdst], mm3); + movd_m2r (dst_cb[xdst], mm4); + pxor_r2r (mm7, mm7); + punpcklbw_r2r (mm7, mm3); /* bytes -> words */ + punpcklbw_r2r (mm7, mm4); + + pmullw_r2r (mm5, mm3); /* *= (4*0xf - A) */ + xovl += 8; + pmullw_r2r (mm5, mm4); + + movq_m2r (div_3c, mm6); + + paddw_r2r (mm3, mm1); /* blend */ + paddw_r2r (mm4, mm2); + + pmulhw_r2r (mm6, mm1); /* div by 4*0xf */ + xdst += 4; + pmulhw_r2r (mm6, mm2); + + packuswb_r2r (mm1, mm1); /* words -> bytes */ + packuswb_r2r (mm2, mm2); + + movd_r2m (mm1, dst_cr[xdst-4]); /* store */ + movd_r2m (mm2, dst_cb[xdst-4]); + } + + if (safe) { + /* near frame corner, so do last bytes 
individually. NOTE(review): this branch is also entered when the caller passed a non-zero safe flag, with that flag value used as the tail width -- confirm this is intended */ + uint8_t *tmp[ 3 ][ 2 ] = { + { (*blend_yuv_data)[ 0 ][ 0 ] + xovl, (*blend_yuv_data)[ 0 ][ 1 ] + xovl }, + { (*blend_yuv_data)[ 1 ][ 0 ] + xovl, (*blend_yuv_data)[ 1 ][ 1 ] + xovl }, + { (*blend_yuv_data)[ 2 ][ 0 ] + xovl, (*blend_yuv_data)[ 2 ][ 1 ] + xovl }}; + blend_yuv_exact(dst_cr + xdst, dst_cb + xdst, safe, &tmp); + } +} +#endif /* * Some macros for fixed point arithmetic. # HG changeset patch # User Petri Hintukainen # Date 1177788891 -10800 # Node ID 6400257f5e054927474e984e686cdb6586d71e7b # Parent fc3162d00e5a4daa79378041dce26f12669f4be7 Added blend_yuy2_y: alphablend Y component of yuy2 Added MMX version of blend_yuy2_y function diff -r fc3162d00e5a -r 6400257f5e05 src/xine-engine/alphablend.c --- a/src/xine-engine/alphablend.c Sat Apr 28 22:32:35 2007 +0300 +++ b/src/xine-engine/alphablend.c Sat Apr 28 22:34:51 2007 +0300 @@ -195,6 +195,91 @@ static inline void mem_blend32_mmx(uint8 /* mm0 = R3 R2 R1 R0 */ packuswb_r2r (mm0, mm0); /* mm0 = R3 R2 R1 R0 */ movd_r2m (mm0, *mem); /* store */ + } +} +#endif + +static void blend_yuy2_y(uint8_t *mem, uint8_t val, uint8_t o, int len) { /* blends val into every second byte starting at mem, len bytes in total */ + uint8_t *limit = mem + len*2; + while (mem < limit) { + *mem = BLEND_BYTE(*mem, val, o); + mem += 2; + } +} + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline void blend_yuy2_y_mmx(uint8_t *dst, int y, int o, size_t sz) +{ + int i = sz & (~3); /* number of Y samples the MMX loop handles, 4 (= 8 bytes) per pass */ + + if (i) { + /* + * MMX register allocation: + * mm7: Y byte mask + * mm6: division constant + * mm5: dst alpha (0xf-A) + * mm4: U/V bytes mask + * mm3: y*alpha + * mm2: (stored U/V values) + * mm0: (calculation) + */ + static const mmx_t y_mask = {uq:0x00ff00ff00ff00ff}; + static const mmx_t uv_mask = 
{uq:0xff00ff00ff00ff00}; + static const mmx_t div_0f = {uq:0x1112111211121112}; + static const mmx_t max_o = {uq:0x000f000f000f000f}; + + movd_a2r (o, mm1); /* mm1 = 0 0 0 o */ + + movd_a2r (y, mm3); /* mm3 = 0 0 0 y */ +# if defined(__SSE__) + pshufw_r2r (mm1, mm1, 0); /* mm1 = o o o o */ + + movq_m2r 
(uv_mask, mm4); /* mm4 = ff 00 ff 00 ff 00 ff 00 */ + pshufw_r2r (mm3, mm3, 0); /* mm3 = y y y y */ + + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + pmullw_r2r (mm1, mm3); /* mm3 = y*o y*o y*o y*o */ + + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + psubw_r2r (mm1, mm5); /* mm5 = 0xf-o 0xf-o 0xf-o 0xf-o */ + + movq_m2r (y_mask, mm7); /* mm7 = 00 ff 00 ff 00 ff 00 ff */ +# else + punpcklwd_r2r (mm1, mm1); /* mm1 = 0 0 o o */ + + movq_m2r (max_o, mm5); /* mm5 = 0xf 0xf 0xf 0xf */ + punpcklwd_r2r (mm1, mm1); /* mm1 = o o o o */ + + movq_m2r (uv_mask, mm4); /* mm4 = ff 00 ff 00 ff 00 ff 00 */ + punpcklwd_r2r (mm3, mm3); /* mm3 = 0 0 y y */ + + movq_m2r (div_0f, mm6); /* mm6 = x1112 x1112 x1112 x1112 */ + punpcklwd_r2r (mm3, mm3); /* mm3 = y y y y */ + + movq_m2r (y_mask, mm7); /* mm7 = 00 ff 00 ff 00 ff 00 ff */ + pmullw_r2r (mm1, mm3); /* mm3 = y*o y*o y*o y*o */ + + psubw_r2r (mm1, mm5); /* mm5 = 0xf-o 0xf-o 0xf-o 0xf-o */ +# endif + + do { + movq_m2r (*dst, mm0); /* mm0 = cb y3 cr y2 cb y1 cr y0 */ + movq_r2r (mm0, mm2); /* save cr/cb */ + pand_r2r (mm7, mm0); /* mm0 = 0 y3 0 y2 0 y1 0 y0 */ + pmullw_r2r (mm5, mm0); /* y[] *= (0xf-o) */ + pand_r2r (mm4, mm2); /* drop Y from saved cr/cb vector */ + paddw_r2r (mm3, mm0); /* blend */ + pmulhw_r2r (mm6, mm0); /* div by 0xf (= (X*0x1112)>>16) */ + por_r2r (mm2, mm0); /* merge the saved cr/cb bytes back in */ + movq_r2m (mm0, *dst); /* store */ + dst += 8; + } while (i -= 4); + sz &= 3; + } + + while(sz--) { + *dst = BLEND_BYTE(*dst, y, o); + dst += 2; } } #endif # HG changeset patch # User Petri Hintukainen # Date 1177789416 -10800 # Node ID d3c05ae7efb0f485126177bc288bc86331e044e2 # Parent 6400257f5e054927474e984e686cdb6586d71e7b MMX compile-time detection and runtime detection: use new MMX blending functions diff -r 6400257f5e05 -r d3c05ae7efb0 src/xine-engine/alphablend.c --- a/src/xine-engine/alphablend.c Sat Apr 28 22:34:51 2007 +0300 +++ b/src/xine-engine/alphablend.c Sat Apr 28 22:43:36 
2007 +0300 @@ -464,6 
+464,52 @@ static inline void blend_yuv_exact_mmx(u } } #endif + +/* + * MMX enabled at compile-time ? + * - Use accelerated function without run-time detection; otherwise, on ARCH_X86, the wrappers below choose at run time through a variable named mmx in the calling scope. The blend_yuv_exact() wrappers also read y, src_height, src_width, x_off and dst_width from the calling scope. + */ +#if defined(__MMX__) || defined(ARCH_X86_64) +# define mem_blend8(m,v,o,s) mem_blend8_mmx(m,v,o,s) +# define mem_blend32(m,v,o,s) mem_blend32_mmx(m,v,o,s) +# define blend_yuy2_y(m,v,o,s) blend_yuy2_y_mmx(m,v,o,s) +# define blend_yuv_exact(r,b,w,s) blend_yuv_exact_mmx(r,b,w,s, \ + ((y < src_height-2) || ((src_width+x_off) < dst_width-4))) +# define MMX_EXIT() emms() + +#elif defined(ARCH_X86) +# define MMX_RUNTIME_DETECT 1 +# define mem_blend8(m,v,o,s) \ + do { \ + if (mmx) mem_blend8_mmx(m,v,o,s); \ + else mem_blend8(m,v,o,s); \ + } while(0) +# define mem_blend32(m,v,o,s) \ + do { \ + if (mmx) mem_blend32_mmx(m,v,o,s); \ + else mem_blend32(m,v,o,s); \ + } while(0) +# define blend_yuy2_y(m,v,o,s) \ + do { \ + if (mmx) blend_yuy2_y_mmx(m,v,o,s); \ + else blend_yuy2_y(m,v,o,s); \ + } while(0) +# define blend_yuv_exact(r,b,w,s) \ + do { \ + if (mmx) blend_yuv_exact_mmx(r,b,w,s, \ + ((y < src_height-2) || ((src_width+x_off) < dst_width-4))); \ + else blend_yuv_exact(r,b,w,s); \ + } while(0) +# define MMX_EXIT() \ + do { \ + if (mmx) emms(); \ + } while (0) + +#else +# define MMX_EXIT() /* no MMX wrappers are defined in this branch, so this expands to nothing */ + +#endif + /* * Some macros for fixed point arithmetic. 
@@ -1279,6 +1325,9 @@ void _x_blend_rgb32 (uint8_t * img, vo_o int hili_right, hili_left; int clip_right, clip_left, clip_top; uint8_t *img_pix; +#if defined(MMX_RUNTIME_DETECT) + int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX; /* read by the run-time dispatch macros and MMX_EXIT() */ +#endif dy_step = INT_TO_SCALED(dst_height) / img_height; x_scale = INT_TO_SCALED(img_width) / dst_width; @@ -1426,6 +1475,8 @@ void _x_blend_rgb32 (uint8_t * img, vo_o rle = rle_start; /* y-scaling, reuse the last rle encoded line */ } } + + MMX_EXIT(); /* expands to emms() when the MMX wrappers are in use */ } static uint8_t *(*blend_yuv_grow_extra_data(alphablend_t *extra_data, int osd_width))[ 3 ][ 2 ] @@ -1490,6 +1541,9 @@ void _x_blend_yuv (uint8_t *dst_base[3], int hili_right, hili_left; int clip_right, clip_left, clip_top; uint8_t clr=0; +#if defined(MMX_RUNTIME_DETECT) + int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX; /* read by the run-time dispatch macros and MMX_EXIT() */ +#endif int any_line_buffered = 0; int exact_blend_width = ((src_width <= (dst_width - x_off)) ? 
src_width : (dst_width - x_off)); @@ -1781,6 +1835,8 @@ void _x_blend_yuv (uint8_t *dst_base[3], blend_yuv_exact(dst_cr, dst_cb, exact_blend_width, blend_yuv_data); } } + + MMX_EXIT(); /* expands to emms() when the MMX wrappers are in use */ #ifdef LOG_BLEND_YUV printf("overlay_blend ended\n"); @@ -1928,6 +1984,9 @@ void _x_blend_yuy2 (uint8_t * dst_img, v int l = 0; int hili_right, hili_left; int clip_right, clip_left, clip_top; +#if defined(MMX_RUNTIME_DETECT) + int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX; /* read by the run-time dispatch macros and MMX_EXIT() */ +#endif union { uint32_t value; @@ -2180,11 +2239,8 @@ void _x_blend_yuy2 (uint8_t * dst_img, v dst++; } } else { - l = rle_this_bite; - while (l--) { - *dst = BLEND_BYTE(*dst, my_clut[clr].y, o); - dst += 2; - } + blend_yuy2_y(dst, my_clut[clr].y, o, rle_this_bite); + dst += rle_this_bite * 2; } } @@ -2212,6 +2268,8 @@ void _x_blend_yuy2 (uint8_t * dst_img, v dst_y += dst_pitch; } + + MMX_EXIT(); /* expands to emms() when the MMX wrappers are in use */ } void _x_clear_xx44_palette(xx44_palette_t *p) # HG changeset patch # User Petri Hintukainen # Date 1177789529 -10800 # Node ID 27ed042554b84be540e27763ec91cc283b40a430 # Parent d3c05ae7efb0f485126177bc288bc86331e044e2 Set beginning of 
temporary UVA planes buffers to 8 or 16 byte boundary to speed up MMX/SSE2 blending diff -r d3c05ae7efb0 -r 27ed042554b8 src/xine-engine/alphablend.c --- a/src/xine-engine/alphablend.c Sat Apr 28 22:43:36 2007 +0300 +++ b/src/xine-engine/alphablend.c Sat Apr 28 22:45:29 2007 +0300 @@ -1486,8 +1486,20 @@ static uint8_t *(*blend_yuv_grow_extra_d int max_width; uint8_t *data[ 3 ][ 2 ]; } *header = (struct header_s *)extra_data->buffer; - - int needed_buffer_size = sizeof (*header) + osd_width * sizeof (uint8_t[ 3 ][ 2 ]); + + int needed_buffer_size; + int header_size = sizeof(*header); + int alloc_width = osd_width; +#if defined(ARCH_X86_64) + /* align buffers to dqword (16 bytes). It speeds up SSE2 blending. NOTE(review): alloc_width only enters the size computation; the plane pointers visible below are spaced by osd_width, so only the first one is rounded to the boundary -- confirm this is intended */ + header_size = (header_size + 15 + 16) & (~15); + alloc_width = (alloc_width + 15) & (~15); +#elif defined(ARCH_X86) + /* align buffers to qword (8 bytes). It speeds up MMX blending. 
*/ + header_size = (header_size + 7 + 8) & (~7); + alloc_width = (alloc_width + 7) & (~7); +#endif + needed_buffer_size = header_size + alloc_width * sizeof (uint8_t[ 3 ][ 2 ]); if (extra_data->buffer_size < needed_buffer_size) { @@ -1506,7 +1518,15 @@ static uint8_t *(*blend_yuv_grow_extra_d header->id = ME_FOURCC('y', 'u', 'v', 0); header->max_width = osd_width; - header->data[ 0 ][ 0 ] = ((uint8_t *)extra_data->buffer) + sizeof (*header); +#if defined(ARCH_X86_64) + header->data[ 0 ][ 0 ] = + (uint8_t*)((((unsigned long int)extra_data->buffer) + sizeof(*header) + 15) & (~15)); /* first plane starts at the next 16-byte boundary after the header; NOTE(review): the pointer is converted through unsigned long int, uintptr_t would be the portable type */ +#elif defined(ARCH_X86) + header->data[ 0 ][ 0 ] = + (uint8_t*)((((unsigned long int)extra_data->buffer) + sizeof(*header) + 7) & (~7)); /* first plane starts at the next 8-byte boundary after the header */ +#else + header->data[ 0 ][ 0 ] = ((uint8_t *)extra_data->buffer) + sizeof(*header); +#endif header->data[ 0 ][ 1 ] = header->data[ 0 ][ 0 ] + osd_width; header->data[ 1 ][ 0 ] = header->data[ 0 ][ 1 ] + osd_width; header->data[ 1 ][ 1 ] = header->data[ 1 ][ 0 ] + osd_width; @@ -1596,13 +1616,13 @@ void _x_blend_yuv (uint8_t *dst_base[3], if (exact_blend_width <= 0) 
return; - blend_yuv_data = blend_yuv_grow_extra_data(extra_data, exact_blend_width_m2); + blend_yuv_data = blend_yuv_grow_extra_data(extra_data, exact_blend_width_m2 + 15); /* 15 spare bytes per plane; presumably slack for the MMX blender reading past the overlay width -- confirm */ if (!blend_yuv_data) return; /* make linebuffer transparent */ - memset(&(*blend_yuv_data)[ 0 ][ 0 ][ 0 ], 0, exact_blend_width_m2); - memset(&(*blend_yuv_data)[ 0 ][ 1 ][ 0 ], 0, exact_blend_width_m2); + memset(&(*blend_yuv_data)[ 0 ][ 0 ][ 0 ], 0, exact_blend_width_m2 + 15); + memset(&(*blend_yuv_data)[ 0 ][ 1 ][ 0 ], 0, exact_blend_width_m2 + 15); } rlelen=rle_remainder=0;