# HG changeset patch
# User Petri Hintukainen <phintuka@users.sourceforge.net>
# Date 1177788403 -10800
# Node ID 9193085ea70a9e8642e2287be606dffce7e3db1e
# Parent  77f0f0afae6b06c1144d7f48e95675e28e4972b3
Move helper functions to beginning of file (no functional changes)

diff -r 77f0f0afae6b -r 9193085ea70a src/xine-engine/alphablend.c
--- a/src/xine-engine/alphablend.c	Sat Apr 28 20:16:53 2007 +0300
+++ b/src/xine-engine/alphablend.c	Sat Apr 28 22:26:43 2007 +0300
@@ -42,6 +42,15 @@
 
 #define BLEND_BYTE(dst, src, o) (((((src)-(dst))*(o*0x1111+1))>>16)+(dst))
 
+static void mem_blend8(uint8_t *mem, uint8_t val, uint8_t o, size_t sz)
+{
+  uint8_t *limit = mem + sz;
+  while (mem < limit) {
+    *mem = BLEND_BYTE(*mem, val, o);
+    mem++;
+  }
+}
+
 static void mem_blend16(uint16_t *mem, uint16_t clr, uint8_t o, int len) {
   uint16_t *limit = mem + len;
   while (mem < limit) {
@@ -77,6 +86,56 @@ static void mem_blend32(uint8_t *mem, co
     mem++;
     *mem = BLEND_BYTE(*mem, src[3], o);
     mem++;
+  }
+}
+
+static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, 
+                            uint8_t *(*blend_yuv_data)[ 3 ][ 2 ])
+{
+  int x;
+  
+  for (x = 0; x < src_width; x += 2) {
+    /* get opacity of the 4 pixels that share chroma */
+    int o00 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 0 ];
+    int o01 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 1 ];
+    int o = o00 + o01;
+    int o10 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 0 ];
+    o += o10;
+    int o11 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 1 ];
+    o += o11;
+
+    /* are there any pixels a little bit opaque? */
+    if (o) {
+      /* get the chroma components of the 4 pixels */
+      int cr00 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 0 ];
+      int cr01 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 1 ];
+      int cr10 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 0 ];
+      int cr11 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 1 ];
+          
+      int cb00 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 0 ];
+      int cb01 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 1 ];
+      int cb10 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 0 ];
+      int cb11 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 1 ];
+
+      /* are all pixels completely opaque? */
+      if (o >= 4*0xf) {
+        /* set the output chroma to the average of the four pixels */
+        *dst_cr = (cr00 + cr01 + cr10 + cr11) / 4;
+        *dst_cb = (cb00 + cb01 + cb10 + cb11) / 4;
+      } else {
+        /* calculate transparency of background over the four pixels */
+        int t4 = 4*0xf - o;
+
+        /* blend the output chroma to the average of the four pixels */
+        /* for explanation of the used equation, see blend_yuy2_exact() */
+        *dst_cr = ((*dst_cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) * (0x1111+1)) >> 18;
+        *dst_cb = ((*dst_cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) * (0x1111+1)) >> 18;
+      }
+    }
+
+    /* next chroma sample */
+    dst_cr++;
+    dst_cb++;
   }
 }
 
@@ -1040,65 +1099,6 @@ void _x_blend_rgb32 (uint8_t * img, vo_o
     } else {
       rle = rle_start;		/* y-scaling, reuse the last rle encoded line */
     }
-  }
-}
-
-static void mem_blend8(uint8_t *mem, uint8_t val, uint8_t o, size_t sz)
-{
-  uint8_t *limit = mem + sz;
-  while (mem < limit) {
-    *mem = BLEND_BYTE(*mem, val, o);
-    mem++;
-  }
-}
-
-static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, 
-                            uint8_t *(*blend_yuv_data)[ 3 ][ 2 ])
-{
-  int x;
-  
-  for (x = 0; x < src_width; x += 2) {
-    /* get opacity of the 4 pixels that share chroma */
-    int o00 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 0 ];
-    int o01 = (*blend_yuv_data)[ 0 ][ 0 ][ x + 1 ];
-    int o = o00 + o01;
-    int o10 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 0 ];
-    o += o10;
-    int o11 = (*blend_yuv_data)[ 0 ][ 1 ][ x + 1 ];
-    o += o11;
-
-    /* are there any pixels a little bit opaque? */
-    if (o) {
-      /* get the chroma components of the 4 pixels */
-      int cr00 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 0 ];
-      int cr01 = (*blend_yuv_data)[ 1 ][ 0 ][ x + 1 ];
-      int cr10 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 0 ];
-      int cr11 = (*blend_yuv_data)[ 1 ][ 1 ][ x + 1 ];
-          
-      int cb00 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 0 ];
-      int cb01 = (*blend_yuv_data)[ 2 ][ 0 ][ x + 1 ];
-      int cb10 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 0 ];
-      int cb11 = (*blend_yuv_data)[ 2 ][ 1 ][ x + 1 ];
-
-      /* are all pixels completely opaque? */
-      if (o >= 4*0xf) {
-        /* set the output chroma to the average of the four pixels */
-        *dst_cr = (cr00 + cr01 + cr10 + cr11) / 4;
-        *dst_cb = (cb00 + cb01 + cb10 + cb11) / 4;
-      } else {
-        /* calculate transparency of background over the four pixels */
-        int t4 = 4*0xf - o;
-
-        /* blend the output chroma to the average of the four pixels */
-        /* for explanation of the used equation, see blend_yuy2_exact() */
-        *dst_cr = ((*dst_cr * t4 + cr00 * o00 + cr01 * o01 + cr10 * o10 + cr11 * o11) * (0x1111+1)) >> 18;
-        *dst_cb = ((*dst_cb * t4 + cb00 * o00 + cb01 * o01 + cb10 * o10 + cb11 * o11) * (0x1111+1)) >> 18;
-      }
-    }
-
-    /* next chroma sample */
-    dst_cr++;
-    dst_cb++;
   }
 }
 
# HG changeset patch
# User Petri Hintukainen <phintuka@users.sourceforge.net>
# Date 1177788625 -10800
# Node ID fed19a7cf1ed70a50e953f8e690f7a31627413b9
# Parent  9193085ea70a9e8642e2287be606dffce7e3db1e
MMX implementation of mem_blend8 function

diff -r 9193085ea70a -r fed19a7cf1ed src/xine-engine/alphablend.c
--- a/src/xine-engine/alphablend.c	Sat Apr 28 22:26:43 2007 +0300
+++ b/src/xine-engine/alphablend.c	Sat Apr 28 22:30:25 2007 +0300
@@ -50,6 +50,66 @@ static void mem_blend8(uint8_t *mem, uin
     mem++;
   }
 }
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static inline void mem_blend8_mmx(uint8_t *mem, int val, int o, size_t sz)
+{
+  uint8_t *limit = mem + sz;
+
+  if (sz > 3) {
+    /*
+     * MMX register allocation:
+     *   mm7: zero
+     *   mm6: division approximation constant (4x 0x1112)
+     *   mm5: alpha for destination (4x 0xf-A)
+     *   mm4: overlay * overlay alpha
+     */
+    static const mmx_t max_o  = {uq:0x000f000f000f000fULL};
+    static const mmx_t div_0f = {uq:0x1112111211121112ULL};
+
+    uint8_t *limit4 = mem + (sz & (~3));
+
+    movd_a2r (o, mm0);          /* mm0 =     0     0     0     A  */
+    pxor_r2r (mm7, mm7);        /* mm7 =     0     0     0     0  */
+
+    movd_a2r (val, mm4);        /* mm4 =     0     0     0     V  */
+# if defined(__SSE__)
+    pshufw_r2r (mm0, mm0, 0);   /* mm0 =     A     A     A     A  */
+ 
+    movq_m2r (max_o, mm5);      /* mm5 =   0xf   0xf   0xf   0xf  */
+    pshufw_r2r (mm4, mm4, 0);   /* mm4 =     V     V     V     V  */
+    
+    psubw_r2r (mm0, mm5);       /* mm5 = 0xf-A 0xf-A 0xf-A 0xf-A  */   
+# else
+    punpcklwd_r2r (mm0, mm0);   /* mm0 =     0     0     A     A  */
+
+    movq_m2r (max_o, mm5);      /* mm5 =   0xf   0xf   0xf   0xf  */
+    punpcklwd_r2r (mm0, mm0);   /* mm0 =     A     A     A     A  */
+    
+    punpcklwd_r2r (mm4, mm4);   /* mm4 =     0     0     V     V  */
+    psubw_r2r (mm0, mm5);       /* mm5 = 0xf-A 0xf-A 0xf-A 0xf-A  */
+    
+    punpcklwd_r2r (mm4, mm4);   /* mm4 =     V     V     V     V  */
+# endif
+    movq_m2r (div_0f, mm6);     /* mm6 = x1112 x1112 x1112 x1112  */
+    pmullw_r2r (mm0, mm4);      /* mm4 =   V*A   V*A   V*A   V*A  */
+    
+    for ( ; mem < limit4 ; mem += 4) {
+      movd_m2r (*mem, mm0);     /* mm0 =                 D3  D2  D1  D0  */
+      punpcklbw_r2r (mm7, mm0); /* mm0 =     D3      D2      D1      D0  */
+      pmullw_r2r (mm5, mm0);    /* mm0 =  D3*iA   D2*iA   D1*iA   D0*iA  */
+      paddw_r2r (mm4, mm0);     /* mm0 = 4x (V*A + D*iA)                 */
+      pmulhw_r2r (mm6, mm0);    /* mm0 =     R3      R2      R1      R0  */
+      packuswb_r2r (mm0, mm0);  /* mm0 =                 R3  R2  R1  R0 */
+      movd_r2m (mm0, *mem);     /* store */
+    }
+    sz &= 3;
+  }
+  
+  for (; mem < limit; mem++)
+    *mem = BLEND_BYTE(*mem, val, o);
+}
+#endif
 
 static void mem_blend16(uint16_t *mem, uint16_t clr, uint8_t o, int len) {
   uint16_t *limit = mem + len;
# HG changeset patch
# User Petri Hintukainen <phintuka@users.sourceforge.net>
# Date 1177788675 -10800
# Node ID 303a8ebe44c45eb5979271915b6ce3133ee1fdb8
# Parent  fed19a7cf1ed70a50e953f8e690f7a31627413b9
MMX implementation of mem_blend32 function

diff -r fed19a7cf1ed -r 303a8ebe44c4 src/xine-engine/alphablend.c
--- a/src/xine-engine/alphablend.c	Sat Apr 28 22:30:25 2007 +0300
+++ b/src/xine-engine/alphablend.c	Sat Apr 28 22:31:15 2007 +0300
@@ -148,6 +148,56 @@ static void mem_blend32(uint8_t *mem, co
     mem++;
   }
 }
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static inline void mem_blend32_mmx(uint8_t *mem, const uint8_t *src, int o, int len) {
+  /*
+   * MMX register allocation:
+   *   mm7: zero
+   *   mm6: division approximation constant (4x 0x1112)
+   *   mm5: alpha for destination (4x 0xf-A)
+   *   mm4: overlay * overlay alpha
+   */
+  static const mmx_t max_o  = {uq:0x000f000f000f000fULL};
+  static const mmx_t div_0f = {uq:0x1112111211121112ULL};
+
+  uint8_t *limit = mem + len*4;
+
+  movd_a2r (o, mm0);          /* mm0 =     0     0     0     A  */
+  pxor_r2r (mm7, mm7);        /* mm7 =     0     0     0     0 */
+
+# if defined(__SSE__)
+  movd_m2r (*src, mm4);       /* mm4 =             V3 V2 V1 V0  */
+  pshufw_r2r (mm0, mm0, 0);   /* mm0 =     A     A     A     A  */
+
+  movq_m2r (div_0f, mm6);     /* mm6 = x1112 x1112 x1112 x1112  */
+  punpcklbw_r2r (mm7, mm4);   /* mm4 =    V3    V2    V1    V0  */
+# else 
+  punpcklwd_r2r (mm0, mm0);   /* mm0 =     0     0     A     A  */
+  movd_m2r (*src, mm4);       /* mm4 =             V3 V2 V1 V0  */
+
+  punpcklbw_r2r (mm7, mm4);   /* mm4 =    V3    V2    V1    V0  */
+  movq_m2r (div_0f, mm6);     /* mm6 = x1112 x1112 x1112 x1112  */
+
+  punpcklwd_r2r (mm0, mm0);   /* mm0 =     A     A     A     A  */
+# endif
+  movq_m2r (max_o, mm5);      /* mm5 =   0xf   0xf   0xf   0xf  */
+    
+  psubw_r2r (mm0, mm5);       /* mm5 =   f-A   f-A   f-A   f-A  */   
+  pmullw_r2r (mm0, mm4);      /* mm4 =  V3*A  V2*A  V1*A  V0*A  */
+
+  for ( ; mem < limit ; mem += 4) {
+    movd_m2r (*mem, mm0);     /* mm0 =                D3  D2  D1  D0  */
+    punpcklbw_r2r (mm7, mm0); /* mm0 =    D3      D2      D1      D0  */
+    pmullw_r2r (mm5, mm0);    /* mm0 = D3*iA   D2*iA   D1*iA   D0*iA  */
+    paddw_r2r (mm4, mm0);     /* mm0 = 4x (V*A + D*iA)                */
+    pmulhw_r2r (mm6, mm0);    /* div by 0xf                           */
+                              /* mm0 =    R3      R2      R1      R0  */
+    packuswb_r2r (mm0, mm0);  /* mm0 =                R3  R2  R1  R0  */
+    movd_r2m (mm0, *mem);     /* store                                */
+  }
+}
+#endif
 
 static void blend_yuv_exact(uint8_t *dst_cr, uint8_t *dst_cb, int src_width, 
                             uint8_t *(*blend_yuv_data)[ 3 ][ 2 ])
# HG changeset patch
# User Petri Hintukainen <phintuka@users.sourceforge.net>
# Date 1177788755 -10800
# Node ID fc3162d00e5a4daa79378041dce26f12669f4be7
# Parent  303a8ebe44c45eb5979271915b6ce3133ee1fdb8
MMX implementation of blend_yuv_exact function

diff -r 303a8ebe44c4 -r fc3162d00e5a src/xine-engine/alphablend.c
--- a/src/xine-engine/alphablend.c	Sat Apr 28 22:31:15 2007 +0300
+++ b/src/xine-engine/alphablend.c	Sat Apr 28 22:32:35 2007 +0300
@@ -248,6 +248,137 @@ static void blend_yuv_exact(uint8_t *dst
     dst_cb++;
   }
 }
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static inline void blend_yuv_exact_mmx(uint8_t *dst_cr, uint8_t *dst_cb, int src_width,
+				       uint8_t *(*blend_yuv_data)[ 3 ][ 2 ], int safe)
+{
+  static const mmx_t lobytes = {uq:0x00ff00ff00ff00ff};
+  static const mmx_t div_3c  = {uq:0x0445044504450445};
+  static const mmx_t max4a   = {uq:0x003c003c003c003c};
+  int xovl = 0, xdst = 0;
+  
+  /*
+    The code blends pixels beyond the overlay's right border
+    if the safe flag is set (and the overlay width is not
+    a multiple of 8).
+    We don't care, as it has no effect on the result:
+    the few extra pixels in blend_yuv_data are already marked
+    as transparent, so those pixels in the frame stay untouched.
+    Of course, the remaining pixels could be blended one by one,
+    but that is slower and not necessary.
+  */
+
+  if (!safe) {
+    safe = src_width & 7;
+    src_width -= safe;
+  }
+
+  while (xovl < src_width) {
+
+    /* 1. lines */
+
+    movq_m2r (((*blend_yuv_data)[ 0 ][ 0 ][ xovl ]), mm0); /* load 8x alpha, line 1 */
+    movq_m2r (lobytes, mm7);
+    movq_r2r (mm0, mm4);      
+    pand_r2r (mm7, mm0);      /* mm0: a06  a04  a02  a00 */
+    psrlw_i2r (8, mm4);       /* mm4: a07  a05  a03  a01 */
+    /* used mm: 0, 4 */
+
+    movq_m2r (((*blend_yuv_data)[ 1 ][ 0 ][ xovl ]), mm1); /* load 8x cr,    line 1 */
+    movq_r2r (mm1, mm5);
+    pand_r2r (mm7, mm1)    ;  /* mm1: [ r06  r04  r02  r00 ]  */
+    psrlw_i2r (8, mm5);       /* mm5: [ r07  r05  r03  r01 ]  */
+    /* used mm: 0, 1, 4, 5 */
+
+    movq_m2r (((*blend_yuv_data)[ 2 ][ 0 ][ xovl ]), mm2); /* load 8x cb,    line 1 */
+    movq_r2r (mm2, mm6);
+    pand_r2r (mm7, mm2);      /* mm2: [ b06  b04  b02  b00 ] */
+    psrlw_i2r (8, mm6);       /* mm6: [ b07  b05  b03  b01 ] */
+    /* used mm: 0, 1, 2, 4, 5, 6 */
+
+    pmullw_r2r (mm0, mm1);    /* mm1: [ r06*a06  r04*a04  r02*a02  r00*a00 ]  */
+    pmullw_r2r (mm4, mm5);    /* mm5: [ r07*a07  r05*a05  r03*a03  r01*a01 ]  */
+    pmullw_r2r (mm0, mm2);    /* mm2: [ b06*a06  b04*a04  b02*a02  b00*a00 ]  */
+    pmullw_r2r (mm4, mm6);    /* mm6: [ b07*a07  b05*a05  b03*a03  b01*a01 ]  */
+
+    paddw_r2r (mm4, mm0);     /* mm0: [ a07+a06  a05+a04  a03+a02  a01+a00 ]  */
+    paddw_r2r (mm5, mm1);     /* mm1: [ r07*a07+r06*a06  r05*a05+r04*a04 ... ] */
+    paddw_r2r (mm6, mm2);     /* mm2: [ b07*a07+b06*a06  b05*a05+b04*a04 ... ] */
+    /* used mm: 0, 1, 2 */
+
+    /* 2. lines */
+
+    movq_m2r (((*blend_yuv_data)[ 0 ][ 1 ][ xovl ]), mm3); /* load 8x alpha, line 2 */
+    movq_r2r (mm3, mm4);
+    pand_r2r (mm7, mm3);      /* mm3: A16  A14  A12  A10 */
+    psrlw_i2r (8, mm4);       /* mm4: A17  A15  A13  A11 */
+
+    movq_m2r (((*blend_yuv_data)[ 1 ][ 1 ][ xovl ]), mm5); /* load 8x cr,    line 2 */
+    movq_r2r (mm5, mm6);
+    pand_r2r (mm7, mm5);      /* mm5: r16  r14  r12  r10 */
+    psrlw_i2r (8, mm6);       /* mm6: r17  r15  r13  r11 */
+
+    pmullw_r2r (mm3, mm5);    /* mm5: [ r16*a16  r14*a14  r12*a12  r10*a10 ]  */
+    pmullw_r2r (mm4, mm6);    /* mm6: [ r17*a17  r15*a15  r13*a13  r11*a11 ]  */
+    paddw_r2r (mm6, mm5);     /* mm5: r*A pairs, line 2 */
+
+    movq_m2r (((*blend_yuv_data)[ 2 ][ 1 ][ xovl ]), mm6); /* load 8x cb,    line 2 */
+    movq_r2r (mm6, mm7);
+    pand_m2r (lobytes, mm6);  /* mm6: b16  b14  b12  b10 */
+    psrlw_i2r (8, mm7);       /* mm7: b17  b15  b13  b11 */
+
+    pmullw_r2r (mm3, mm6);    /* mm6: [ b16*a16  b14*a14  b12*a12  b10*a10 ]  */
+    pmullw_r2r (mm4, mm7);    /* mm7: [ b17*a17  b15*a15  b13*a13  b11*a11 ]  */
+    paddw_r2r (mm7, mm6);     /* mm6: b*A pairs, line 2 */
+
+    paddw_r2r (mm4, mm3);     /* mm3: [ a17+a16  a15+a14  a13+a12  a11+a10 ]  */
+
+    paddw_r2r (mm3, mm0);     /* mm0: A sums over 4 pixels */
+    paddw_r2r (mm5, mm1);     /* mm1: r*A sums over 4 pixels */
+    paddw_r2r (mm6, mm2);     /* mm2: b*A sums over 4 pixels */
+
+    movq_m2r (max4a, mm5);    /* mm5: 4x 4*0xf */
+    psubw_r2r (mm0, mm5);     /* mm5: 4*0xf - A sums (background transparency) */
+
+    /* dst */
+
+    movd_m2r (dst_cr[xdst], mm3);
+    movd_m2r (dst_cb[xdst], mm4);
+    pxor_r2r (mm7, mm7);
+    punpcklbw_r2r (mm7, mm3); /* bytes -> words */
+    punpcklbw_r2r (mm7, mm4);
+
+    pmullw_r2r (mm5, mm3);    /*  *= (1-a) */
+    xovl += 8;
+    pmullw_r2r (mm5, mm4);
+
+    movq_m2r (div_3c, mm6);
+
+    paddw_r2r (mm3, mm1);     /* blend */
+    paddw_r2r (mm4, mm2);
+
+    pmulhw_r2r (mm6, mm1);    /* div by 4*0xf */
+    xdst += 4;
+    pmulhw_r2r (mm6, mm2);
+
+    packuswb_r2r (mm1, mm1);  /* words -> bytes */
+    packuswb_r2r (mm2, mm2);
+
+    movd_r2m (mm1, dst_cr[xdst-4]); /* store */
+    movd_r2m (mm2, dst_cb[xdst-4]);
+  }
+
+  if (safe) {
+    /* near frame corner, so do last bytes individually */
+    uint8_t *tmp[ 3 ][ 2 ] = {
+      { (*blend_yuv_data)[ 0 ][ 0 ] + xovl, (*blend_yuv_data)[ 0 ][ 1 ] + xovl },
+      { (*blend_yuv_data)[ 1 ][ 0 ] + xovl, (*blend_yuv_data)[ 1 ][ 1 ] + xovl },
+      { (*blend_yuv_data)[ 2 ][ 0 ] + xovl, (*blend_yuv_data)[ 2 ][ 1 ] + xovl }};
+    blend_yuv_exact(dst_cr + xdst, dst_cb + xdst, safe, &tmp);
+  }
+}
+#endif
 
 /*
  * Some macros for fixed point arithmetic.
# HG changeset patch
# User Petri Hintukainen <phintuka@users.sourceforge.net>
# Date 1177788891 -10800
# Node ID 6400257f5e054927474e984e686cdb6586d71e7b
# Parent  fc3162d00e5a4daa79378041dce26f12669f4be7
Added blend_yuy2_y: alphablend Y component of yuy2
Added MMX version of blend_yuy2_y function

diff -r fc3162d00e5a -r 6400257f5e05 src/xine-engine/alphablend.c
--- a/src/xine-engine/alphablend.c	Sat Apr 28 22:32:35 2007 +0300
+++ b/src/xine-engine/alphablend.c	Sat Apr 28 22:34:51 2007 +0300
@@ -195,6 +195,91 @@ static inline void mem_blend32_mmx(uint8
                               /* mm0 =    R3      R2      R1      R0  */
     packuswb_r2r (mm0, mm0);  /* mm0 =                R3  R2  R1  R0  */
     movd_r2m (mm0, *mem);     /* store                                */
+  }
+}
+#endif
+
+static void blend_yuy2_y(uint8_t *mem, uint8_t val, uint8_t o, int len) {
+  uint8_t *limit = mem + len*2;
+  while (mem < limit) {
+    *mem = BLEND_BYTE(*mem, val, o);
+    mem += 2;
+  }
+}
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static inline void blend_yuy2_y_mmx(uint8_t *dst, int y, int o, size_t sz)
+{
+  int i = sz & (~3);
+
+  if (i) {
+    /*
+     * MMX register allocation:
+     *   mm7: Y byte mask       
+     *   mm6: division constant 
+     *   mm5: dst alpha (0xf-A)   
+     *   mm4: U/V bytes mask      
+     *   mm3: y*alpha             
+     *   mm2: (stored U/V values) 
+     *   mm0: (calculation)       
+     */
+    static const mmx_t y_mask  = {uq:0x00ff00ff00ff00ff};
+    static const mmx_t uv_mask = {uq:0xff00ff00ff00ff00};
+    static const mmx_t div_0f  = {uq:0x1112111211121112};
+    static const mmx_t max_o   = {uq:0x000f000f000f000f};
+
+    movd_a2r (o, mm1);        /*  mm1 =      0     0    0     o  */
+
+    movd_a2r (y, mm3);        /*  mm3 =      0     0    0     y  */
+# if defined(__SSE__)
+    pshufw_r2r (mm1, mm1, 0); /*  mm1 =     o     o     o     o  */
+
+    movq_m2r (uv_mask, mm4);  /*  mm4 = ff 00 ff 00 ff 00 ff 00  */
+    pshufw_r2r (mm3, mm3, 0); /*  mm3 =     y     y     y     y  */
+
+    movq_m2r (max_o, mm5);    /*  mm5 =   0xf   0xf   0xf   0xf  */
+    pmullw_r2r (mm1, mm3);    /*  mm3 =   y*o   y*o   y*o   y*o  */
+
+    movq_m2r (div_0f, mm6);   /*  mm6 = x1112 x1112 x1112 x1112  */
+    psubw_r2r (mm1, mm5);     /*  mm5 = 0xf-o 0xf-o 0xf-o 0xf-o  */ 
+
+    movq_m2r (y_mask, mm7);   /*  mm7 = 00 ff 00 ff 00 ff 00 ff  */
+# else
+    punpcklwd_r2r (mm1, mm1); /*  mm1 =      0     0    o     o  */
+
+    movq_m2r (max_o, mm5);    /*  mm5 =    0xf   0xf   0xf   0xf  */
+    punpcklwd_r2r (mm1, mm1); /*  mm1 =      o     o     o     o  */
+
+    movq_m2r (uv_mask, mm4);  /*  mm4 =  ff 00 ff 00 ff 00 ff 00  */
+    punpcklwd_r2r (mm3, mm3); /*  mm3 =      0     0     y     y  */
+
+    movq_m2r (div_0f, mm6);   /*  mm6 =  x1112 x1112 x1112 x1112  */
+    punpcklwd_r2r (mm3, mm3); /*  mm3 =      y     y     y     y  */
+
+    movq_m2r (y_mask, mm7);   /*  mm7 =  00 ff 00 ff 00 ff 00 ff  */
+    pmullw_r2r (mm1, mm3);    /*  mm3 =   y*o    y*o   y*o   y*o  */ 
+
+    psubw_r2r (mm1, mm5);     /*  mm5 = 0xf-o  0xf-o 0xf-o 0xf-o  */ 
+# endif
+
+    do {
+      movq_m2r (*dst, mm0);   /*  mm0 = cb y3 cr y2 cb y1 cr y0  */
+      movq_r2r (mm0, mm2);    /*  save cr/cb                     */
+      pand_r2r (mm7, mm0);    /*  mm0 =  0 y3  0 y2  0 y1  0 y0  */
+      pmullw_r2r (mm5, mm0);  /*  y[] *= (0xf-o)                 */
+      pand_r2r (mm4, mm2);    /*  drop Y from saved cr/cb vector */
+      paddw_r2r (mm3, mm0);   /*  blend                          */
+      pmulhw_r2r (mm6, mm0);  /*  div by 0xf (= (X*0x1112)>>16)  */
+      por_r2r (mm2, mm0);     /*  merge saved cr/cb back in      */
+      movq_r2m (mm0, *dst);   /*  store                          */
+      dst += 8;
+    } while (i -= 4);
+    sz &= 3;
+  }
+
+  while(sz--) {
+    *dst = BLEND_BYTE(*dst, y, o);
+    dst += 2;
   }
 }
 #endif
# HG changeset patch
# User Petri Hintukainen <phintuka@users.sourceforge.net>
# Date 1177789416 -10800
# Node ID d3c05ae7efb0f485126177bc288bc86331e044e2
# Parent  6400257f5e054927474e984e686cdb6586d71e7b
MMX compile-time detection and runtime detection: use new MMX blending functions

diff -r 6400257f5e05 -r d3c05ae7efb0 src/xine-engine/alphablend.c
--- a/src/xine-engine/alphablend.c	Sat Apr 28 22:34:51 2007 +0300
+++ b/src/xine-engine/alphablend.c	Sat Apr 28 22:43:36 2007 +0300
@@ -464,6 +464,52 @@ static inline void blend_yuv_exact_mmx(u
   }
 }
 #endif
+
+/* 
+ * MMX enabled at compile-time ? 
+ * - Use accelerated function without run-time detection
+ */
+#if defined(__MMX__) || defined(ARCH_X86_64)
+#    define mem_blend8(m,v,o,s)      mem_blend8_mmx(m,v,o,s)
+#    define mem_blend32(m,v,o,s)     mem_blend32_mmx(m,v,o,s)
+#    define blend_yuy2_y(m,v,o,s)    blend_yuy2_y_mmx(m,v,o,s)
+#    define blend_yuv_exact(r,b,w,s) blend_yuv_exact_mmx(r,b,w,s, \
+                                     ((y < src_height-2) || ((src_width+x_off) < dst_width-4)))
+#    define MMX_EXIT() emms()
+
+#elif defined(ARCH_X86)
+#    define MMX_RUNTIME_DETECT  1
+#    define mem_blend8(m,v,o,s) \
+            do { \
+	      if (mmx) mem_blend8_mmx(m,v,o,s); \
+              else mem_blend8(m,v,o,s); \
+            } while(0)
+#    define mem_blend32(m,v,o,s) \
+            do { \
+              if (mmx) mem_blend32_mmx(m,v,o,s); \
+              else mem_blend32(m,v,o,s); \
+            } while(0)
+#    define blend_yuy2_y(m,v,o,s) \
+            do { \
+              if (mmx) blend_yuy2_y_mmx(m,v,o,s); \
+              else blend_yuy2_y(m,v,o,s); \
+            } while(0)
+#    define blend_yuv_exact(r,b,w,s) \
+            do { \
+              if (mmx) blend_yuv_exact_mmx(r,b,w,s, \
+                           ((y < src_height-2) || ((src_width+x_off) < dst_width-4))); \
+              else blend_yuv_exact(r,b,w,s); \
+            } while(0)
+#    define MMX_EXIT() \
+            do { \
+              if (mmx) emms(); \
+            } while (0)
+
+#else
+#    define MMX_EXIT()
+
+#endif
+
 
 /*
  * Some macros for fixed point arithmetic.
@@ -1279,6 +1325,9 @@ void _x_blend_rgb32 (uint8_t * img, vo_o
   int hili_right, hili_left;
   int clip_right, clip_left, clip_top;
   uint8_t *img_pix;
+#if defined(MMX_RUNTIME_DETECT)
+  int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX;
+#endif
 
   dy_step = INT_TO_SCALED(dst_height) / img_height;
   x_scale = INT_TO_SCALED(img_width)  / dst_width;
@@ -1426,6 +1475,8 @@ void _x_blend_rgb32 (uint8_t * img, vo_o
       rle = rle_start;		/* y-scaling, reuse the last rle encoded line */
     }
   }
+
+  MMX_EXIT();
 }
 
 static uint8_t *(*blend_yuv_grow_extra_data(alphablend_t *extra_data, int osd_width))[ 3 ][ 2 ]
@@ -1490,6 +1541,9 @@ void _x_blend_yuv (uint8_t *dst_base[3],
   int hili_right, hili_left;
   int clip_right, clip_left, clip_top;
   uint8_t clr=0;
+#if defined(MMX_RUNTIME_DETECT)
+  int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX;
+#endif
   
   int any_line_buffered = 0;
   int exact_blend_width = ((src_width <= (dst_width - x_off)) ? src_width : (dst_width - x_off));
@@ -1781,6 +1835,8 @@ void _x_blend_yuv (uint8_t *dst_base[3],
       blend_yuv_exact(dst_cr, dst_cb, exact_blend_width, blend_yuv_data);
     }
   }
+
+  MMX_EXIT();
       
 #ifdef LOG_BLEND_YUV
   printf("overlay_blend ended\n");
@@ -1928,6 +1984,9 @@ void _x_blend_yuy2 (uint8_t * dst_img, v
   int l = 0;
   int hili_right, hili_left;
   int clip_right, clip_left, clip_top;
+#if defined(MMX_RUNTIME_DETECT)
+  int mmx = xine_mm_accel() & MM_ACCEL_X86_MMX;
+#endif
 
   union {
     uint32_t value;
@@ -2180,11 +2239,8 @@ void _x_blend_yuy2 (uint8_t * dst_img, v
                 dst++;
               }
             } else {
-              l = rle_this_bite;
-              while (l--) {
-                *dst = BLEND_BYTE(*dst, my_clut[clr].y, o);
-                dst += 2;
-              }
+              blend_yuy2_y(dst, my_clut[clr].y, o, rle_this_bite);
+              dst += rle_this_bite * 2;
             }
           }
 
@@ -2212,6 +2268,8 @@ void _x_blend_yuy2 (uint8_t * dst_img, v
 
     dst_y += dst_pitch;
   }
+
+  MMX_EXIT();
 }
 
 void _x_clear_xx44_palette(xx44_palette_t *p) 
# HG changeset patch
# User Petri Hintukainen <phintuka@users.sourceforge.net>
# Date 1177789529 -10800
# Node ID 27ed042554b84be540e27763ec91cc283b40a430
# Parent  d3c05ae7efb0f485126177bc288bc86331e044e2
Align the start of the temporary U/V/A plane buffers to an 8- or 16-byte boundary
to speed up MMX/SSE2 blending

diff -r d3c05ae7efb0 -r 27ed042554b8 src/xine-engine/alphablend.c
--- a/src/xine-engine/alphablend.c	Sat Apr 28 22:43:36 2007 +0300
+++ b/src/xine-engine/alphablend.c	Sat Apr 28 22:45:29 2007 +0300
@@ -1486,8 +1486,20 @@ static uint8_t *(*blend_yuv_grow_extra_d
     int max_width;
     uint8_t *data[ 3 ][ 2 ];
   } *header = (struct header_s *)extra_data->buffer;
-  
-  int needed_buffer_size = sizeof (*header) + osd_width * sizeof (uint8_t[ 3 ][ 2 ]);
+
+  int needed_buffer_size;
+  int header_size = sizeof(*header);
+  int alloc_width = osd_width;
+#if defined(ARCH_X86_64)
+  /* align buffers to dqword (16 bytes). It speeds up SSE2 blending. */
+  header_size = (header_size + 15 + 16) & (~15);
+  alloc_width = (alloc_width + 15) & (~15);
+#elif defined(ARCH_X86)
+  /* align buffers to qword (8 bytes). It speeds up MMX blending. */
+  header_size = (header_size + 7 + 8) & (~7);
+  alloc_width = (alloc_width + 7) & (~7);
+#endif  
+  needed_buffer_size = header_size + alloc_width * sizeof (uint8_t[ 3 ][ 2 ]);
   
   if (extra_data->buffer_size < needed_buffer_size) {
     
@@ -1506,7 +1518,15 @@ static uint8_t *(*blend_yuv_grow_extra_d
     header->id = ME_FOURCC('y', 'u', 'v', 0);
     header->max_width = osd_width;
 
-    header->data[ 0 ][ 0 ] = ((uint8_t *)extra_data->buffer) + sizeof (*header);
+#if defined(ARCH_X86_64)
+    header->data[ 0 ][ 0 ] =
+      (uint8_t*)((((unsigned long int)extra_data->buffer) + sizeof(*header) + 15) & (~15));
+#elif defined(ARCH_X86)
+    header->data[ 0 ][ 0 ] =
+      (uint8_t*)((((unsigned long int)extra_data->buffer) + sizeof(*header) + 7) & (~7));
+#else
+    header->data[ 0 ][ 0 ] = ((uint8_t *)extra_data->buffer) + sizeof(*header);
+#endif
     header->data[ 0 ][ 1 ] = header->data[ 0 ][ 0 ] + osd_width;
     header->data[ 1 ][ 0 ] = header->data[ 0 ][ 1 ] + osd_width;
     header->data[ 1 ][ 1 ] = header->data[ 1 ][ 0 ] + osd_width;
@@ -1596,13 +1616,13 @@ void _x_blend_yuv (uint8_t *dst_base[3],
     if (exact_blend_width <= 0)
       return;
   
-    blend_yuv_data = blend_yuv_grow_extra_data(extra_data, exact_blend_width_m2);
+    blend_yuv_data = blend_yuv_grow_extra_data(extra_data, exact_blend_width_m2 + 15);
     if (!blend_yuv_data)
       return;
     
     /* make linebuffer transparent */
-    memset(&(*blend_yuv_data)[ 0 ][ 0 ][ 0 ], 0, exact_blend_width_m2);
-    memset(&(*blend_yuv_data)[ 0 ][ 1 ][ 0 ], 0, exact_blend_width_m2);
+    memset(&(*blend_yuv_data)[ 0 ][ 0 ][ 0 ], 0, exact_blend_width_m2 + 15);
+    memset(&(*blend_yuv_data)[ 0 ][ 1 ][ 0 ], 0, exact_blend_width_m2 + 15);
   }
   
   rlelen=rle_remainder=0;