diff -ur libmovtar-0.1.3-orig/movtar_play.c libmovtar-0.1.3/movtar_play.c
--- libmovtar-0.1.3-orig/movtar_play.c	2005-08-20 16:24:54.000000000 -0400
+++ libmovtar-0.1.3/movtar_play.c	2005-08-20 16:48:03.000000000 -0400
@@ -173,8 +173,11 @@
 	  "punpcklbw %%mm7,%%mm0\n"// mm0: y3 y2 y1 y0 - expand to 16 bit
 	  "punpcklbw %%mm7,%%mm1\n"// mm1: cb3 cb2 cb1 cb0
 	  "punpcklbw %%mm7,%%mm2\n"// mm2: cr3 cr2 cr1 cr0
-	  "psubw te0,%%mm1\n"  //minus 128 for cb and cr
-	  "psubw te0,%%mm2\n"
+	  //"psubw te0,%%mm1\n"  //minus 128 for cb and cr
+	  //"psubw te0,%%mm2\n"
+		"psubw %4,%%mm1\n"  //minus 128 for cb and cr
+	  "psubw %4,%%mm2\n"
+
 	  "psllw $2,%%mm1\n"       // shift left 2 bits for Cr and Cb to fit the mult constants
 	  "psllw $2,%%mm2\n"
 
@@ -190,8 +193,11 @@
 	  //------------------
 	  // R    G     B  
 
-	  "pmulhw te1,%%mm3\n"// multiplicate in the constants: mm3: cb1/green cb1/blue cb0/green cb0/blue
-	  "pmulhw te2,%%mm4\n"// mm4: cr1/red cb1/green cr0/red cr0/green
+	  //"pmulhw te1,%%mm3\n"// multiplicate in the constants: mm3: cb1/green cb1/blue cb0/green cb0/blue
+	  //"pmulhw te2,%%mm4\n"// mm4: cr1/red cb1/green cr0/red cr0/green
+		 "pmulhw %5,%%mm3\n"// multiply by the constants: mm3: cb1/green cb1/blue cb0/green cb0/blue
+	  "pmulhw %6,%%mm4\n"// mm4: cr1/red cb1/green cr0/red cr0/green
+
 
 	  "movq %%mm0,%%mm5\n"      // mm5: y3 y2 y1 y0
 	  "punpcklwd %%mm5,%%mm5\n" // expand to 32 bit: y1 y1 y0 y0
@@ -223,8 +229,11 @@
 	  "punpckhwd %%mm0,%%mm0\n" //mm0 = y3 y3 y2 y2
 	  "punpckhwd %%mm1,%%mm1\n" //mm1 = cb3 cb3 cb2 cb2
 	  "punpckhwd %%mm2,%%mm2\n" //mm2 = cr3 cr3 cr2 cr2
-	  "pmulhw te1,%%mm1\n"      //mm1 = cb * ?
-	  "pmulhw te2,%%mm2\n"      //mm2 = cr * ?
+	  //"pmulhw te1,%%mm1\n"      //mm1 = cb * ?
+	  //"pmulhw te2,%%mm2\n"      //mm2 = cr * ?
+	  "pmulhw %5,%%mm1\n"      //mm1 = cb * ?
+	  "pmulhw %6,%%mm2\n"      //mm2 = cr * ?
+
 	  "movq %%mm0,%%mm3\n"      //mm3 = y3 y3 y2 y2
 	  "punpcklwd %%mm3,%%mm3\n" //mm3 = y2 y2 y2 y2
 	  "punpckhwd %%mm0,%%mm0\n" //mm0 = y3 y3 y3 y3
@@ -250,7 +259,8 @@
 	  "movq %%mm3,8%0\n"       //  save two more RGB pixels
 
 	  :"=m"(outptr[0])
-	  :"m"(inptr0[0]),"m"(inptr1[0]),"m"(inptr2[0]) //y cb cr
+	  :"m"(inptr0[0]),"m"(inptr1[0]),"m"(inptr2[0]),"m"(te0),"m"(te1),
+		"m"(te2) // %1-%3: y cb cr; %4-%6: constants te0 te1 te2
 	  : "st");
 #endif
       outptr+=16;
@@ -318,8 +328,11 @@
 	  "punpcklbw %%mm7,%%mm0\n"// mm0: y3 y2 y1 y0 - expand to 16 bit
 	  "punpcklbw %%mm7,%%mm1\n"// mm1: cb3 cb2 cb1 cb0
 	  "punpcklbw %%mm7,%%mm2\n"// mm2: cr3 cr2 cr1 cr0
-	  "psubw te0,%%mm1\n"  //minus 128 for cb and cr
-	  "psubw te0,%%mm2\n"
+	  //"psubw te0,%%mm1\n"  //minus 128 for cb and cr
+//	  "psubw te0,%%mm2\n"
+		"psubw %5,%%mm1\n"  //minus 128 for cb and cr
+	  "psubw %5,%%mm2\n"
+
 	  "psllw $2,%%mm1\n"       // shift left 2 bits for Cr and Cb to fit the mult constants
 	  "psllw $2,%%mm2\n"
 
@@ -335,8 +348,11 @@
 	  //------------------
 	  // R    G     B  
 
-	  "pmulhw te1,%%mm3\n"// multiplicate in the constants: mm3: cb1/green cb1/blue cb0/green cb0/blue
-	  "pmulhw te2,%%mm4\n"// mm4: cr1/red cb1/green cr0/red cr0/green
+	  //"pmulhw te1,%%mm3\n"// multiplicate in the constants: mm3: cb1/green cb1/blue cb0/green cb0/blue
+	  //"pmulhw te2,%%mm4\n"// mm4: cr1/red cb1/green cr0/red cr0/green
+	  "pmulhw %6,%%mm3\n"// multiply by the constants: mm3: cb1/green cb1/blue cb0/green cb0/blue
+	  "pmulhw %7,%%mm4\n"// mm4: cr1/red cb1/green cr0/red cr0/green
+
 
 	  "movq %%mm0,%%mm5\n"      // mm5: y3 y2 y1 y0
 	  "punpcklwd %%mm5,%%mm5\n" // expand to 32 bit: y1 y1 y0 y0
@@ -367,8 +383,11 @@
 	  "punpckhwd %%mm0,%%mm0\n" //mm0 = y3 y3 y2 y2
 	  "punpckhwd %%mm1,%%mm1\n" //mm1 = cb3 cb3 cb2 cb2
 	  "punpckhwd %%mm2,%%mm2\n" //mm2 = cr3 cr3 cr2 cr2
-	  "pmulhw te1,%%mm1\n"      //mm1 = cb * ?
-	  "pmulhw te2,%%mm2\n"      //mm2 = cr * ?
+//	  "pmulhw te1,%%mm1\n"      //mm1 = cb * ?
+//	  "pmulhw te2,%%mm2\n"      //mm2 = cr * ?
+	  "pmulhw %6,%%mm1\n"      //mm1 = cb * ?
+	  "pmulhw %7,%%mm2\n"      //mm2 = cr * ?
+
 	  "movq %%mm0,%%mm3\n"      //mm3 = y3 y3 y2 y2
 	  "punpcklwd %%mm3,%%mm3\n" //mm3 = y2 y2 y2 y2
 	  "punpckhwd %%mm0,%%mm0\n" //mm0 = y3 y3 y3 y3
@@ -439,18 +458,20 @@
 	  // and would replace all the workaround code below !!!! Reason: There is no packusdw !! 
           "movq %%mm5, %%mm6\n" // copy mm5
 	  "psrlq $16, %%mm6\n" // shift out pixel 1, keep pixel 0
-	  "pand shiftmask, %%mm5\n" // and out pixel 0
+	  //"pand shiftmask, %%mm5\n" // and out pixel 0
+	  "pand %4, %%mm5\n" // and out pixel 0
 	  "por %%mm6, %%mm5\n" // or pix 0 and pix 1 together
 	  "movd %%mm5, %0\n" // write pix 0 and 1 out
 
           "movq %%mm3, %%mm0\n" // copy mm3
 	  "psrlq $16, %%mm0\n" // shift out pixel 3, keep pixel 2
-	  "pand shiftmask, %%mm3\n" // and out pixel 2
+	  "pand %4, %%mm3\n" // and out pixel 2
 	  "por %%mm0, %%mm3\n" // or pix 3 and pix 2 together
 	  "movd %%mm3, 4%0\n" // write pix 2 and 3
 
 	  :"=m"(outptr[0])
-	  :"m"(inptr0[0]),"m"(inptr1[0]),"m"(inptr2[0]) //y cb cr
+	  :"m"(inptr0[0]),"m"(inptr1[0]),"m"(inptr2[0]),"m"(shiftmask),"m"(te0),
+			"m"(te1), "m"(te2) // %1-%3: y cb cr; %4: shiftmask; %5-%7: te0 te1 te2
 	  : "st");
 #endif
       outptr+=8;