diff -ur koules1.4/koules.sndsrv.linux.c koules1.4-gcc3/koules.sndsrv.linux.c
--- koules1.4/koules.sndsrv.linux.c	1998-03-04 19:59:19.000000000 +0100
+++ koules1.4-gcc3/koules.sndsrv.linux.c	2003-04-23 01:15:16.000000000 +0200
@@ -136,7 +136,7 @@
    Eventually I'll look at the koules signal handlers and just trap
    this. */
 
-int
+void
 do_nothing (void)
 {
   fprintf (stderr, "koules.sndsrv: doing nothing, something is broken\n");
diff -ur koules1.4/xlib/inlstring.h koules1.4-gcc3/xlib/inlstring.h
--- koules1.4/xlib/inlstring.h	1998-03-04 19:59:19.000000000 +0100
+++ koules1.4-gcc3/xlib/inlstring.h	2003-04-23 00:53:56.000000000 +0200
@@ -1,292 +1,348 @@
- /* Based on functions in linux/string.h */
-
+#ifndef INLSTRING_H
+#define INLSTRING_H
+#include <stddef.h>		/* for size_t */
 #if !defined(__386__)||!defined(ASSEMBLY)
 #define __memcpy(dst,src,n) memcpy((dst),(src),(n))
-#define __memcpy_conventioanl(dst,src,n) memcpy((dst),(src),(n))
+#define __memcpy_conventional(dst,src,n) memcpy((dst),(src),(n))
 #define __memcpyb(dst,src,n) memcpy((dst),(src),(n))
 #define __memsetb(dst,c,n) memset((dst),(c),(n))
-#define __memsetlong(dst,c,n) memset((dst),(c),(n))
 #define __memset(dst,c,n) memset((dst),(c),(n))
-#define __memset2(dst,c,n) memset((dst),(c),2*(n))
-#define __memset3(dst,c,n) memset((dst),(c),3*(n))
+
+static inline void *__memsetlong(void *s, long c, size_t count) {
+  long *p=s;
+  int i;
+  for(i=0;i<count;i++)
+    *p++=c;
+  return s;
+}
+
+static inline void *__memset2(void *s, short c, size_t count) {
+  short *p=s;
+  int i;
+  for(i=0;i<count;i++)
+    *p++=c;
+  return s;
+}
+
+static inline void *__memset3(void *s, int c, size_t count) {
+  unsigned char *p=s;
+  int i;
+  for(i=0;i<count;i++) {
+    *p++=c&0xff;
+    *p++=(c>>8)&0xff;
+    *p++=(c>>16)&0xff;
+  }
+  return s;
+}
 #else
-#include <stddef.h>		/* for size_t */
-static INLINE void *
-__memcpy_conventional (void *to, const void *from, size_t n)
+static inline void *
+  __memcpy_conventional(void *to, const void *from, size_t n)
 {
-  __asm__ ("cld\n\t"
-	   "movl %%edi,%%ecx\n\t"
-	   "andl $1,%%ecx\n\t"
-	   "subl %%ecx,%%edx\n\t"
-	   "rep ; movsb\n\t"	/* 16-bit align destination */
-	   "movl %%edx,%%ecx\n\t"
-	   "shrl $2,%%ecx\n\t"
-	   "rep ; movsl\n\t"
-	   "testb $1,%%dl\n\t"
-	   "je 1f\n\t"
-	   "movsb\n"
-	   "1:\ttestb $2,%%dl\n\t"
-	   "je 2f\n\t"
-	   "movsw\n"
-	   "2:\n"
-: : "d" (n), "D" ((long) to), "S" ((long) from)
-: "cx", "dx", "di", "si");
-  return (to);
+  int dummy1;
+  long dummy2, dummy3;
+  __asm__ __volatile__("cld\n\t"
+		       "cmpl $0,%%edx\n\t"
+		       "jle 2f\n\t"
+		       "movl %%edi,%%ecx\n\t"
+		       "andl $1,%%ecx\n\t"
+		       "subl %%ecx,%%edx\n\t"
+		       "rep ; movsb\n\t"	/* 16-bit align destination */
+		       "movl %%edx,%%ecx\n\t"
+		       "shrl $2,%%ecx\n\t"
+		       "jz 3f\n\t"
+		       "rep ; movsl\n\t"
+		       "3:\n\t"
+		       "testb $1,%%dl\n\t"
+		       "je 1f\n\t"
+		       "movsb\n"
+		       "1:\ttestb $2,%%dl\n\t"
+		       "je 2f\n\t"
+		       "movsw\n"
+		       "2:\n"
+		       : "=d"(dummy1), "=D"(dummy2), "=S"(dummy3)	/* fake output */
+		       : "0"(n), "1"((long) to), "2"((long) from)
+		       : "cx"/***rjr***, "dx", "di", "si"***/
+    );
+  return (to);
 }
 
-static INLINE void *
-__memcpyb (void *to, const void *from, size_t n)
+static inline void *
+  __memcpyb(void *to, const void *from, size_t n)
 {
-  __asm__ ("cld\n\t"
-	   "rep ; movsb\n\t"
-: : "c" (n), "D" ((long) to), "S" ((long) from)
-: "cx", "di", "si");
-  return (to);
+  int dummy1;
+  long dummy2, dummy3;
+  __asm__ __volatile__("cld\n\t"
+		       "rep ; movsb\n\t"
+		       : "=c"(dummy1), "=D"(dummy2), "=S"(dummy3)	/* fake output */
+		       : "0"(n), "1"((long) to), "2"((long) from)
+		       /***rjr***: "cx", "di", "si"***/
+    );
+  return (to);
 }
 
-static INLINE void *
-__memsetb (void *s, char c, size_t count)
+static inline void *
+  __memsetb(void *s, char c, size_t count)
 {
-  __asm__ ("cld\n\t"
-	   "rep\n\t"
-	   "stosb"
-: : "a" (c), "D" (s), "c" (count)
-: "cx", "di");
-  return s;
+  __asm__("cld\n\t"
+	  "rep\n\t"
+	  "stosb"
+	  : : "a"(c), "D"(s), "c"(count)
+	  : "cx", "di");
+  return s;
 }
 
-static INLINE void *
-__memsetlong (void *s, unsigned c, size_t count)
+static inline void *
+  __memsetlong(void *s, unsigned c, size_t count)
 {
-  __asm__ ("cld\n\t"
-	   "rep\n\t"
-	   "stosl"
-: : "a" (c), "D" (s), "c" (count)
-: "cx", "di");
-  return s;
+  long dummy1;
+  int dummy2;
+  __asm__ __volatile__("cld\n\t"
+		       "rep\n\t"
+		       "stosl"
+		       : "=D"(dummy1), "=c"(dummy2)	/* fake outputs */
+		       : "a"(c), "0"(s), "1"(count)
+		       /***rjr***: "cx", "di"***/
+    );
+  return s;
 }
 
-static INLINE void *
-__memset (void *s, char c, size_t count)
+static inline void *
+  __memset(void *s, char c, size_t count)
 {
-  __asm__ (
-	    "cld\n\t"
-	    "cmpl $12,%%edx\n\t"
-	    "jl 1f\n\t"		/* if (count >= 12) */
-
-	    "movzbl %%al,%%ax\n\t"
-	    "movl %%eax,%%ecx\n\t"
-	    "shll $8,%%ecx\n\t"	/* c |= c << 8 */
-	    "orl %%ecx,%%eax\n\t"
-	    "movl %%eax,%%ecx\n\t"
-	    "shll $16,%%ecx\n\t"	/* c |= c << 16 */
-	    "orl %%ecx,%%eax\n\t"
-
-	    "movl %%edx,%%ecx\n\t"
-	    "negl %%ecx\n\t"
-	    "andl $3,%%ecx\n\t"	/* (-s % 4) */
-	    "subl %%ecx,%%edx\n\t"	/* count -= (-s % 4) */
-	    "rep ; stosb\n\t"	/* align to longword boundary */
-
-	    "movl %%edx,%%ecx\n\t"
-	    "shrl $2,%%ecx\n\t"
-	    "rep ; stosl\n\t"	/* fill longwords */
-
-	    "andl $3,%%edx\n"	/* fill last few bytes */
-	    "1:\tmovl %%edx,%%ecx\n\t"	/* <= 12 entry point */
-	    "rep ; stosb\n\t"
-: : "a" (c), "D" (s), "d" (count)
-: "ax", "cx", "dx", "di");
-  return s;
+  int dummy1;
+  long dummy2;
+  int dummy3;
+  __asm__ __volatile__(
+		       "cld\n\t"
+		       "cmpl $12,%%edx\n\t"
+		       "jl 1f\n\t"	/* if (count >= 12) */
+
+		       "movzbl %%al,%%ax\n\t"
+		       "movl %%eax,%%ecx\n\t"
+		       "shll $8,%%ecx\n\t"	/* c |= c << 8 */
+		       "orl %%ecx,%%eax\n\t"
+		       "movl %%eax,%%ecx\n\t"
+		       "shll $16,%%ecx\n\t"	/* c |= c << 16 */
+		       "orl %%ecx,%%eax\n\t"
+
+		       "movl %%edx,%%ecx\n\t"
+		       "negl %%ecx\n\t"
+		       "andl $3,%%ecx\n\t"	/* (-s % 4) */
+		       "subl %%ecx,%%edx\n\t"	/* count -= (-s % 4) */
+		       "rep ; stosb\n\t"	/* align to longword boundary */
+
+		       "movl %%edx,%%ecx\n\t"
+		       "shrl $2,%%ecx\n\t"
+		       "rep ; stosl\n\t"	/* fill longwords */
+
+		       "andl $3,%%edx\n"	/* fill last few bytes */
+		       "1:\tmovl %%edx,%%ecx\n\t"	/* <= 12 entry point */
+		       "rep ; stosb\n\t"
+		       : "=a"(dummy1), "=D"(dummy2), "=d"(dummy3)	/* fake outputs */
+		       : "0"(c), "1"(s), "2"(count)
+		       : /***rjr***"ax",*/ "cx"/*, "dx", "di"*/
+    );
+  return s;
 }
 
-static INLINE void *
-__memset2 (void *s, short c, size_t count)
+static inline void *
+  __memset2(void *s, short c, size_t count)
 /* count is in 16-bit pixels */
 /* s is assumed to be 16-bit aligned */
 {
-  __asm__ (
-	    "cld\n\t"
-	    "cmpl $12,%%edx\n\t"
-	    "jl 1f\n\t"		/* if (count >= 12) */
-
-	    "movzwl %%ax,%%eax\n\t"
-	    "movl %%eax,%%ecx\n\t"
-	    "shll $16,%%ecx\n\t"	/* c |= c << 16 */
-	    "orl %%ecx,%%eax\n\t"
-
-	    "movl %%edi,%%ecx\n\t"
-	    "andl $2,%%ecx\n\t"	/* s & 2 */
-	    "jz 2f\n\t"
-	    "decl %%edx\n\t"	/* count -= 1 */
-	    "stosw\n\t"		/* align to longword boundary */
-
-	    "2:\n\t"
-	    "movl %%edx,%%ecx\n\t"
-	    "shrl $1,%%ecx\n\t"
-	    "rep ; stosl\n\t"	/* fill longwords */
-
-	    "andl $1,%%edx\n"	/* one 16-bit word left? */
-	    "jz 3f\n\t"		/* no, finished */
-	    "1:\tmovl %%edx,%%ecx\n\t"	/* <= 12 entry point */
-	    "rep ; stosw\n\t"
-	    "3:\n\t"
-: : "a" (c), "D" (s), "d" (count)
-: "ax", "cx", "dx", "di");
-  return s;
+  int dummy1;
+  long dummy2;
+  int dummy3;
+  __asm__ __volatile__(
+		       "cld\n\t"
+		       "cmpl $12,%%edx\n\t"
+		       "jl 1f\n\t"	/* if (count >= 12) */
+
+		       "movzwl %%ax,%%eax\n\t"
+		       "movl %%eax,%%ecx\n\t"
+		       "shll $16,%%ecx\n\t"	/* c |= c << 16 */
+		       "orl %%ecx,%%eax\n\t"
+
+		       "movl %%edi,%%ecx\n\t"
+		       "andl $2,%%ecx\n\t"	/* s & 2 */
+		       "jz 2f\n\t"
+		       "decl %%edx\n\t"	/* count -= 1 */
+		       "movw %%ax,(%%edi)\n\t"	/* align to longword boundary */
+		       "addl $2,%%edi\n\t"
+
+		       "2:\n\t"
+		       "movl %%edx,%%ecx\n\t"
+		       "shrl $1,%%ecx\n\t"
+		       "rep ; stosl\n\t"	/* fill longwords */
+
+		       "andl $1,%%edx\n"	/* one 16-bit word left? */
+		       "jz 3f\n\t"	/* no, finished */
+		       "1:\tmovl %%edx,%%ecx\n\t"	/* <= 12 entry point */
+		       "rep ; stosw\n\t"
+		       "3:\n\t"
+		       : "=a"(dummy1), "=D"(dummy2), "=d"(dummy3)	/* fake outputs */
+		       : "0"(c), "1"(s), "2"(count)
+		       : /***rjr***"ax",*/ "cx"/*, "dx", "di"*/
+    );
+  return s;
 }
 
-static INLINE void *
-__memset3 (void *s, int c, size_t count)
+static inline void *
+  __memset3(void *s, int c, size_t count)
 /* count is in 24-bit pixels (3 bytes per pixel) */
 {
-  __asm__ (
-	    "cmpl $8,%%edx\n\t"
-	    /* "jmp 2f\n\t" *//* debug */
-	    "jl 2f\n\t"
-
-	    "movl %%eax,%%ebx\n\t"	/* eax = (low) BGR0 (high) */
-	    "shll $24,%%ebx\n\t"	/* ebx = 000B */
-	    "orl %%ebx,%%eax\n\t"	/* eax = BGRB */
-
-	    "movl %%eax,%%ebx\n\t"
-	    "shrl $8,%%ebx\n\t"	/* ebx = GRB0 */
-	    "movl %%ebx,%%ecx\n\t"
-	    "shll $24,%%ecx\n\t"	/* ecx = 000G */
-	    "orl %%ecx,%%ebx\n\t"	/* ebx = GRBG */
-
-	    "movl %%eax,%%ecx\n\t"
-	    "shll $8,%%ecx\n\t"	/* ecx = 0BGR */
-	    "movb %%bh,%%cl\n\t"	/* ecx = RBGR */
-
-	    "cmpl $16,%%edx\n\t"
-	    "jl 1f\n\t"
-	    "jmp 5f\n\t"
-	    ".align 4,0x90\n\t"
-
-	    "5:\n\t"		/* loop unrolling */
-	    "movl %%eax,(%%edi)\n\t"	/* write BGRB */
-	    "movl %%ebx,4(%%edi)\n\t"	/* write GRBG */
-	    "movl %%ecx,8(%%edi)\n\t"	/* write RBGR */
-	    "movl %%eax,12(%%edi)\n\t"
-	    "movl %%ebx,16(%%edi)\n\t"
-	    "movl %%ecx,20(%%edi)\n\t"
-	    "movl %%eax,24(%%edi)\n\t"
-	    "movl %%ebx,28(%%edi)\n\t"
-	    "movl %%ecx,32(%%edi)\n\t"
-	    "movl %%eax,36(%%edi)\n\t"
-	    "subl $16,%%edx\n\t"	/* blend end-of-loop instr. */
-	    "movl %%ebx,40(%%edi)\n\t"
-	    "movl %%ecx,44(%%edi)\n\t"
-	    "addl $48,%%edi\n\t"
-	    "cmpl $16,%%edx\n\t"
-	    "jge 5b\n\t"
-	    "andl %%edx,%%edx\n\t"
-	    "jz 4f\n\t"		/* finished */
-	    "cmpl $4,%%edx\n\t"
-	    "jl 2f\n\t"		/* less than 4 pixels left */
-	    "jmp 1f\n\t"
-	    ".align 4,0x90\n\t"
-
-	    "1:\n\t"
-	    "movl %%eax,(%%edi)\n\t"	/* write BGRB */
-	    "movl %%ebx,4(%%edi)\n\t"	/* write GRBG */
-	    "movl %%ecx,8(%%edi)\n\t"	/* write RBGR */
-	    "addl $12,%%edi\n\t"
-	    "subl $4,%%edx\n\t"
-	    "cmpl $4,%%edx\n\t"
-	    "jge 1b\n\t"
-
-	    "2:\n\t"
-	    "cmpl $0,%%edx\n\t"	/* none left? */
-	    "jle 4f\n\t"	/* finished */
-
-	    "mov %%eax,%%ecx\n\t"
-	    "shrl $16,%%ecx\n\t"	/* B in cl */
-
-	    "3:\n\t"		/* write last few pixels */
-	    "movw %%ax,(%%edi)\n\t"	/* write RG */
-	    "movb %%cl,2(%%edi)\n\t"	/* write B */
-	    "addl $3,%%edi\n\t"
-	    "decl %%edx\n\t"
-	    "jnz 3b\n\t"
-
-	    "4:\n\t"
-: : "a" (c), "D" (s), "d" (count)
-: "ax", "bx", "cx", "dx", "di");
-  return s;
+  int dummy1;
+  long dummy2;
+  int dummy3;
+  __asm__ __volatile__(
+		       "cmpl $8,%%edx\n\t"
+		       /* "jmp 2f\n\t" *//* debug */
+		       "jl 2f\n\t"
+
+		       "movl %%eax,%%esi\n\t"	/* esi = (low) BGR0 (high) */
+		       "shll $24,%%eax\n\t"	/* eax = 000B */
+		       "orl %%eax,%%esi\n\t"	/* esi = BGRB */
+
+		       "movl %%esi,%%eax\n\t"
+		       "shrl $8,%%eax\n\t"	/* eax = GRB0 */
+		       "movl %%eax,%%ecx\n\t"
+		       "shll $24,%%ecx\n\t"	/* ecx = 000G */
+		       "orl %%ecx,%%eax\n\t"	/* eax = GRBG */
+
+		       "movl %%esi,%%ecx\n\t"
+		       "shll $8,%%ecx\n\t"	/* ecx = 0BGR */
+		       "movb %%ah,%%cl\n\t"	/* ecx = RBGR */
+
+		       "cmpl $16,%%edx\n\t"
+		       "jl 1f\n\t"
+		       "jmp 5f\n\t"
+		       ".align 4,0x90\n\t"
+
+		       "5:\n\t"	/* loop unrolling */
+		       "movl %%esi,(%%edi)\n\t"	/* write BGRB */
+		       "movl %%eax,4(%%edi)\n\t"	/* write GRBG */
+		       "movl %%ecx,8(%%edi)\n\t"	/* write RBGR */
+		       "movl %%esi,12(%%edi)\n\t"
+		       "movl %%eax,16(%%edi)\n\t"
+		       "movl %%ecx,20(%%edi)\n\t"
+		       "movl %%esi,24(%%edi)\n\t"
+		       "movl %%eax,28(%%edi)\n\t"
+		       "movl %%ecx,32(%%edi)\n\t"
+		       "movl %%esi,36(%%edi)\n\t"
+		       "subl $16,%%edx\n\t"	/* blend end-of-loop instr. */
+		       "movl %%eax,40(%%edi)\n\t"
+		       "movl %%ecx,44(%%edi)\n\t"
+		       "addl $48,%%edi\n\t"
+		       "cmpl $16,%%edx\n\t"
+		       "jge 5b\n\t"
+		       "andl %%edx,%%edx\n\t"
+		       "jz 4f\n\t"	/* finished */
+		       "cmpl $4,%%edx\n\t"
+		       "jl 2f\n\t"	/* less than 4 pixels left */
+		       "jmp 1f\n\t"
+		       ".align 4,0x90\n\t"
+
+		       "1:\n\t"
+		       "movl %%esi,(%%edi)\n\t"	/* write BGRB */
+		       "movl %%eax,4(%%edi)\n\t"	/* write GRBG */
+		       "movl %%ecx,8(%%edi)\n\t"	/* write RBGR */
+		       "addl $12,%%edi\n\t"
+		       "subl $4,%%edx\n\t"
+		       "cmpl $4,%%edx\n\t"
+		       "jge 1b\n\t"
+
+		       "2:\n\t"
+		       "cmpl $0,%%edx\n\t"	/* none left? */
+		       "jle 4f\n\t"	/* finished */
+
+		       "mov %%ecx,%%eax\n\t"
+		       "shrl $8,%%ecx\n\t"	/* R in cl */
+
+		       "3:\n\t"	/* write last few pixels */
+		       "movw %%cx,(%%edi)\n\t"	/* write BG */
+		       "movb %%al,2(%%edi)\n\t"	/* write R */
+		       "addl $3,%%edi\n\t"
+		       "decl %%edx\n\t"
+		       "jnz 3b\n\t"
+
+		       "4:\n\t"
+		       : "=a"(dummy1), "=D"(dummy2), "=d"(dummy3)	/* fake outputs */
+		       : "0"(c), "1"(s), "2"(count)
+		       : /***rjr***"ax",*/ "cx", /*"dx",*/ "si"/*, "di"*/
+    );
+  return s;
 }
 
-/* Functions defined in mem.S */
-
-extern memcpy4to3 (void *dest, void *src, int n);
-extern memcpy32shift8 (void *dest, void *src, int n);
-
 /* Functions for which arguments must be passed in %ebx, %edx, and %ecx. */
 
-extern __memcpyasm_regargs ();	/* nu_bytes >= 3 */
-extern __memcpyasm_regargs_aligned ();	/* nu_bytes >= 32 */
+#if 0	/* Why declare 'em? Just confuses the compiler and can't be called from C
+	   anyway */
+extern __memcpyasm_regargs();	/* nu_bytes >= 3 */
+extern __memcpyasm_regargs_aligned();	/* nu_bytes >= 32 */
+#endif
 
 /* Always 32-bit align destination, even for a small number of bytes. */
-static INLINE void *
-__memcpy_aligndest (void *dest, const void *src, int n)
+static inline void *
+  __memcpy_aligndest(void *dest, const void *src, int n)
 {
-  __asm__ __volatile__ ("
-	cmpl $3, %%ecx
-	ja 1f
-	call * __memcpy_jumptable (, %%ecx, 4)
-	jmp 2f
-	1:call __memcpyasm_regargs
-	"
-		:
-		:"b" (dest), "d" (src), "c" (n)
-		:"ax", "0", "1", "2");
+  __asm__ __volatile__("cmpl $3, %%ecx\n\t"
+		       "ja 1f\n\t"
+		       "call * __memcpy_jumptable (, %%ecx, 4)\n\t"
+		       "jmp 2f\n\t"
+		       "1:call __memcpyasm_regargs\n\t"
+		       "2:":
+		       :"S"(dest), "d"(src), "c"(n)
+		       :"ax", "0", "1", "2");
+  return dest;
 }
 
 /* Optimized version for 32-bit aligned destination. */
-static INLINE void *
-__memcpy_destaligned (void *dest, const void *src, int n)
+static inline void *
+  __memcpy_destaligned(void *dest, const void *src, int n)
 {
-  __asm__ __volatile__ ("
-	cmpl $32, %%ecx
-	ja 1f
-	call * __memcpy_jumptable (, %%ecx, 4)
-	jmp 2f
-	1:call __memcpyasm_regargs_aligned
-	2:
-	"
-		:
-		:"b" (dest), "d" (src), "c" (n)
-		:"ax", "0", "1", "2");
+  __asm__ __volatile__("cmpl $32, %%ecx\n\t"
+		       "ja 1f\n\t"
+		       "call * __memcpy_jumptable (, %%ecx, 4)\n\t"
+		       "jmp 2f\n\t"
+		       "1:call __memcpyasm_regargs_aligned\n\t"
+		       "2:\n\t":
+		       :"S"(dest), "d"(src), "c"(n)
+		       :"ax", "0", "1", "2");
+  return dest;
 }
 
-/* Balanced INLINE memcpy; 32-bit align destination if nu_bytes >= 20. */
-static INLINE void *
-__memcpy_balanced (void *dest, const void *src, int n)
+/* Balanced inline memcpy; 32-bit align destination if nu_bytes >= 20. */
+static inline void *
+  __memcpy_balanced(void *dest, const void *src, int n)
 {
-  __asm__ __volatile__ ("
-	cmpl $19, %%ecx
-	ja 1f
-	call * __memcpy_jumptable (, %%ecx, 4)
-	jmp 2f
-	1:call __memcpyasm_regargs
-	2:
-	"
-		:
-		:"b" ((long) dest), "d" ((long) src), "c" ((long) n)
-		:"ax", "bx", "cx", "dx");
+  __asm__ __volatile__("cmpl $19, %%ecx\n\t"
+		       "ja 1f\n\t"
+		       "call * __memcpy_jumptable (, %%ecx, 4)\n\t"
+		       "jmp 2f\n\t"
+		       "1:call __memcpyasm_regargs\n\t"
+		       "2:\n\t"
+		       :
+		       :"S"((long) dest), "d"((long) src), "c"((long) n)
+		       :"ax", "0", "1", "2");
+  return dest;
 }
 
 #define __memcpy __memcpy_conventional
 #endif
+
+/* Functions defined in mem.S or mem.c */
+
+extern void __memcpy4to3(void *dest, void *src, int n);
+extern void __memcpy32shift8(void *dest, void *src, int n);
+
+#endif
+
diff -ur koules1.4/xlib/shmbitmap.c koules1.4-gcc3/xlib/shmbitmap.c
--- koules1.4/xlib/shmbitmap.c	1998-03-04 19:59:19.000000000 +0100
+++ koules1.4-gcc3/xlib/shmbitmap.c	2003-04-23 01:11:02.000000000 +0200
@@ -139,23 +139,37 @@
       count = *dp++;
       /* __memcpy gives severe bug here */
       if (y >= ny)
+	{
 	if (x >= nx)
+	  {
 	  if (x + count > __clipx2 + 1)
 	    {
 	      if (x <= __clipx2)
-		__memcpyb (vp, dp, __clipx2 - x + 1);
+		{
+		  __memcpyb (vp, dp, __clipx2 - x + 1);
+		}
 	    }
 	  else
-	    __memcpyb (vp, dp, count);
+	    {
+	      __memcpyb (vp, dp, count);
+	    }
+	  }
 	else if (x + count > __clipx1)
+	  {
 	  if (x + count > __clipx2 + 1)
+	    {
 	    __memcpyb (vp + __clipx1 - x, dp + __clipx1 - x,
 		       __clipx2 - __clipx1 + 1);
+	    }
 	  else
+	    {
 	    __memcpy (vp + __clipx1 - x, dp + __clipx1 - x,
 		      count - __clipx1 + x);
+	    }
+	  }
+	}
       x += count;
       vp += count;
       dp += count;
@@ -224,11 +238,7 @@
 
 /*following routines are ripped from vgagl library */
 
-/* We use the 32-bit to 64-bit multiply and 64-bit to 32-bit divide of the */
-/* 386 (which gcc doesn't know well enough) to efficiently perform integer */
-/* scaling without having to worry about overflows. */
 #define swap(x, y) { int temp = x; x = y; y = temp; }
-#define setpixel (*(backscreen->ff.driver_setpixel_func))
 #undef __clipx2
 #define __clipx2 (MAPWIDTH-1)
 #undef __clipx1
@@ -237,23 +247,15 @@
 #define __clipy1 0
 #undef __clipy2
 #define __clipy2 (MAPHEIGHT+19)
-#ifdef __i386__
+
 static INLINE int
-muldiv64 (int CONST m1, int CONST m2, int CONST d)
+muldiv64(int m1, int m2, int d)
 {
-/* int32 * int32 -> int64 / int32 -> int32 */
-  int result;
-  __asm__ (
-	    "imull %%edx\n\t"
-	    "idivl %3\n\t"
-: "=a" (result)			/* out */
-: "a" (m1), "d" (m2), "g" (d)	/* in */
-: "ax", "dx"			/* mod */
-  );
-  return result;
+  return (float) m1 * (float) m2 / ((float) d);
 }
 
-#define INC_IF_NEG(y) \
+#ifdef __i386__
+#define INC_IF_NEG(y, result) \
 { \
 	__asm__("btl $31,%1\n\t" \
 		"adcl $0,%0" \
@@ -264,20 +266,20 @@
 static INLINE int
 gl_regioncode (CONST int x, CONST int y)
 {
-  int dx1, dx2, dy1, dy2;
-  int result;
+    int dx1, dx2, dy1, dy2;
+    int result;
   result = 0;
   dy2 = __clipy2 - y;
-  INC_IF_NEG (dy2);
+  INC_IF_NEG (dy2, result);
   result <<= 1;
   dy1 = y - __clipy1;
-  INC_IF_NEG (dy1);
+  INC_IF_NEG (dy1, result);
   result <<= 1;
   dx2 = __clipx2 - x;
-  INC_IF_NEG (dx2);
+  INC_IF_NEG (dx2, result);
   result <<= 1;
   dx1 = x - __clipx1;
-  INC_IF_NEG (dx1);
+  INC_IF_NEG (dx1, result);
   return result;
 }
@@ -287,7 +289,7 @@
 static INLINE int
 gl_regioncode (CONST int x, CONST int y)
 {
-  int result = 0;
+    int result = 0;
   if (x < 0)
     result |= 1;
   else if (x > __clipx2)
@@ -300,15 +302,44 @@
 }
 #endif
 
-/* Partly based on vgalib by Tommy Frandsen */
-/* This would be a lot faster if setpixel was inlined */
+#define line_loop_linear_a(m,i,u,v) \
+  { \
+    int d = ay - (ax >> 1); \
+    if ((x = abs (dx))) \
+      do { \
+	i; \
+	if (d m 0) { \
+	  vp v; \
+	  d -= ax; \
+	} \
+	vp u; \
+	d += ay; \
+      } while (--x); \
+  }
+
+#define line_loop_linear_b(m,i,u,v) \
+  { \
+    int d = ax - (ay >> 1); \
+    if ((y = abs (dy))) \
+      do { \
+	i; \
+	if (d m 0) { \
+	  vp u; \
+	  d -= ay; \
+	} \
+	vp v; \
+	d += ax; \
+      } while (--y); \
+  }
+
+/* Partly based on the work which was partly based on vgalib by Tommy Frandsen */
+/* This is a lot faster now that setpixel is inlined */
 void
 Line (int x1, int y1, int x2, int y2, int c)
 {
-  int dx, dy, ax, ay, sx, sy, x, y;
-  int syp;
-  char *point;
+  int dx, dy, ax, ay, sx, sy, x, y;
+  unsigned char *vp = NULL;
   if (!shm)
     {
       qLine (x1, y1, x2, y2, c);
@@ -319,8 +350,8 @@
   if (Clipping)
     for (;;)
       {
-	int r1 = gl_regioncode (x1, y1);
-	int r2 = gl_regioncode (x2, y2);
+	int r1 = gl_regioncode (x1, y1);
+	int r2 = gl_regioncode (x2, y2);
 	if (!(r1 | r2))
 	  break;		/* completely inside */
 	if (r1 & r2)
@@ -333,38 +364,22 @@
 	  }
 	if (r1 & 1)
 	  {			/* left */
-#ifdef __i386__
 	    y1 += muldiv64 (__clipx1 - x1, y2 - y1, x2 - x1);
-#else
-	    y1 += (long) (__clipx1 - x1) * (long) (y2 - y1) / (long) (x2 - x1);
-#endif
 	    x1 = __clipx1;
 	  }
 	else if (r1 & 2)
 	  {			/* right */
-#ifdef __i386__
 	    y1 += muldiv64 (__clipx2 - x1, y2 - y1, x2 - x1);
-#else
-	    y1 += (long) (__clipx2 - x1) * (long) (y2 - y1) / (long) (x2 - x1);
-#endif
 	    x1 = __clipx2;
 	  }
 	else if (r1 & 4)
 	  {			/* top */
-#ifdef __i386__
 	    x1 += muldiv64 (__clipy1 - y1, x2 - x1, y2 - y1);
-#else
-	    x1 += (long) (__clipy1 - y1) * (long) (x2 - x1) / (long) (y2 - y1);
-#endif
 	    y1 = __clipy1;
 	  }
 	else if (r1 & 8)
 	  {			/* bottom */
-#ifdef __i386__
 	    x1 += muldiv64 (__clipy2 - y1, x2 - x1, y2 - y1);
-#else
-	    x1 += (long) (__clipy2 - y1) * (long) (x2 - x1) / (long) (y2 - y1);
-#endif
 	    y1 = __clipy2;
 	  }
       }
@@ -377,45 +392,66 @@
 
   x = x1;
   y = y1;
-  point = VScreenToBuffer (backscreen) + x + y * MAPWIDTH;
+
+#define insert_pixel_1 *((unsigned char *) vp) = c;
+
+  vp = VScreenToBuffer (backscreen) + y * MAPWIDTH + x;
 
   if (ax > ay)
     {
-      int d = ay - (ax >> 1);
-      syp = sy * MAPWIDTH;
-      while (x != x2)
+      if(sx > 0)
 	{
-	  *point = c;
-	  if (d > 0 || (d == 0 && sx == 1))
-	    {
-	      y += sy;
-	      point += syp;
-	      d -= ax;
-	    }
-	  x += sx;
-	  point += sx;
-	  d += ay;
+	  line_loop_linear_a(>=,insert_pixel_1,++,+=MAPWIDTH*sy);
+	}
+      else
+	{
+	  line_loop_linear_a(>,insert_pixel_1,--,+=MAPWIDTH*sy);
 	}
     }
   else
     {
-      int sy = (dy >= 0) ? 1 : -1;
-      int d = ax - (ay >> 1);
-      syp = sy * MAPWIDTH;
-      while (y != y2)
+      if(sy > 0)
+	{
+	  line_loop_linear_b(>=,insert_pixel_1,+=sx,+=MAPWIDTH);
+	}
+      else
 	{
-	  *(point) = c;
-	  if (d > 0 || (d == 0 && sy == 1))
+	  line_loop_linear_b(>,insert_pixel_1,+=sx,-=MAPWIDTH);
+	}
+    }
+  insert_pixel_1;
+
+  if (!vp)
+    {
+      if (ax > ay)
+	{
+	  int d = ay - (ax >> 1);
+	  while (x != x2)
 	    {
+	      insert_pixel_1;
+	      if (d > 0 || (d == 0 && sx == 1))
+		{
+		  y += sy;
+		  d -= ax;
+		}
 	      x += sx;
-	      point += sx;
-	      d -= ay;
+	      d += ay;
+	    }
+	}
+      else
+	{
+	  int d = ax - (ay >> 1);
+	  while (y != y2)
+	    {
+	      insert_pixel_1;
+	      if (d > 0 || (d == 0 && sy == 1))
+		{
+		  x += sx;
+		  d -= ay;
+		}
+	      y += sy;
+	      d += ax;
 	    }
-	  y += sy;
-	  point += syp;
-	  d += ax;
 	}
+      insert_pixel_1;
     }
-  *(point) = c;
-  point++;
 }
 #endif
--- koules1.4/Iconfig	2003-07-12 00:20:13.000000000 -0400
+++ koules1.4-gcc3/Iconfig	2003-07-12 00:20:45.000000000 -0400
@@ -36,7 +36,7 @@
 /* directories*/
 KOULESDIR =/usr/bin/X11
 SOUNDDIR =/usr/local/lib/koules
-MANDIR =/usr/local/man/man6
+MANDIR =/usr/share/man/man6
 
 /*You need some extra libraryes for BSD sockets compatibility?*/
 /* TOP_INCLUDES = /* Sun users with GCC need this */
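
A note on the recurring idiom in the inlstring.h hunks above: GCC 3 rejects
inline asm in which a register appears both as an input operand and in the
clobber list, which is exactly what the old koules code did (e.g. input
"c" (count) together with clobber "cx").  The fix used throughout the patch
is to declare the registers the asm modifies as dummy ("fake") outputs and
tie them to the former inputs with "0"/"1" matching constraints.  A minimal
standalone sketch of that idiom, assuming i386 and a GNU toolchain --
zero_bytes is an illustrative name, not a function from this patch, and the
"memory" clobber is added here for safety rather than taken from the patch:

#include <stddef.h>

static inline void *
zero_bytes (void *s, size_t count)
{
  void *d0;	/* dummy outputs absorb the registers the asm changes */
  size_t d1;
  __asm__ __volatile__ ("cld\n\t"
                        "rep ; stosb"
                        : "=D" (d0), "=c" (d1)	/* fake outputs */
                        : "a" (0), "0" (s), "1" (count)
                        : "memory");
  return s;
}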
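Similarly, the muldiv64 hunk trades the removed imull/idivl asm for
floating-point scaling.  A hedged aside, not code from the patch: GCC 3 can
generate the 64-bit intermediate itself, so the overflow-safe
32x32 -> 64 / 32 -> 32 scaling can also stay in integer arithmetic.
muldiv64_ll below is an illustrative alternative; float carries only 24
mantissa bits, so the float version can lose precision for very large
coordinates, while the long long form matches the original asm exactly:

static inline int
muldiv64_ll (int m1, int m2, int d)
{
  /* 32x32 -> 64-bit multiply, then 64/32 -> 32-bit divide */
  return (int) ((long long) m1 * (long long) m2 / (long long) d);
}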