summaryrefslogtreecommitdiff
path: root/src/win95
diff options
context:
space:
mode:
Diffstat (limited to 'src/win95')
-rw-r--r--src/win95/inline.h720
-rw-r--r--src/win95/plspecfn.c180
2 files changed, 723 insertions, 177 deletions
diff --git a/src/win95/inline.h b/src/win95/inline.h
index f09e79a..6054f29 100644
--- a/src/win95/inline.h
+++ b/src/win95/inline.h
@@ -1215,9 +1215,9 @@ fptmp = (b); \
FloatToInt(); \
a = itmp;}
-#else /* other compiler ? */
+#else
-/* #error "Unknown compiler" */
+#if 0
void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a);
void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
@@ -1240,6 +1240,722 @@ void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m);
int FloatToInt(float);
#define f2i(a, b) { a = FloatToInt(b); }
+#endif
+
+/* ADD */
+
+static __inline__ void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov edi,b
+ mov ebx,c
+ mov eax,[esi]
+ mov edx,[esi+4]
+ add eax,[edi]
+ adc edx,[edi+4]
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl 0(%%edi), %%eax \n\t"
+ "adcl 4(%%edi), %%edx \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "S" (a), "D" (b), "b" (c)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+/*
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl 0(%%edi), %%eax \n\t"
+ "adcl 4(%%edi), %%edx \n\t"
+ : "=a" (c->lo32), "=d" (c->hi32)
+ : "S" (a), "D" (b)
+ );
+*/
+}
+
+/* ADD ++ */
+
+static __inline__ void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov edi,c
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ add [edi],eax
+ adc [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl %%eax, 0(%%edi) \n\t"
+ "adcl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (c), "S" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/* SUB */
+
+static __inline__ void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov edi,b
+ mov ebx,c
+ mov eax,[esi]
+ mov edx,[esi+4]
+ sub eax,[edi]
+ sbb edx,[edi+4]
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "subl 0(%%edi), %%eax \n\t"
+ "sbbl 4(%%edi), %%edx \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "S" (a), "D" (b), "b" (c)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/* SUB -- */
+
+static __inline__ void SUB_LL_MM(LONGLONGCH *c, LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov edi,c
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ sub [edi],eax
+ sbb [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "subl %%eax, 0(%%edi) \n\t"
+ "sbbl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (c), "S" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/*
+
+ MUL
+
+ This is the multiply we use, the 32 x 32 = 64 widening version
+
+*/
+
+static __inline__ void MUL_I_WIDE(int a, int b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov eax,a
+ mov ebx,c
+ imul b
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+__asm__("imull %0 \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "a" (a), "b" (c), "q" (b)
+ : "%edx", "memory", "cc"
+ );
+}
+
+/*
+
+ CMP
+
+ This substitutes for ==, >, <, >=, <=
+
+*/
+
+static __inline__ int CMP_LL(LONGLONGCH *a, LONGLONGCH *b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov ebx,a
+ mov ecx,b
+ mov eax,[ebx]
+ mov edx,[ebx+4]
+ sub eax,[ecx]
+ sbb edx,[ecx+4]
+ and edx,edx
+ jne llnz
+ and eax,eax
+ je llgs
+ llnz:
+ mov retval,1
+ and edx,edx
+ jge llgs
+ neg retval
+ llgs:
+ }
+*/
+/* TODO */
+__asm__("xorl %0, %0 \n\t"
+ "movl 0(%%ebx), %%eax \n\t"
+ "movl 4(%%ebx), %%edx \n\t"
+ "subl 0(%%ecx), %%eax \n\t"
+ "sbbl 4(%%ecx), %%edx \n\t"
+ "andl %%edx, %%edx \n\t"
+ "jne llnz \n\t"
+ "andl %%eax, %%eax \n\t"
+ "je llgs \n"
+"llnz: \n\t"
+ "movl $1, %0 \n\t"
+ "andl %%edx, %%edx \n\t"
+ "jge llgs \n\t"
+ "negl %0 \n"
+"llgs: \n\t"
+ : "=r" (retval)
+ : "b" (a), "c" (b)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+ return retval;
+}
+
+/* EQUALS */
+
+static __inline__ void EQUALS_LL(LONGLONGCH *a, LONGLONGCH *b)
+{
+/*
+ _asm
+ {
+ mov edi,a
+ mov esi,b
+ mov eax,[esi]
+ mov edx,[esi+4]
+ mov [edi],eax
+ mov [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "movl %%eax, 0(%%edi) \n\t"
+ "movl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (a), "S" (b)
+ : "%eax", "%edx", "memory"
+ );
+}
+
+/* NEGATE */
+
+static __inline__ void NEG_LL(LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ not dword ptr[esi]
+ not dword ptr[esi+4]
+ add dword ptr[esi],1
+ adc dword ptr[esi+4],0
+ }
+*/
+__asm__("notl 0(%%esi) \n\t"
+ "notl 4(%%esi) \n\t"
+ "addl $1, 0(%%esi) \n\t"
+ "adcl $0, 4(%%esi) \n\t"
+ :
+ : "S" (a)
+ : "memory", "cc"
+ );
+}
+
+/* ASR */
+
+static __inline__ void ASR_LL(LONGLONGCH *a, int shift)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov eax,shift
+ and eax,eax
+ jle asrdn
+ asrlp:
+ sar dword ptr[esi+4],1
+ rcr dword ptr[esi],1
+ dec eax
+ jne asrlp
+ asrdn:
+ }
+*/
+__asm__("andl %%eax, %%eax \n\t"
+ "jle asrdn \n"
+"asrlp: \n\t"
+ "sarl $1, 4(%%esi) \n\t"
+ "rcrl $1, 0(%%esi) \n\t"
+ "decl %%eax \n\t"
+ "jne asrlp \n"
+"asrdn: \n\t"
+ :
+ : "S" (a), "a" (shift)
+ : "memory", "cc"
+ );
+
+}
+
+/* Convert int to LONGLONGCH */
+
+static __inline__ void IntToLL(LONGLONGCH *a, int *b)
+{
+/*
+ _asm
+ {
+ mov esi,b
+ mov edi,a
+ mov eax,[esi]
+ cdq
+ mov [edi],eax
+ mov [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "cdq \n\t"
+ "movl %%eax, 0(%%edi) \n\t"
+ "movl %%edx, 4(%%edi) \n\t"
+ :
+ : "S" (b), "D" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+}
+
+/*
+
+ Fixed Point Multiply.
+
+
+ 16.16 * 16.16 -> 16.16
+ or
+ 16.16 * 0.32 -> 0.32
+
+ A proper version of this function ought to read
+ 16.16 * 16.16 -> 32.16
+ but this would require a long long result
+
+ Algorithm:
+
+ Take the mid 32 bits of the 64 bit result
+
+*/
+
+/*
+ These functions have been checked for suitability for
+ a Pentium and look as if they would work adequately.
+ Might be worth a more detailed look at optimising
+ them though.
+*/
+
/*
 * 16.16 fixed-point multiply: returns (a * b) >> 16.
 *
 * imull produces the full 64-bit product in edx:eax, so the
 * intermediate cannot overflow; shrd then extracts bits 16..47.
 *
 * BUG FIX: the previous template used "imull %0", but operand 0 is the
 * eax output/input, so it computed (a * a) >> 16 and ignored b
 * entirely.  Operand 2 is b.
 */
static __inline__ int MUL_FIXED(int a, int b)
{
	int retval;

	__asm__("imull %2                \n\t"   /* edx:eax = a * b        */
		"shrdl $16, %%edx, %%eax \n\t"   /* eax = product >> 16    */
		: "=a" (retval)
		: "a" (a), "rm" (b)
		: "%edx", "cc");

	return retval;
}
+
+/*
+
+ Fixed Point Divide - returns a / b
+
+*/
+
/*
 * 16.16 fixed-point divide: returns (a << 16) / b.
 *
 * The numerator is widened to the 64-bit edx:eax pair before idiv:
 * cdq supplies the sign, then the rotate/mov/xor sequence shifts a
 * left by 16 across the register pair.
 *
 * BUG FIX: the previous template used "idivl %0", but operand 0 is the
 * eax output, i.e. it divided by the numerator register instead of b.
 * Operand 2 is b.
 */
static __inline__ int DIV_FIXED(int a, int b)
{
	int retval;

	__asm__("cdq             \n\t"   /* edx = sign extension of a      */
		"roll $16, %%eax \n\t"   /* swap the halves of a           */
		"mov %%ax, %%dx  \n\t"   /* dx = high 16 bits of a         */
		"xor %%ax, %%ax  \n\t"   /* edx:eax = (64-bit)a << 16      */
		"idivl %2        \n\t"   /* eax = (a << 16) / b            */
		: "=a" (retval)
		: "a" (a), "rm" (b)
		: "%edx", "cc");

	return retval;
}
+
+/*
+
+ Multiply and Divide Functions.
+
+*/
+
+
+/*
+
+ 32/32 division
+
+ This macro is a function on some other platforms
+
+*/
+
+#define DIV_INT(a, b) ((a) / (b))
+
+/*
+
+ A Narrowing 64/32 Division
+
+*/
+
+static __inline__ int NarrowDivide(LONGLONGCH *a, int b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ idiv b
+ mov retval,eax
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "idivl %0 \n\t"
+ : "=a" (retval)
+ : "S" (a), "q" (b)
+ : "%edx", "cc"
+ );
+ return retval;
+}
+
+/*
+
+ This function performs a Widening Multiply followed by a Narrowing Divide.
+
+ a = (a * b) / c
+
+*/
+
/*
 * Widening multiply followed by a narrowing divide: returns (a * b) / c
 * with the intermediate product held in 64 bits (edx:eax), so a * b may
 * exceed 32 bits as long as the final quotient fits.
 *
 * BUG FIXES vs the previous version:
 *  - "imull %0" / "idivl %1" both resolved to eax (the output and the
 *    a input), computing (a * a) / a; the correct operands are
 *    %2 (b) and %3 (c).
 *  - edx is written by imull and consumed/overwritten by idivl but was
 *    missing from the clobber list, so gcc could have placed b or c there.
 */
static __inline__ int WideMulNarrowDiv(int a, int b, int c)
{
	int retval;

	__asm__("imull %2 \n\t"   /* edx:eax = a * b (widening) */
		"idivl %3 \n\t"   /* eax = (a * b) / c          */
		: "=a" (retval)
		: "a" (a), "rm" (b), "rm" (c)
		: "%edx", "cc");

	return retval;
}
+
+/*
+
+ Function to rotate a VECTORCH using a MATRIXCH
+
+ This is the C function
+
+ x = MUL_FIXED(m->mat11, v->vx);
+ x += MUL_FIXED(m->mat21, v->vy);
+ x += MUL_FIXED(m->mat31, v->vz);
+
+ y = MUL_FIXED(m->mat12, v->vx);
+ y += MUL_FIXED(m->mat22, v->vy);
+ y += MUL_FIXED(m->mat32, v->vz);
+
+ z = MUL_FIXED(m->mat13, v->vx);
+ z += MUL_FIXED(m->mat23, v->vy);
+ z += MUL_FIXED(m->mat33, v->vz);
+
+ v->vx = x;
+ v->vy = y;
+ v->vz = z;
+
+ This is the MUL_FIXED inline assembler function
+
+ imul edx
+ shrd eax,edx,16
+
+
+typedef struct matrixch {
+
+ int mat11; 0
+ int mat12; 4
+ int mat13; 8
+
+ int mat21; 12
+ int mat22; 16
+ int mat23; 20
+
+ int mat31; 24
+ int mat32; 28
+ int mat33; 32
+
+} MATRIXCH;
+
+*/
+
#if 0 /* TODO if these are needed */
/*
 * NOTE(review): the two routines below are the original Watcom/MSVC
 * inline-assembler vector rotators, kept compiled-out as a reference
 * for a future GCC port.  The _asm { } syntax does not build under GCC,
 * so this whole region must stay inside #if 0.  Each routine computes
 * the three rows of m * v using the MUL_FIXED idiom (imul; shrd 16),
 * per the matrix field offsets documented in the comment block above.
 * NOTE(review): ebp is used as a scratch register, which assumes the
 * compiler emitted a frame pointer - verify before re-enabling.
 */
static void RotateVector_ASM(VECTORCH *v, MATRIXCH *m)
{
	_asm
	{
		mov esi,v
		mov edi,m

		/* x row: mat11*vx + mat21*vy + mat31*vz -> ecx */
		mov eax,[edi + 0]
		imul DWORD PTR [esi + 0]
		shrd eax,edx,16
		mov ecx,eax
		mov eax,[edi + 12]
		imul DWORD PTR [esi + 4]
		shrd eax,edx,16
		add ecx,eax
		mov eax,[edi + 24]
		imul DWORD PTR [esi + 8]
		shrd eax,edx,16
		add ecx,eax

		/* y row: mat12*vx + mat22*vy + mat32*vz -> ebx */
		mov eax,[edi + 4]
		imul DWORD PTR [esi + 0]
		shrd eax,edx,16
		mov ebx,eax
		mov eax,[edi + 16]
		imul DWORD PTR [esi + 4]
		shrd eax,edx,16
		add ebx,eax
		mov eax,[edi + 28]
		imul DWORD PTR [esi + 8]
		shrd eax,edx,16
		add ebx,eax

		/* z row: mat13*vx + mat23*vy + mat33*vz -> ebp */
		mov eax,[edi + 8]
		imul DWORD PTR [esi + 0]
		shrd eax,edx,16
		mov ebp,eax
		mov eax,[edi + 20]
		imul DWORD PTR [esi + 4]
		shrd eax,edx,16
		add ebp,eax
		mov eax,[edi + 32]
		imul DWORD PTR [esi + 8]
		shrd eax,edx,16
		add ebp,eax

		/* write the rotated vector back over *v */
		mov [esi + 0],ecx
		mov [esi + 4],ebx
		mov [esi + 8],ebp
	}
}

/*

 Here is the same function, this time copying the result to a second vector

*/

static void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m)
{
	_asm
	{
		mov esi,v1
		mov edi,m

		/* x row -> ecx */
		mov eax,[edi + 0]
		imul DWORD PTR [esi + 0]
		shrd eax,edx,16
		mov ecx,eax
		mov eax,[edi + 12]
		imul DWORD PTR [esi + 4]
		shrd eax,edx,16
		add ecx,eax
		mov eax,[edi + 24]
		imul DWORD PTR [esi + 8]
		shrd eax,edx,16
		add ecx,eax

		/* y row -> ebx */
		mov eax,[edi + 4]
		imul DWORD PTR [esi + 0]
		shrd eax,edx,16
		mov ebx,eax
		mov eax,[edi + 16]
		imul DWORD PTR [esi + 4]
		shrd eax,edx,16
		add ebx,eax
		mov eax,[edi + 28]
		imul DWORD PTR [esi + 8]
		shrd eax,edx,16
		add ebx,eax

		/* z row -> ebp */
		mov eax,[edi + 8]
		imul DWORD PTR [esi + 0]
		shrd eax,edx,16
		mov ebp,eax
		mov eax,[edi + 20]
		imul DWORD PTR [esi + 4]
		shrd eax,edx,16
		add ebp,eax
		mov eax,[edi + 32]
		imul DWORD PTR [esi + 8]
		shrd eax,edx,16
		add ebp,eax

		/* write the rotated vector into *v2; *v1 is left untouched */
		mov edx,v2
		mov [edx + 0],ecx
		mov [edx + 4],ebx
		mov [edx + 8],ebp
	}
}
#endif
+
+#if (SupportFPMathsFunctions || SupportFPSquareRoot)
+
+/*
+
+ Square Root
+
+ Returns the Square Root of a 32-bit number
+
+*/
+
+extern int sqrt_temp1;
+extern int sqrt_temp2;
+
+static __inline__ int SqRoot32(int A)
+{
+ sqrt_temp1 = A;
+/*
+ _asm
+ {
+ finit
+ fild A
+ fsqrt
+ fistp temp2
+ fwait
+ }
+*/
+
+__asm__("finit \n\t"
+ "fild sqrt_temp1 \n\t"
+ "fsqrt \n\t"
+ "fistp sqrt_temp2 \n\t"
+ "fwait \n\t"
+ :
+ :
+ : "memory", "cc"
+ );
+
+ return sqrt_temp2;
+}
+
+#endif
+
+
+/*
+
+ This may look ugly (it is) but it is a MUCH faster way to convert "float" into "int" than
+ the function call "CHP" used by the WATCOM compiler.
+
+*/
+
+extern float fti_fptmp;
+extern int fti_itmp;
+
+static __inline__ int FloatToInt(float fptmp)
+{
+ fti_fptmp = fptmp;
+/*
+ _asm
+ {
+ fld fptmp
+ fistp itmp
+ }
+*/
+__asm__("fld fti_fptmp \n\t"
+ "fistp fti_itmp \n\t"
+ :
+ :
+ : "memory", "cc"
+ );
+
+ return fti_itmp;
+}
+
+/*
+
+ This macro makes usage of the above function easier and more elegant
+
+*/
+
+#define f2i(a, b) { \
+a = FloatToInt(b); \
+}
#endif
diff --git a/src/win95/plspecfn.c b/src/win95/plspecfn.c
index 26c9527..0efb61e 100644
--- a/src/win95/plspecfn.c
+++ b/src/win95/plspecfn.c
@@ -18,6 +18,11 @@
#include "kshape.h"
#endif
+/* globals from inline.h */
+int sqrt_temp1;
+int sqrt_temp2;
+float fti_fptmp;
+int fti_itmp;
/*
@@ -513,88 +518,6 @@ int WideMul2NarrowDiv(int a, int b, int c, int d, int e)
}
-
-
-
-/*
-
- Square Root
-
- Returns the Square Root of a 32-bit number
-
-*/
-
-#if (SupportFPMathsFunctions || SupportFPSquareRoot)
-#else
-
-
-int SqRoot32(int A)
-
-{
-
- unsigned int edx = A;
- unsigned int ecx;
-
- unsigned int ax = 0;
- unsigned int bx = 0;
- unsigned int di = 0;
-
-
- for(ecx = 15; ecx!=0; ecx--) {
-
- bx <<= 1;
- if(edx & 0x80000000) bx |= 1;
- edx <<= 1;
-
- bx <<= 1;
- if(edx & 0x80000000) bx |= 1;
- edx <<= 1;
-
- ax += ax;
- di = ax;
- di += di;
-
- if(bx > di) {
-
- di++;
- ax++;
-
- bx -= di;
-
- }
-
- }
-
- bx <<= 1;
- if(edx & 0x80000000) bx |= 1;
- edx <<= 1;
-
- bx <<= 1;
- if(edx & 0x80000000) bx |= 1;
- edx <<= 1;
-
- ax += ax;
- di = ax;
- di += di;
-
- if(bx > di) {
-
- ax++;
-
- }
-
- return ((int)ax);
-
-}
-
-
-#endif /* SupportFPMathsFunctions */
-
-
-
-
-
-
/*
Calculate Plane Normal from three POP's
@@ -1115,99 +1038,6 @@ int Magnitude(VECTORCH *v)
}
-
-
-
-
-
-
-
-
-
-/*
-
- 64-bit Square Root returns 32-bit result
-
- All 64-bit operations are now done using the type LONGLONGCH whose format
- varies from platform to platform, although it is always 64-bits in size.
-
- NOTE:
-
- Function currently not available to Watcom C users
- A Floating point version is STRONGLY advised for the PC anyway
-
-*/
-
-#if 0
-int SqRoot64(LONGLONGCH *A)
-
-{
-
-#if 0
-
- unsigned long long edx = *A;
-
- unsigned int eax = 0;
- unsigned int ebx = 0;
- unsigned int edi = 0;
-
- unsigned int ecx;
-
-
- unsigned long long TopBit = 0x8000000000000000LL;
-
- for(ecx = 31; ecx != 0; ecx--) {
-
- ebx <<= 1;
- if(edx & TopBit) ebx |= 1;
- edx <<= 1;
-
- ebx <<= 1;
- if(edx & TopBit) ebx |= 1;
- edx <<= 1;
-
- eax += eax;
- edi = eax;
- edi += edi;
-
- if(ebx > edi) {
-
- edi++;
- eax++;
- ebx -= edi;
-
- }
-
- }
-
- ebx <<= 1;
- if(edx & TopBit) ebx |= 1;
- edx <<= 1;
-
- ebx <<= 1;
- if(edx & TopBit) ebx |= 1;
- edx <<= 1;
-
- eax += eax;
- edi = eax;
- edi += edi;
-
- if(ebx > edi) {
-
- eax++;
-
- }
-
- return eax;
-
-#endif
-
- return (0);
-
-}
-
-#endif /* for #if 0 */
-
/*
Shift the 64-bit value until is LTE the limit