path: root/src/win95/inline.h
author     Steven Fuller <relnev@icculus.org>    2001-07-29 18:25:45 +0000
committer  Patryk Obara <dreamer.tan@gmail.com>  2019-08-20 02:22:36 +0200
commit     5c497c61a656f338eb9599fb5f364ad853d0eccf (patch)
tree       125832fffe333da4ef51de914fd012de54569b4e /src/win95/inline.h
parent     44d4752e83d807cde6aff8260b5aa2acdf77778d (diff)
Implemented (most of) the inline assembly in inline.h
Diffstat (limited to 'src/win95/inline.h')
-rw-r--r--  src/win95/inline.h  720
1 file changed, 718 insertions, 2 deletions
diff --git a/src/win95/inline.h b/src/win95/inline.h
index f09e79a..6054f29 100644
--- a/src/win95/inline.h
+++ b/src/win95/inline.h
@@ -1215,9 +1215,9 @@ fptmp = (b); \
FloatToInt(); \
a = itmp;}
-#else /* other compiler ? */
+#else
-/* #error "Unknown compiler" */
+#if 0
void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a);
void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
@@ -1240,6 +1240,722 @@ void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m);
int FloatToInt(float);
#define f2i(a, b) { a = FloatToInt(b); }
+#endif
+
+/* ADD */
+
+static __inline__ void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov edi,b
+ mov ebx,c
+ mov eax,[esi]
+ mov edx,[esi+4]
+ add eax,[edi]
+ adc edx,[edi+4]
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl 0(%%edi), %%eax \n\t"
+ "adcl 4(%%edi), %%edx \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "S" (a), "D" (b), "b" (c)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+/*
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl 0(%%edi), %%eax \n\t"
+ "adcl 4(%%edi), %%edx \n\t"
+ : "=a" (c->lo32), "=d" (c->hi32)
+ : "S" (a), "D" (b)
+ );
+*/
+}
+
+/* ADD ++ */
+
+static __inline__ void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov edi,c
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ add [edi],eax
+ adc [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl %%eax, 0(%%edi) \n\t"
+ "adcl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (c), "S" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/* SUB */
+
+static __inline__ void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov edi,b
+ mov ebx,c
+ mov eax,[esi]
+ mov edx,[esi+4]
+ sub eax,[edi]
+ sbb edx,[edi+4]
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "subl 0(%%edi), %%eax \n\t"
+ "sbbl 4(%%edi), %%edx \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "S" (a), "D" (b), "b" (c)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/* SUB -- */
+
+static __inline__ void SUB_LL_MM(LONGLONGCH *c, LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov edi,c
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ sub [edi],eax
+ sbb [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "subl %%eax, 0(%%edi) \n\t"
+ "sbbl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (c), "S" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/*
+
+ MUL
+
+ This is the multiply we use, the 32 x 32 = 64 widening version
+
+*/
+
+static __inline__ void MUL_I_WIDE(int a, int b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov eax,a
+ mov ebx,c
+ imul b
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+__asm__("imull %0 \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "a" (a), "b" (c), "q" (b)
+ : "%edx", "memory", "cc"
+ );
+}
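+
+/* For reference, a portable C sketch of the same widening multiply,
+   assuming LONGLONGCH exposes the lo32/hi32 halves used in the
+   commented-out ADD_LL variant above (illustrative only): */
+static __inline__ void MUL_I_WIDE_C(int a, int b, LONGLONGCH *c)
+{
+	long long r = (long long)a * (long long)b;
+
+	c->lo32 = (unsigned int)r;	/* low 32 bits of the 64 bit product */
+	c->hi32 = (int)(r >> 32);	/* high 32 bits */
+}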
+
+/*
+
+ CMP
+
+ This substitutes for ==, >, <, >=, <=
+
+*/
+
+static __inline__ int CMP_LL(LONGLONGCH *a, LONGLONGCH *b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov ebx,a
+ mov ecx,b
+ mov eax,[ebx]
+ mov edx,[ebx+4]
+ sub eax,[ecx]
+ sbb edx,[ecx+4]
+ and edx,edx
+ jne llnz
+ and eax,eax
+ je llgs
+ llnz:
+ mov retval,1
+ and edx,edx
+ jge llgs
+ neg retval
+ llgs:
+ }
+*/
+/* TODO */
+__asm__("xorl %0, %0 \n\t"
+ "movl 0(%%ebx), %%eax \n\t"
+ "movl 4(%%ebx), %%edx \n\t"
+ "subl 0(%%ecx), %%eax \n\t"
+ "sbbl 4(%%ecx), %%edx \n\t"
+ "andl %%edx, %%edx \n\t"
+ "jne llnz \n\t"
+ "andl %%eax, %%eax \n\t"
+ "je llgs \n"
+"llnz: \n\t"
+ "movl $1, %0 \n\t"
+ "andl %%edx, %%edx \n\t"
+ "jge llgs \n\t"
+ "negl %0 \n"
+"llgs: \n\t"
+ : "=r" (retval)
+ : "b" (a), "c" (b)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+ return retval;
+}
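+
+/* Illustrative usage (sketch): CMP_LL returns 0 when *a == *b, 1 when
+   *a > *b and -1 when *a < *b, so the relational operators it replaces
+   become comparisons against 0, e.g.
+
+	if (CMP_LL(&remaining, &threshold) <= 0) { ... }     (remaining <= threshold)
+*/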
+
+/* EQUALS */
+
+static __inline__ void EQUALS_LL(LONGLONGCH *a, LONGLONGCH *b)
+{
+/*
+ _asm
+ {
+ mov edi,a
+ mov esi,b
+ mov eax,[esi]
+ mov edx,[esi+4]
+ mov [edi],eax
+ mov [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "movl %%eax, 0(%%edi) \n\t"
+ "movl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (a), "S" (b)
+ : "%eax", "%edx", "memory"
+ );
+}
+
+/* NEGATE */
+
+static __inline__ void NEG_LL(LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ not dword ptr[esi]
+ not dword ptr[esi+4]
+ add dword ptr[esi],1
+ adc dword ptr[esi+4],0
+ }
+*/
+__asm__("notl 0(%%esi) \n\t"
+ "notl 4(%%esi) \n\t"
+ "addl $1, 0(%%esi) \n\t"
+ "adcl $0, 4(%%esi) \n\t"
+ :
+ : "S" (a)
+ : "memory", "cc"
+ );
+}
+
+/* ASR */
+
+static __inline__ void ASR_LL(LONGLONGCH *a, int shift)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov eax,shift
+ and eax,eax
+ jle asrdn
+ asrlp:
+ sar dword ptr[esi+4],1
+ rcr dword ptr[esi],1
+ dec eax
+ jne asrlp
+ asrdn:
+ }
+*/
+__asm__("andl %%eax, %%eax \n\t"
+ "jle asrdn \n"
+"asrlp: \n\t"
+ "sarl $1, 4(%%esi) \n\t"
+ "rcrl $1, 0(%%esi) \n\t"
+ "decl %%eax \n\t"
+ "jne asrlp \n"
+"asrdn: \n\t"
+ :
+ : "S" (a), "a" (shift)
+ : "memory", "cc"
+ );
+
+}
+
+/* Convert int to LONGLONGCH */
+
+static __inline__ void IntToLL(LONGLONGCH *a, int *b)
+{
+/*
+ _asm
+ {
+ mov esi,b
+ mov edi,a
+ mov eax,[esi]
+ cdq
+ mov [edi],eax
+ mov [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "cdq \n\t"
+ "movl %%eax, 0(%%edi) \n\t"
+ "movl %%edx, 4(%%edi) \n\t"
+ :
+ : "S" (b), "D" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+}
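+
+/* Portable C sketch of the sign extension that cdq performs above, again
+   assuming the lo32/hi32 layout of LONGLONGCH (illustrative only): */
+static __inline__ void IntToLL_C(LONGLONGCH *a, int *b)
+{
+	a->lo32 = (unsigned int)*b;
+	a->hi32 = (*b < 0) ? -1 : 0;	/* replicate the sign bit, as cdq does */
+}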
+
+/*
+
+ Fixed Point Multiply.
+
+
+ 16.16 * 16.16 -> 16.16
+ or
+ 16.16 * 0.32 -> 0.32
+
+ A proper version of this function ought to read
+ 16.16 * 16.16 -> 32.16
+ but this would require a long long result
+
+ Algorithm:
+
+ Take the mid 32 bits of the 64 bit result
+
+*/
+
+/*
+ These functions have been checked for suitability for
+ a Pentium and look as if they would work adequately.
+ Might be worth a more detailed look at optimising
+ them though.
+*/
+
+static __inline__ int MUL_FIXED(int a, int b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov eax,a
+ imul b
+ shrd eax,edx,16
+ mov retval,eax
+ }
+*/
+/* TODO */
+__asm__("imull %0 \n\t"
+ "shrdl $16, %%edx, %%eax \n\t"
+ : "=a" (retval)
+ : "a" (a), "q" (b)
+ : "%edx", "cc"
+ );
+ return retval;
+}
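+
+/* Equivalent portable C sketch of the 16.16 multiply above, using a 64 bit
+   intermediate instead of the edx:eax pair (illustrative only): */
+static __inline__ int MUL_FIXED_C(int a, int b)
+{
+	/* keep bits 16..47 of the 64 bit product, as shrd does */
+	return (int)(((long long)a * (long long)b) >> 16);
+}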
+
+/*
+
+ Fixed Point Divide - returns a / b
+
+*/
+
+static __inline__ int DIV_FIXED(int a, int b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov eax,a
+ cdq
+ rol eax,16
+ mov dx,ax
+ xor ax,ax
+ idiv b
+ mov retval,eax
+ }
+*/
+/* TODO */
+__asm__("cdq \n\t"
+ "roll $16, %%eax \n\t"
+ "mov %%ax, %%dx \n\t"
+ "xor %%ax, %%ax \n\t"
+ "idivl %0 \n\t"
+ : "=a" (retval)
+ : "a" (a), "q" (b)
+ : "%edx", "cc"
+ );
+ return retval;
+}
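+
+/* Portable C sketch of the 16.16 divide above; like the assembly it will
+   fault on a zero divisor or an overflowing quotient (illustrative only): */
+static __inline__ int DIV_FIXED_C(int a, int b)
+{
+	return (int)(((long long)a << 16) / b);
+}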
+
+/*
+
+ Multiply and Divide Functions.
+
+*/
+
+
+/*
+
+ 32/32 division
+
+ This macro is a function on some other platforms
+
+*/
+
+#define DIV_INT(a, b) ((a) / (b))
+
+/*
+
+ A Narrowing 64/32 Division
+
+*/
+
+static __inline__ int NarrowDivide(LONGLONGCH *a, int b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ idiv b
+ mov retval,eax
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "idivl %0 \n\t"
+ : "=a" (retval)
+ : "S" (a), "q" (b)
+ : "%edx", "cc"
+ );
+ return retval;
+}
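+
+/* Portable C sketch of the narrowing 64/32 divide above, assuming the
+   lo32/hi32 layout of LONGLONGCH (illustrative only): */
+static __inline__ int NarrowDivide_C(LONGLONGCH *a, int b)
+{
+	long long n = ((long long)a->hi32 << 32) | (unsigned int)a->lo32;
+
+	return (int)(n / b);
+}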
+
+/*
+
+ This function performs a Widening Multiply followed by a Narrowing Divide.
+
+ a = (a * b) / c
+
+*/
+
+static __inline__ int WideMulNarrowDiv(int a, int b, int c)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov eax,a
+ imul b
+ idiv c
+ mov retval,eax
+ }
+*/
+/* TODO */
+__asm__("imull %0 \n\t"
+ "idivl %1 \n\t"
+ : "=a" (retval)
+ : "a" (a), "q" (b), "q" (c)
+ : "cc"
+ );
+ return retval;
+}
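+
+/* Portable C sketch of the widening multiply followed by a narrowing divide
+   (illustrative only): */
+static __inline__ int WideMulNarrowDiv_C(int a, int b, int c)
+{
+	return (int)(((long long)a * (long long)b) / c);
+}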
+
+/*
+
+ Function to rotate a VECTORCH using a MATRIXCH
+
+ This is the C function
+
+ x = MUL_FIXED(m->mat11, v->vx);
+ x += MUL_FIXED(m->mat21, v->vy);
+ x += MUL_FIXED(m->mat31, v->vz);
+
+ y = MUL_FIXED(m->mat12, v->vx);
+ y += MUL_FIXED(m->mat22, v->vy);
+ y += MUL_FIXED(m->mat32, v->vz);
+
+ z = MUL_FIXED(m->mat13, v->vx);
+ z += MUL_FIXED(m->mat23, v->vy);
+ z += MUL_FIXED(m->mat33, v->vz);
+
+ v->vx = x;
+ v->vy = y;
+ v->vz = z;
+
+ This is the MUL_FIXED inline assembler function
+
+ imul edx
+ shrd eax,edx,16
+
+
+typedef struct matrixch {
+
+ int mat11; 0
+ int mat12; 4
+ int mat13; 8
+
+ int mat21; 12
+ int mat22; 16
+ int mat23; 20
+
+ int mat31; 24
+ int mat32; 28
+ int mat33; 32
+
+} MATRIXCH;
+
+*/
+
+#if 0 /* TODO if these are needed */
+static void RotateVector_ASM(VECTORCH *v, MATRIXCH *m)
+{
+ _asm
+ {
+ mov esi,v
+ mov edi,m
+
+ mov eax,[edi + 0]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ecx,eax
+ mov eax,[edi + 12]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ecx,eax
+ mov eax,[edi + 24]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ecx,eax
+
+ mov eax,[edi + 4]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ebx,eax
+ mov eax,[edi + 16]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ebx,eax
+ mov eax,[edi + 28]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ebx,eax
+
+ mov eax,[edi + 8]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ebp,eax
+ mov eax,[edi + 20]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ebp,eax
+ mov eax,[edi + 32]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ebp,eax
+
+ mov [esi + 0],ecx
+ mov [esi + 4],ebx
+ mov [esi + 8],ebp
+ }
+}
+
+/*
+
+ Here is the same function, this time copying the result to a second vector
+
+*/
+
+static void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m)
+{
+ _asm
+ {
+ mov esi,v1
+ mov edi,m
+
+ mov eax,[edi + 0]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ecx,eax
+ mov eax,[edi + 12]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ecx,eax
+ mov eax,[edi + 24]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ecx,eax
+
+ mov eax,[edi + 4]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ebx,eax
+ mov eax,[edi + 16]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ebx,eax
+ mov eax,[edi + 28]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ebx,eax
+
+ mov eax,[edi + 8]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ebp,eax
+ mov eax,[edi + 20]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ebp,eax
+ mov eax,[edi + 32]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ebp,eax
+
+ mov edx,v2
+ mov [edx + 0],ecx
+ mov [edx + 4],ebx
+ mov [edx + 8],ebp
+ }
+}
+#endif
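+
+/* A plain C sketch of the rotation described in the comment above, built on
+   MUL_FIXED and assuming the usual VECTORCH (vx/vy/vz) and MATRIXCH
+   (mat11..mat33) members; illustrative rather than part of the assembly
+   conversion: */
+static __inline__ void RotateVector_C(VECTORCH *v, MATRIXCH *m)
+{
+	int x, y, z;
+
+	x  = MUL_FIXED(m->mat11, v->vx);
+	x += MUL_FIXED(m->mat21, v->vy);
+	x += MUL_FIXED(m->mat31, v->vz);
+
+	y  = MUL_FIXED(m->mat12, v->vx);
+	y += MUL_FIXED(m->mat22, v->vy);
+	y += MUL_FIXED(m->mat32, v->vz);
+
+	z  = MUL_FIXED(m->mat13, v->vx);
+	z += MUL_FIXED(m->mat23, v->vy);
+	z += MUL_FIXED(m->mat33, v->vz);
+
+	v->vx = x;
+	v->vy = y;
+	v->vz = z;
+}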
+
+#if (SupportFPMathsFunctions || SupportFPSquareRoot)
+
+/*
+
+ Square Root
+
+ Returns the Square Root of a 32-bit number
+
+*/
+
+extern int sqrt_temp1;
+extern int sqrt_temp2;
+
+static __inline__ int SqRoot32(int A)
+{
+ sqrt_temp1 = A;
+/*
+ _asm
+ {
+ finit
+ fild A
+ fsqrt
+ fistp temp2
+ fwait
+ }
+*/
+
+__asm__("finit \n\t"
+ "fild sqrt_temp1 \n\t"
+ "fsqrt \n\t"
+ "fistp sqrt_temp2 \n\t"
+ "fwait \n\t"
+ :
+ :
+ : "memory", "cc"
+ );
+
+ return sqrt_temp2;
+}
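+
+/* A libm-based sketch of the same operation, left disabled so the header
+   does not pull in <math.h>; note that fistp rounds with the current FPU
+   mode, whereas the cast below truncates toward zero (illustrative only): */
+#if 0
+#include <math.h>
+static __inline__ int SqRoot32_C(int A)
+{
+	return (int)sqrt((double)A);
+}
+#endif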
+
+#endif
+
+
+/*
+
+ This may look ugly (it is) but it is a MUCH faster way to convert "float" into "int" than
+ the function call "CHP" used by the WATCOM compiler.
+
+*/
+
+extern float fti_fptmp;
+extern int fti_itmp;
+
+static __inline__ int FloatToInt(float fptmp)
+{
+ fti_fptmp = fptmp;
+/*
+ _asm
+ {
+ fld fptmp
+ fistp itmp
+ }
+*/
+__asm__("fld fti_fptmp \n\t"
+ "fistp fti_itmp \n\t"
+ :
+ :
+ : "memory", "cc"
+ );
+
+ return fti_itmp;
+}
+
+/*
+
+ This macro makes usage of the above function easier and more elegant
+
+*/
+
+#define f2i(a, b) { \
+a = FloatToInt(b); \
+}
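+
+/* Illustrative usage (sketch): because fistp honours the FPU rounding mode
+   (round to nearest by default), f2i rounds rather than truncating, e.g.
+
+	int i;
+	f2i(i, 3.75f);	i becomes 4, not 3
+*/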
#endif