path: root/src/win95/inline.h
author     Steven Fuller <relnev@icculus.org>    2001-07-29 18:25:45 +0000
committer  Patryk Obara <dreamer.tan@gmail.com>  2019-08-20 02:22:36 +0200
commit     5c497c61a656f338eb9599fb5f364ad853d0eccf (patch)
tree       125832fffe333da4ef51de914fd012de54569b4e /src/win95/inline.h
parent     44d4752e83d807cde6aff8260b5aa2acdf77778d (diff)
Implemented (most of) the inline assembly in inline.h
Diffstat (limited to 'src/win95/inline.h')
-rw-r--r--  src/win95/inline.h  720
1 file changed, 718 insertions, 2 deletions
diff --git a/src/win95/inline.h b/src/win95/inline.h
index f09e79a..6054f29 100644
--- a/src/win95/inline.h
+++ b/src/win95/inline.h
@@ -1215,9 +1215,9 @@ fptmp = (b); \
FloatToInt(); \
a = itmp;}
-#else /* other compiler ? */
+#else
-/* #error "Unknown compiler" */
+#if 0
void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a);
void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
@@ -1240,6 +1240,722 @@ void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m);
int FloatToInt(float);
#define f2i(a, b) { a = FloatToInt(b); }
+#endif
+
+/* ADD */
+
+static __inline__ void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov edi,b
+ mov ebx,c
+ mov eax,[esi]
+ mov edx,[esi+4]
+ add eax,[edi]
+ adc edx,[edi+4]
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl 0(%%edi), %%eax \n\t"
+ "adcl 4(%%edi), %%edx \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "S" (a), "D" (b), "b" (c)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+/*
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl 0(%%edi), %%eax \n\t"
+ "adcl 4(%%edi), %%edx \n\t"
+ : "=a" (c->lo32), "=d" (c->hi32)
+ : "S" (a), "D" (b)
+ );
+*/
+}
+
+/* ADD ++ */
+
+static __inline__ void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov edi,c
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ add [edi],eax
+ adc [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "addl %%eax, 0(%%edi) \n\t"
+ "adcl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (c), "S" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/* SUB */
+
+static __inline__ void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov edi,b
+ mov ebx,c
+ mov eax,[esi]
+ mov edx,[esi+4]
+ sub eax,[edi]
+ sbb edx,[edi+4]
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "subl 0(%%edi), %%eax \n\t"
+ "sbbl 4(%%edi), %%edx \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "S" (a), "D" (b), "b" (c)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/* SUB -- */
+
+static __inline__ void SUB_LL_MM(LONGLONGCH *c, LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov edi,c
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ sub [edi],eax
+ sbb [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "subl %%eax, 0(%%edi) \n\t"
+ "sbbl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (c), "S" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+}
+
+/*
+
+ MUL
+
+ This is the multiply we use, the 32 x 32 = 64 widening version
+
+*/
+
+static __inline__ void MUL_I_WIDE(int a, int b, LONGLONGCH *c)
+{
+/*
+ _asm
+ {
+ mov eax,a
+ mov ebx,c
+ imul b
+ mov [ebx],eax
+ mov [ebx+4],edx
+ }
+*/
+__asm__("imull %0 \n\t"
+ "movl %%eax, 0(%%ebx) \n\t"
+ "movl %%edx, 4(%%ebx) \n\t"
+ :
+ : "a" (a), "b" (c), "q" (b)
+ : "%edx", "memory", "cc"
+ );
+}
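+
+/* For reference, a portable C sketch of the same widening multiply,
+   assuming LONGLONGCH exposes the lo32/hi32 halves used in the
+   commented-out ADD_LL variant above (illustrative only): */
+static __inline__ void MUL_I_WIDE_C(int a, int b, LONGLONGCH *c)
+{
+	long long r = (long long)a * (long long)b;
+
+	c->lo32 = (unsigned int)r;	/* low 32 bits of the 64 bit product */
+	c->hi32 = (int)(r >> 32);	/* high 32 bits */
+}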
+
+/*
+
+ CMP
+
+ This substitutes for ==, >, <, >=, <=
+
+*/
+
+static __inline__ int CMP_LL(LONGLONGCH *a, LONGLONGCH *b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov ebx,a
+ mov ecx,b
+ mov eax,[ebx]
+ mov edx,[ebx+4]
+ sub eax,[ecx]
+ sbb edx,[ecx+4]
+ and edx,edx
+ jne llnz
+ and eax,eax
+ je llgs
+ llnz:
+ mov retval,1
+ and edx,edx
+ jge llgs
+ neg retval
+ llgs:
+ }
+*/
+/* TODO */
+__asm__("xorl %0, %0 \n\t"
+ "movl 0(%%ebx), %%eax \n\t"
+ "movl 4(%%ebx), %%edx \n\t"
+ "subl 0(%%ecx), %%eax \n\t"
+ "sbbl 4(%%ecx), %%edx \n\t"
+ "andl %%edx, %%edx \n\t"
+ "jne llnz \n\t"
+ "andl %%eax, %%eax \n\t"
+ "je llgs \n"
+"llnz: \n\t"
+ "movl $1, %0 \n\t"
+ "andl %%edx, %%edx \n\t"
+ "jge llgs \n\t"
+ "negl %0 \n"
+"llgs: \n\t"
+ : "=r" (retval)
+ : "b" (a), "c" (b)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+ return retval;
+}
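+
+/* Illustrative usage (sketch): CMP_LL returns 0 when *a == *b, 1 when
+   *a > *b and -1 when *a < *b, so the relational operators it replaces
+   become comparisons against 0, e.g.
+
+	if (CMP_LL(&remaining, &threshold) <= 0) { ... }     (remaining <= threshold)
+*/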
+
+/* EQUALS */
+
+static __inline__ void EQUALS_LL(LONGLONGCH *a, LONGLONGCH *b)
+{
+/*
+ _asm
+ {
+ mov edi,a
+ mov esi,b
+ mov eax,[esi]
+ mov edx,[esi+4]
+ mov [edi],eax
+ mov [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "movl %%eax, 0(%%edi) \n\t"
+ "movl %%edx, 4(%%edi) \n\t"
+ :
+ : "D" (a), "S" (b)
+ : "%eax", "%edx", "memory"
+ );
+}
+
+/* NEGATE */
+
+static __inline__ void NEG_LL(LONGLONGCH *a)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ not dword ptr[esi]
+ not dword ptr[esi+4]
+ add dword ptr[esi],1
+ adc dword ptr[esi+4],0
+ }
+*/
+__asm__("notl 0(%%esi) \n\t"
+ "notl 4(%%esi) \n\t"
+ "addl $1, 0(%%esi) \n\t"
+ "adcl $0, 4(%%esi) \n\t"
+ :
+ : "S" (a)
+ : "memory", "cc"
+ );
+}
+
+/* ASR */
+
+static __inline__ void ASR_LL(LONGLONGCH *a, int shift)
+{
+/*
+ _asm
+ {
+ mov esi,a
+ mov eax,shift
+ and eax,eax
+ jle asrdn
+ asrlp:
+ sar dword ptr[esi+4],1
+ rcr dword ptr[esi],1
+ dec eax
+ jne asrlp
+ asrdn:
+ }
+*/
+__asm__("andl %%eax, %%eax \n\t"
+ "jle asrdn \n"
+"asrlp: \n\t"
+ "sarl $1, 4(%%esi) \n\t"
+ "rcrl $1, 0(%%esi) \n\t"
+ "decl %%eax \n\t"
+ "jne asrlp \n"
+"asrdn: \n\t"
+ :
+ : "S" (a), "a" (shift)
+ : "memory", "cc"
+ );
+
+}
+
+/* Convert int to LONGLONGCH */
+
+static __inline__ void IntToLL(LONGLONGCH *a, int *b)
+{
+/*
+ _asm
+ {
+ mov esi,b
+ mov edi,a
+ mov eax,[esi]
+ cdq
+ mov [edi],eax
+ mov [edi+4],edx
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "cdq \n\t"
+ "movl %%eax, 0(%%edi) \n\t"
+ "movl %%edx, 4(%%edi) \n\t"
+ :
+ : "S" (b), "D" (a)
+ : "%eax", "%edx", "memory", "cc"
+ );
+
+}
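+
+/* Portable C sketch of the sign extension that cdq performs above, again
+   assuming the lo32/hi32 layout of LONGLONGCH (illustrative only): */
+static __inline__ void IntToLL_C(LONGLONGCH *a, int *b)
+{
+	a->lo32 = (unsigned int)*b;
+	a->hi32 = (*b < 0) ? -1 : 0;	/* replicate the sign bit, as cdq does */
+}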
+
+/*
+
+ Fixed Point Multiply.
+
+
+ 16.16 * 16.16 -> 16.16
+ or
+ 16.16 * 0.32 -> 0.32
+
+ A proper version of this function ought to read
+ 16.16 * 16.16 -> 32.16
+ but this would require a long long result
+
+ Algorithm:
+
+ Take the mid 32 bits of the 64 bit result
+
+*/
+
+/*
+ These functions have been checked for suitability for
+ a Pentium and look as if they would work adequately.
+ Might be worth a more detailed look at optimising
+ them though.
+*/
+
+static __inline__ int MUL_FIXED(int a, int b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov eax,a
+ imul b
+ shrd eax,edx,16
+ mov retval,eax
+ }
+*/
+/* TODO */
+__asm__("imull %0 \n\t"
+ "shrdl $16, %%edx, %%eax \n\t"
+ : "=a" (retval)
+ : "a" (a), "q" (b)
+ : "%edx", "cc"
+ );
+ return retval;
+}
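+
+/* Equivalent portable C sketch of the 16.16 multiply above, using a 64 bit
+   intermediate instead of the edx:eax pair (illustrative only): */
+static __inline__ int MUL_FIXED_C(int a, int b)
+{
+	/* keep bits 16..47 of the 64 bit product, as shrd does */
+	return (int)(((long long)a * (long long)b) >> 16);
+}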
+
+/*
+
+ Fixed Point Divide - returns a / b
+
+*/
+
+static __inline__ int DIV_FIXED(int a, int b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov eax,a
+ cdq
+ rol eax,16
+ mov dx,ax
+ xor ax,ax
+ idiv b
+ mov retval,eax
+ }
+*/
+/* TODO */
+__asm__("cdq \n\t"
+ "roll $16, %%eax \n\t"
+ "mov %%ax, %%dx \n\t"
+ "xor %%ax, %%ax \n\t"
+ "idivl %0 \n\t"
+ : "=a" (retval)
+ : "a" (a), "q" (b)
+ : "%edx", "cc"
+ );
+ return retval;
+}
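+
+/* Portable C sketch of the 16.16 divide above; like the assembly it will
+   fault on a zero divisor or an overflowing quotient (illustrative only): */
+static __inline__ int DIV_FIXED_C(int a, int b)
+{
+	return (int)(((long long)a << 16) / b);
+}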
+
+/*
+
+ Multiply and Divide Functions.
+
+*/
+
+
+/*
+
+ 32/32 division
+
+ This macro is a function on some other platforms
+
+*/
+
+#define DIV_INT(a, b) ((a) / (b))
+
+/*
+
+ A Narrowing 64/32 Division
+
+*/
+
+static __inline__ int NarrowDivide(LONGLONGCH *a, int b)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov esi,a
+ mov eax,[esi]
+ mov edx,[esi+4]
+ idiv b
+ mov retval,eax
+ }
+*/
+__asm__("movl 0(%%esi), %%eax \n\t"
+ "movl 4(%%esi), %%edx \n\t"
+ "idivl %0 \n\t"
+ : "=a" (retval)
+ : "S" (a), "q" (b)
+ : "%edx", "cc"
+ );
+ return retval;
+}
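+
+/* Portable C sketch of the narrowing 64/32 divide above, assuming the
+   lo32/hi32 layout of LONGLONGCH (illustrative only): */
+static __inline__ int NarrowDivide_C(LONGLONGCH *a, int b)
+{
+	long long n = ((long long)a->hi32 << 32) | (unsigned int)a->lo32;
+
+	return (int)(n / b);
+}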
+
+/*
+
+ This function performs a Widening Multiply followed by a Narrowing Divide.
+
+ a = (a * b) / c
+
+*/
+
+static __inline__ int WideMulNarrowDiv(int a, int b, int c)
+{
+ int retval;
+/*
+ _asm
+ {
+ mov eax,a
+ imul b
+ idiv c
+ mov retval,eax
+ }
+*/
+/* TODO */
+__asm__("imull %0 \n\t"
+ "idivl %1 \n\t"
+ : "=a" (retval)
+ : "a" (a), "q" (b), "q" (c)
+ : "cc"
+ );
+ return retval;
+}
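+
+/* Portable C sketch of the widening multiply followed by a narrowing divide
+   (illustrative only): */
+static __inline__ int WideMulNarrowDiv_C(int a, int b, int c)
+{
+	return (int)(((long long)a * (long long)b) / c);
+}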
+
+/*
+
+ Function to rotate a VECTORCH using a MATRIXCH
+
+ This is the C function
+
+ x = MUL_FIXED(m->mat11, v->vx);
+ x += MUL_FIXED(m->mat21, v->vy);
+ x += MUL_FIXED(m->mat31, v->vz);
+
+ y = MUL_FIXED(m->mat12, v->vx);
+ y += MUL_FIXED(m->mat22, v->vy);
+ y += MUL_FIXED(m->mat32, v->vz);
+
+ z = MUL_FIXED(m->mat13, v->vx);
+ z += MUL_FIXED(m->mat23, v->vy);
+ z += MUL_FIXED(m->mat33, v->vz);
+
+ v->vx = x;
+ v->vy = y;
+ v->vz = z;
+
+ This is the MUL_FIXED inline assembler function
+
+ imul edx
+ shrd eax,edx,16
+
+
+typedef struct matrixch {
+
+ int mat11; 0
+ int mat12; 4
+ int mat13; 8
+
+ int mat21; 12
+ int mat22; 16
+ int mat23; 20
+
+ int mat31; 24
+ int mat32; 28
+ int mat33; 32
+
+} MATRIXCH;
+
+*/
+
+#if 0 /* TODO if these are needed */
+static void RotateVector_ASM(VECTORCH *v, MATRIXCH *m)
+{
+ _asm
+ {
+ mov esi,v
+ mov edi,m
+
+ mov eax,[edi + 0]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ecx,eax
+ mov eax,[edi + 12]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ecx,eax
+ mov eax,[edi + 24]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ecx,eax
+
+ mov eax,[edi + 4]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ebx,eax
+ mov eax,[edi + 16]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ebx,eax
+ mov eax,[edi + 28]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ebx,eax
+
+ mov eax,[edi + 8]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ebp,eax
+ mov eax,[edi + 20]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ebp,eax
+ mov eax,[edi + 32]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ebp,eax
+
+ mov [esi + 0],ecx
+ mov [esi + 4],ebx
+ mov [esi + 8],ebp
+ }
+}
+
+/*
+
+ Here is the same function, this time copying the result to a second vector
+
+*/
+
+static void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m)
+{
+ _asm
+ {
+ mov esi,v1
+ mov edi,m
+
+ mov eax,[edi + 0]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ecx,eax
+ mov eax,[edi + 12]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ecx,eax
+ mov eax,[edi + 24]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ecx,eax
+
+ mov eax,[edi + 4]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ebx,eax
+ mov eax,[edi + 16]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ebx,eax
+ mov eax,[edi + 28]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ebx,eax
+
+ mov eax,[edi + 8]
+ imul DWORD PTR [esi + 0]
+ shrd eax,edx,16
+ mov ebp,eax
+ mov eax,[edi + 20]
+ imul DWORD PTR [esi + 4]
+ shrd eax,edx,16
+ add ebp,eax
+ mov eax,[edi + 32]
+ imul DWORD PTR [esi + 8]
+ shrd eax,edx,16
+ add ebp,eax
+
+ mov edx,v2
+ mov [edx + 0],ecx
+ mov [edx + 4],ebx
+ mov [edx + 8],ebp
+ }
+}
+#endif
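+
+/* A plain C sketch of the rotation described in the comment above, built on
+   MUL_FIXED and assuming the usual VECTORCH (vx/vy/vz) and MATRIXCH
+   (mat11..mat33) members; illustrative rather than part of the assembly
+   conversion: */
+static __inline__ void RotateVector_C(VECTORCH *v, MATRIXCH *m)
+{
+	int x, y, z;
+
+	x  = MUL_FIXED(m->mat11, v->vx);
+	x += MUL_FIXED(m->mat21, v->vy);
+	x += MUL_FIXED(m->mat31, v->vz);
+
+	y  = MUL_FIXED(m->mat12, v->vx);
+	y += MUL_FIXED(m->mat22, v->vy);
+	y += MUL_FIXED(m->mat32, v->vz);
+
+	z  = MUL_FIXED(m->mat13, v->vx);
+	z += MUL_FIXED(m->mat23, v->vy);
+	z += MUL_FIXED(m->mat33, v->vz);
+
+	v->vx = x;
+	v->vy = y;
+	v->vz = z;
+}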
+
+#if (SupportFPMathsFunctions || SupportFPSquareRoot)
+
+/*
+
+ Square Root
+
+ Returns the Square Root of a 32-bit number
+
+*/
+
+extern int sqrt_temp1;
+extern int sqrt_temp2;
+
+static __inline__ int SqRoot32(int A)
+{
+ sqrt_temp1 = A;
+/*
+ _asm
+ {
+ finit
+ fild A
+ fsqrt
+ fistp temp2
+ fwait
+ }
+*/
+
+__asm__("finit \n\t"
+ "fild sqrt_temp1 \n\t"
+ "fsqrt \n\t"
+ "fistp sqrt_temp2 \n\t"
+ "fwait \n\t"
+ :
+ :
+ : "memory", "cc"
+ );
+
+ return sqrt_temp2;
+}
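+
+/* A libm-based sketch of the same operation, left disabled so the header
+   does not pull in <math.h>; note that fistp rounds with the current FPU
+   mode, whereas the cast below truncates toward zero (illustrative only): */
+#if 0
+#include <math.h>
+static __inline__ int SqRoot32_C(int A)
+{
+	return (int)sqrt((double)A);
+}
+#endif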
+
+#endif
+
+
+/*
+
+ This may look ugly (it is) but it is a MUCH faster way to convert "float" into "int" than
+ the function call "CHP" used by the WATCOM compiler.
+
+*/
+
+extern float fti_fptmp;
+extern int fti_itmp;
+
+static __inline__ int FloatToInt(float fptmp)
+{
+ fti_fptmp = fptmp;
+/*
+ _asm
+ {
+ fld fptmp
+ fistp itmp
+ }
+*/
+__asm__("fld fti_fptmp \n\t"
+ "fistp fti_itmp \n\t"
+ :
+ :
+ : "memory", "cc"
+ );
+
+ return fti_itmp;
+}
+
+/*
+
+ This macro makes usage of the above function easier and more elegant
+
+*/
+
+#define f2i(a, b) { \
+a = FloatToInt(b); \
+}
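+
+/* Illustrative usage (sketch): because fistp honours the FPU rounding mode
+   (round to nearest by default), f2i rounds rather than truncating, e.g.
+
+	int i;
+	f2i(i, 3.75f);	i becomes 4, not 3
+*/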
#endif