diff options
| author | Steven Fuller <relnev@icculus.org> | 2001-07-01 00:55:22 +0000 |
|---|---|---|
| committer | Patryk Obara <dreamer.tan@gmail.com> | 2019-08-20 02:09:04 +0200 |
| commit | 2186d5f3f95cd74a070a490d899291648d58667a (patch) | |
| tree | 55241a1afa3e1a22e0b6593a8dead0b703800f44 /src/win95/mmx_math.h | |
| parent | 218ca90543758a20ac326e444ca0643174ca7384 (diff) | |
Initial revision
Diffstat (limited to 'src/win95/mmx_math.h')
| -rw-r--r-- | src/win95/mmx_math.h | 469 |
1 files changed, 469 insertions, 0 deletions
diff --git a/src/win95/mmx_math.h b/src/win95/mmx_math.h new file mode 100644 index 0000000..594ac0e --- /dev/null +++ b/src/win95/mmx_math.h @@ -0,0 +1,469 @@ +#ifndef _included_mmx_math_h_ +#define _included_mmx_math_h_ + +#if SUPPORT_MMX + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* +Calling-convention independent +definitions of inline MMX assembler +functions and declarations for non- +inline MMX assembler functions +*/ + +/* SPECIFICATION */ +/* +Dot Product and Vector Transform functions take +arguments referencing matrices or vectors whose +elements are 32 bit signed integers and arranged as +follows. All integers (including the results) are +in 16.16 fixed point form - ie. The 64-bit results +are shifted down 16 bits (divided by 65536) before +being written back as 32-bit values. Results are +rounded down (towards negative infinity). + +the matrix structure looks like this (not ideal!) +[ +00 +0c +18 ] +[ +04 +10 +1c ] +[ +08 +14 +20 ] + +and the vector structure looks like this +[ +00 ] +[ +04 ] +[ +08 ] +*/ + +/* TYPICAL CHARACTERISTICS */ +/* +Accuracy + +Internal rounding errors may be propogated, and +the results may not be exact. For the Dot Product +result and the Vector Transform results (x,y and z +independently), the error distributions are all +the same, as follows: + +Exact: 25% +-1: 50% +-2: 25% + +Better accuracy can be obtained by adding 1 to each integer result, +but this will produce poor results in the case of nice simple round +numbers, eg Dot({1.0,0.0,0.0},{0.0,1.0,0.0}) gives 1 not 0! + +Speed + +The DotProduct Takes 33 cycles (not including call instruction) +The inline DotProduct takes 30+1 cycles (the last instruction is pairable) +All Vector transforms take 63 cycles. These figures assume no +stalls due to cache misses or misaligned data. A matrix multiply +or cross product could be supplied if it is thought they would +be necessary + + +For optimal performance, it is recommended that vector and +matrix structures should be aligned to EIGHT byte boundaries. +To ensure this in arrays of vectors/matrices, the structure +should contain a dummy padding 32-bit value (recommended). +*/ + +/* storage class specifier for assembler calls */ + +#ifdef __WATCOMC__ +#define _asmcall +#define _asminline +#elif defined(_MSC_VER) +#define _asmcall static __inline +#define _asminline static __inline +#else +#error "Unknown compiler" +#endif + +/* forward reference declared in global scope */ +struct vectorch; +struct matrixch; + +/***********************/ +/* F-U-N-C-T-I-O-N */ +/* P-R-O-T-O-T-Y-P-E-S */ +/* F-O-R A-L-L */ +/* P-U-B-L-I-C */ +/* F-U-N-C-T-I-O-N-S */ +/***********************/ + +/* overwrites the input vector with the new vector */ +_asmcall void MMX_VectorTransform(struct vectorch * vector, struct matrixch const * matrix); +/* fills a new vector with the result of the input vector transformed by the matrix */ +_asmcall void MMX_VectorTransformed(struct vectorch * v_result, struct vectorch const * v_parm, struct matrixch const * matrix); +/* overwrites the input vector with the new vector, then adds another vector */ +_asmcall void MMX_VectorTransformAndAdd(struct vectorch * vector, struct matrixch const * matrix, struct vectorch const * v_add); +/* fills a new vector with the result of the input vector transformed by the matrix then added to another vector */ +_asmcall void MMX_VectorTransformedAndAdd(struct vectorch * v_result, struct vectorch const * v_parm, struct matrixch const * matrix, struct vectorch const * v_add); +/* compute dot product */ +_asmcall signed MMX_VectorDot(struct vectorch const * v1, struct vectorch const * v2); +/* this one assumes all the input vector elements are in the range [-32768,32767] */ +_asmcall signed MMX_VectorDot16(struct vectorch const * v1, struct vectorch const * v2); + +/* inline versions */ +_asminline signed MMXInline_VectorDot(struct vectorch const * v1, struct vectorch const * v2); +_asminline signed MMXInline_VectorDot16(struct vectorch const * v1, struct vectorch const * v2); + +/*****************/ +/* PRIVATE PARTS */ +/*****************/ + +/* Assembler labels */ +extern void MMXAsm_VectorTransform(void); +extern void MMXAsm_VectorTransformed(void); +extern void MMXAsm_VectorTransformAndAdd(void); +extern void MMXAsm_VectorTransformedAndAdd(void); +extern void MMXAsm_VectorDot(void); +extern void MMXAsm_VectorDot16(void); + +/* inline calls to MMX functions with correct parameters set */ +#ifdef __WATCOMC__ + +#pragma aux MMX_VectorTransform = "call MMXAsm_VectorTransform" parm [eax] [edx]; +#pragma aux MMX_VectorTransformed = "call MMXAsm_VectorTransformed" parm [eax] [edx] [ecx]; +#pragma aux MMX_VectorTransformAndAdd = "call MMXAsm_VectorTransformAndAdd" parm [eax] [edx] [ecx]; +#pragma aux MMX_VectorTransformedAndAdd = "call MMXAsm_VectorTransformedAndAdd" parm [eax] [edx] [ecx] [ebx]; +#pragma aux MMX_VectorDot = "call MMXAsm_VectorDot" parm [eax] [edx] value [eax]; +#pragma aux MMX_VectorDot16 = "call MMXAsm_VectorDot16" parm [eax] [edx] value [eax]; + +#elif defined(_MSC_VER) + +_asmcall void MMX_VectorTransform(struct vectorch * vector, struct matrixch const * matrix) +{ + _asm + { + mov eax,vector + mov edx,matrix + call MMXAsm_VectorTransform + } +} +_asmcall void MMX_VectorTransformed(struct vectorch * v_result, struct vectorch const * v_parm, struct matrixch const * matrix) +{ + _asm + { + mov eax,v_result + mov edx,v_parm + mov ecx,matrix + call MMXAsm_VectorTransformed + } +} +_asmcall void MMX_VectorTransformAndAdd(struct vectorch * vector, struct matrixch const * matrix, struct vectorch const * v_add) +{ + _asm + { + mov eax,vector + mov edx,matrix + mov ecx,v_add + call MMXAsm_VectorTransformAndAdd + } +} +_asmcall void MMX_VectorTransformedAndAdd(struct vectorch * v_result, struct vectorch const * v_parm, struct matrixch const * matrix, struct vectorch const * v_add) +{ + _asm + { + mov eax,v_result + mov edx,v_parm + mov ecx,matrix + mov ebx,v_add + call MMXAsm_VectorTransformedAndAdd + } +} +_asmcall signed MMX_VectorDot(struct vectorch const * v1, struct vectorch const * v2) +{ + signed retval; + _asm + { + mov eax,v1 + mov edx,v2 + call MMXAsm_VectorDot + mov retval,eax + } + return retval; +} +_asmcall signed MMX_VectorDot16(struct vectorch const * v1, struct vectorch const * v2) +{ + signed retval; + _asm + { + mov eax,v1 + mov edx,v2 + call MMXAsm_VectorDot16 + mov retval,eax + } + return retval; +} + +#else + +#error "Unknown compiler" + +#endif + + +/* Cross product? Mod? MatrixMultiply? */ + +/* globals */ + +extern int use_mmx_math; + +/* inline functions - no call */ + +extern __int64 const mmx_sign_mask; +extern __int64 const mmx_one_fixed_h; + +#ifdef __WATCOMC__ + +#pragma aux MMXInline_VectorDot = \ +\ +" movq mm0,[edx]" \ +\ +" movd mm2,[edx+08h]" \ +" movq mm4,mm0" \ +\ +" pand mm4,mmx_sign_mask" \ +" movq mm6,mm2" \ +\ +" movq mm1,[eax]" \ +" paddd mm4,mm4" \ +\ +" movd mm3,[eax+08h]" \ +" movq mm5,mm1" \ +\ +" pand mm6,mmx_sign_mask" \ +" movq mm7,mm3" \ +\ +" pand mm5,mmx_sign_mask" \ +" paddd mm6,mm6" \ +\ +" pand mm7,mmx_sign_mask" \ +" paddd mm5,mm5" \ +\ +" paddd mm0,mm4" \ +" paddd mm2,mm6" \ +\ +" paddd mm7,mm7" \ +" movq mm4,mm2" \ +\ +" punpcklwd mm4,mm0" \ +" paddd mm1,mm5" \ +\ +" punpckhwd mm2,mm0" \ +" paddd mm3,mm7" \ +\ +" movq mm5,mm3" \ +" punpckhwd mm3,mm1" \ +\ +" punpcklwd mm5,mm1" \ +" movq mm0,mm2" \ +\ +" movq mm1,mm4" \ +" pmaddwd mm0,mm3" \ +\ +" movq mm6,mm3" \ +" psrlq mm3,32" \ +\ +" movq mm7,mm5" \ +" punpckldq mm3,mm6" \ +\ +" pmaddwd mm1,mm5" \ +" psrlq mm5,32" \ +\ +" punpckldq mm5,mm7" \ +" pmaddwd mm2,mm3" \ +\ +" pmaddwd mm4,mm5" \ +" movq mm3,mm0" \ +\ +" punpckldq mm0,mm1" \ +\ +" psubd mm0,mmx_one_fixed_h" \ +" punpckhdq mm1,mm3" \ +\ +" psrad mm0,16" \ +" paddd mm2,mm4" \ +\ +" pslld mm1,16" \ +" paddd mm2,mm0" \ +\ +" paddd mm2,mm1" \ +\ +" movq mm1,mm2" \ +" psrlq mm2,32" \ +\ +" paddd mm1,mm2" \ +\ +" movd eax,mm1" \ +\ +" emms" \ +\ +" inc eax" \ +\ +parm [eax] [edx] value [eax]; + +#pragma aux MMXInline_VectorDot16 = \ +\ +" movd mm0,[edx+08h]" \ +\ +" packssdw mm0,[edx]" \ +\ +" movd mm1,[eax+08h]" \ +\ +" packssdw mm1,[eax]" \ +\ +" pmaddwd mm0,mm1" \ +\ +" movq mm1,mm0" \ +" psrlq mm0,32" \ +\ +" paddd mm0,mm1" \ +\ +" movd eax,mm0" \ +\ +" emms" \ +\ +parm [eax] [edx] value [eax]; + +#elif defined(_MSC_VER) + +_asminline signed MMXInline_VectorDot(struct vectorch const * v1, struct vectorch const * v2) +{ + signed retval; + _asm + { + mov edx,v1 + mov eax,v2 + + movq mm0,[edx] + + movd mm2,[edx+08h] + movq mm4,mm0 + + pand mm4,mmx_sign_mask + movq mm6,mm2 + + movq mm1,[eax] + paddd mm4,mm4 + + movd mm3,[eax+08h] + movq mm5,mm1 + + pand mm6,mmx_sign_mask + movq mm7,mm3 + + pand mm5,mmx_sign_mask + paddd mm6,mm6 + + pand mm7,mmx_sign_mask + paddd mm5,mm5 + + paddd mm0,mm4 + paddd mm2,mm6 + + paddd mm7,mm7 + movq mm4,mm2 + + punpcklwd mm4,mm0 + paddd mm1,mm5 + + punpckhwd mm2,mm0 + paddd mm3,mm7 + + movq mm5,mm3 + punpckhwd mm3,mm1 + + punpcklwd mm5,mm1 + movq mm0,mm2 + + movq mm1,mm4 + pmaddwd mm0,mm3 + + movq mm6,mm3 + psrlq mm3,32 + + movq mm7,mm5 + punpckldq mm3,mm6 + + pmaddwd mm1,mm5 + psrlq mm5,32 + + punpckldq mm5,mm7 + pmaddwd mm2,mm3 + + pmaddwd mm4,mm5 + movq mm3,mm0 + + punpckldq mm0,mm1 + + psubd mm0,mmx_one_fixed_h + punpckhdq mm1,mm3 + + psrad mm0,16 + paddd mm2,mm4 + + pslld mm1,16 + paddd mm2,mm0 + + paddd mm2,mm1 + + movq mm1,mm2 + psrlq mm2,32 + + paddd mm1,mm2 + + movd retval,mm1 + + emms + } + return retval+1; +} + +_asminline signed MMXInline_VectorDot16(struct vectorch const * v1, struct vectorch const * v2) +{ + signed retval; + _asm + { + mov eax,v1 + mov edx,v2 + + movd mm0,[edx+08h] + + packssdw mm0,[edx] + + movd mm1,[eax+08h] + + packssdw mm1,[eax] + + pmaddwd mm0,mm1 + + movq mm1,mm0 + psrlq mm0,32 + + paddd mm0,mm1 + + movd retval,mm0 + + emms + } + return retval; +} + +#else + +#error "Unknown compiler" + +#endif + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* SUPPORT_MMX */ + +#endif /* ! _included_mmx_math_h_ */ |
