diff options
| author | Rebellion Developments <rebellion@nomail> | 2000-03-16 11:25:00 +0100 |
|---|---|---|
| committer | Patryk Obara <dreamer.tan@gmail.com> | 2019-08-19 05:45:17 +0200 |
| commit | 218ca90543758a20ac326e444ca0643174ca7384 (patch) | |
| tree | 16bfe3e5307f9f515489000f28728224291a0e3b /3dc/win95/Mmx_math.asm | |
Import Aliens vs Predator - Gold (Build 116)
Source code release, imported from:
https://www.gamefront.com/games/aliens-vs-predator-3/file/avp-gold-complete-source-code
All text files were converted to Unix format.
Diffstat (limited to '3dc/win95/Mmx_math.asm')
| -rw-r--r-- | 3dc/win95/Mmx_math.asm | 1272 |
1 files changed, 1272 insertions, 0 deletions
diff --git a/3dc/win95/Mmx_math.asm b/3dc/win95/Mmx_math.asm new file mode 100644 index 0000000..ce5cb46 --- /dev/null +++ b/3dc/win95/Mmx_math.asm @@ -0,0 +1,1272 @@ +; want 8-byte alignment really!! +_DATA SEGMENT DWORD PUBLIC 'DATA' + + + PUBLIC _use_mmx_math + PUBLIC _mmx_sign_mask + PUBLIC _mmx_one_fixed_h + + align + _mmx_sign_mask:QWORD 0000800000008000h + _mmx_one_fixed_h:QWORD 0001000000000000h + _mmx_one_fixed_hl:QWORD 0001000000010000h + _mmx_one_hl:QWORD 0000000100000001h + store1:QWORD ? + _use_mmx_math:DWORD 1 + + + +_DATA ENDS + + + +; want 16-byte alignment really!! +_TEXT SEGMENT DWORD PUBLIC 'CODE' + ASSUME cs:_TEXT, ds:_DATA + +.586 + + PUBLIC MMXAsm_VectorDot_ + PUBLIC MMXAsm_VectorDot16_ + PUBLIC MMXAsm_VectorTransformed_ + PUBLIC MMXAsm_VectorTransform_ + PUBLIC MMXAsm_VectorTransformedAndAdd_ + PUBLIC MMXAsm_VectorTransformAndAdd_ + + PUBLIC _MMXAsm_VectorDot + PUBLIC _MMXAsm_VectorDot16 + PUBLIC _MMXAsm_VectorTransformed + PUBLIC _MMXAsm_VectorTransform + PUBLIC _MMXAsm_VectorTransformedAndAdd + PUBLIC _MMXAsm_VectorTransformAndAdd + + align +_MMXAsm_VectorDot: +MMXAsm_VectorDot_: + +if 0 + ; This is the unoptimized version + + ; get the data + movq mm0,[edx] + movq mm1,[eax] + movd mm2,[edx+08h] + movd mm3,[eax+08h] + + + ; get it into signed fixed format + movq mm4,mm0 + movq mm5,mm1 + movq mm6,mm2 + movq mm7,mm3 + + pand mm4,_mmx_sign_mask + pand mm5,_mmx_sign_mask + pand mm6,_mmx_sign_mask + pand mm7,_mmx_sign_mask + + paddd mm4,mm4 + paddd mm5,mm5 + paddd mm6,mm6 + paddd mm7,mm7 + + paddd mm0,mm4 + paddd mm1,mm5 + paddd mm2,mm6 + paddd mm3,mm7 + + ; at this point we have split all 32 bit values + ; into 16-bit pairs, high and low, both signed + + ; mm0: y1h y1l x1h x1l + ; mm1: y2h y2l x2h x2l + ; mm2: 0 0 z1h z1l + ; mm3: 0 0 z2h z2l + + ; swap 1st and 2nd words in mm0,mm1,mm2,mm3 ?? + movq mm4,mm2 + movq mm5,mm3 + punpcklwd mm4,mm0 + ; mm4: x1h z1h x1l z1l + punpcklwd mm5,mm1 + ; mm5: x2h z2h x2l z2l + punpckhwd mm2,mm0 + ; mm2: y1h 0 y1l 0 + punpckhwd mm3,mm1 + ; mm3: y2h 0 y2l 0 + + ; get the high and low products: x1h*x2h, x1l*x2l, etc + movq mm0,mm2 + pmaddwd mm0,mm3 + ; mm0: y1h*y2h y1l*y2l + movq mm1,mm4 + pmaddwd mm1,mm5 + ; mm1: x1h*x2h+z1h*z2h x1l*x2l+z1l*z2l + + ; exchange dwords in mm3 and mm5 + movq mm6,mm3 + movq mm7,mm5 + psrlq mm3,32 + psrlq mm5,32 + punpckldq mm3,mm6 + punpckldq mm5,mm7 + ; mm5: x2l z2l x2h z2h + ; mm3: y2l 0 y2h 0 + + ; compute the products x1h*x2l, x1l*x2h, etc + pmaddwd mm2,mm3 + ; mm2: y1h*y2l y1l*y2h + pmaddwd mm4,mm5 + ; mm4: x1h*x2l+z1h*z2l x1l*x2h+z1l*z2h + + paddd mm2,mm4 + ; mm2: x1h*x2l+y1h*y2l+z1h*z2l x1l*x2h+y1l*y2h+z1l*z2h + + ; get the low order dwords of mm0,mm1 + movq mm3,mm0 + punpckldq mm0,mm1 + ; mm0: x1l*x2l+z1l*z2l y1l*y2l + + ; unfortunately, at this point it is possible to have the + ; wrong value in mm0: if x1l,x2l,x1l,x2l + ; are all -0x8000, the result should + ; be +0x80000000, but of course this becomes + ; -0x80000000 + ; in fact the largest +ve value we could have is + ; +0x80000000 + ; and the lowest -ve value we could have is + ; -0x7fff0000 + ; = 0x80010000 + ; so subtracting ONE at this stage gives us a value + ; which is out by ONE, but twos-complement correct + psubd mm0,_mmx_one_fixed_h + + ; and the high order dwords + punpckhdq mm1,mm3 + ; mm1: x1h*x2h+z1h*z2h y1h*y2h + ; in fact it is swapped, but it doesn't matter + + ; shift the low order dwords down + psrad mm0,16 + ; and the high order dwords up + pslld mm1,16 + ; mm0: x1l*x2l+z1l*z2l>>16 -1 y1l*y2l>>16 + ; mm1: x1h*x2h+z1h*z2h<<16 y1h*y2h<<16 + ;(mm2) x1h*x2l+y1h*y2l+z1h*z2l x1l*x2h+y1l*y2h+z1l*z2h + + ; sum up + paddd mm2,mm0 + paddd mm2,mm1 + movq mm1,mm2 + psrlq mm2,32 + paddd mm1,mm2 + movd eax,mm1 + + emms + inc eax + ret + +else + ; + ; Now the optimized version + + movq mm0,[edx] + + movd mm2,[edx+08h] + movq mm4,mm0 + + + pand mm4,_mmx_sign_mask + movq mm6,mm2 + + movq mm1,[eax] + paddd mm4,mm4 + + movd mm3,[eax+08h] + movq mm5,mm1 + + pand mm6,_mmx_sign_mask + movq mm7,mm3 + + pand mm5,_mmx_sign_mask + paddd mm6,mm6 + + pand mm7,_mmx_sign_mask + paddd mm5,mm5 + + paddd mm0,mm4 + paddd mm2,mm6 + + paddd mm7,mm7 + movq mm4,mm2 + + punpcklwd mm4,mm0 + paddd mm1,mm5 + + punpckhwd mm2,mm0 + paddd mm3,mm7 + + movq mm5,mm3 + punpckhwd mm3,mm1 + + punpcklwd mm5,mm1 + movq mm0,mm2 + + movq mm1,mm4 + pmaddwd mm0,mm3 + + movq mm6,mm3 + psrlq mm3,32 + + movq mm7,mm5 + punpckldq mm3,mm6 + + pmaddwd mm1,mm5 + psrlq mm5,32 + + punpckldq mm5,mm7 + pmaddwd mm2,mm3 + + pmaddwd mm4,mm5 + movq mm3,mm0 + + ; these instructions won't pair and I have no instructions I can pair them with + punpckldq mm0,mm1 + + psubd mm0,_mmx_one_fixed_h + punpckhdq mm1,mm3 + + psrad mm0,16 + paddd mm2,mm4 + + pslld mm1,16 + paddd mm2,mm0 + + ; complete pairing is not possible at this stage - there are too many dependencies + paddd mm2,mm1 + + movq mm1,mm2 + psrlq mm2,32 + + paddd mm1,mm2 + + movd eax,mm1 + + emms + + inc eax + ret + +endif + + ; This takes 33 cycles, the orignal C -> nonMMX version takes 80 cycles + + align +_MMXAsm_VectorDot16: +MMXAsm_VectorDot16_: + + movd mm0,[edx+08h] + + packssdw mm0,[edx] + + movd mm1,[eax+08h] + + packssdw mm1,[eax] + + pmaddwd mm0,mm1 + + movq mm1,mm0 + psrlq mm0,32 + + paddd mm0,mm1 + + movd eax,mm0 + + emms + + ret + ; taking 14 cycles but assuming 16bit input vector fields + + + align +_MMXAsm_VectorTransformed: +MMXAsm_VectorTransformed_: + +if 0 + ; eax ptr to result + ; edx ptr to vector xh, xl, yh, yl, zh, zl + ; ecx ptr to matrix a11h, a11l, a12h, etc + + ; unoptimized version + + ; NOTE: in the Dot Product there was a problem + ; of an internal overflow where -32768*-32768 + -32768*-32768 gave 0x80000000 + ; which is -ve in two's complement + ; the additions and subtractions of ONE to resolve this problem + ; are marked '******' + + movq mm0,[edx] + movq mm1,mm0 + pand mm1,_mmx_sign_mask + paddd mm1,mm1 + paddd mm0,mm1 + ; mm0: yh yl xh xl + + movq mm2,[ecx] + movq mm3,mm2 + pand mm3,_mmx_sign_mask + paddd mm3,mm3 + paddd mm2,mm3 + ; mm2: a21h a21l a11h a11l + + movd mm4,[edx+08h] + movq mm5,mm4 + pand mm5,_mmx_sign_mask + paddd mm5,mm5 + paddd mm4,mm5 + ; mm4: 0 0 zh zl + + movq mm6,[ecx+18h] + movq mm7,mm6 + pand mm7,_mmx_sign_mask + paddd mm7,mm7 + paddd mm6,mm7 + ; mm6: a23h a23l a13h a13l + + ; interleave + + movq mm1,mm0 + punpckhwd mm0,mm4 + ; mm0: 0 yh 0 yl + punpcklwd mm1,mm4 + ; mm1: zh xh zl xl + + movq mm3,mm2 + punpckhwd mm2,mm6 + ; mm2: a23h a21h a23l a21l + punpcklwd mm3,mm6 + ; mm3: a13h a11h a13l a11l + + ; get a13*z, a11*x; a23*z a21*x, high and low products + movq mm4,mm1 + pmaddwd mm1,mm2 + movq mm6,mm4 + pmaddwd mm4,mm3 + ; mm0: 0 yh 0 yl + ; mm6: zh xh zl xl + ; mm2: a23h a21h a23l a21l + ; mm3: a13h a11h a13l a11l + ; mm1: zh*a23h+xh*a21h zl*a23l+xl*a21l + ; mm4: zh*a13h+xh*a11h zl*a13l+xl*a11l + + ; exchange dwords in mm6 + movq mm7,mm6 + psrlq mm6,32 + punpckldq mm6,mm7 + ; mm6: zl xl zh xh + ; mm7: zh xh zl xl + + ; get the high-low 'cross' products + pmaddwd mm2,mm6 + pmaddwd mm3,mm6 + ; mm2: a23h*zl+a21h*xl a23l*zh+a21l*xh + ; mm3: a13h*zl+a11h*xl a13l*zh+a11l*xh + + ; interleave mm1,mm4 and mm2,mm3 + movq mm5,mm4 + punpckldq mm4,mm1 + punpckhdq mm5,mm1 + ; mm4: zl*a23l+xl*a21l zl*a13l+xl*a11l ****** + ; mm5: zh*a23h+xh*a21h zh*a13h+xh*a11h + + ; ****** + psubd mm4,_mmx_one_fixed_hl + + + movq mm1,mm3 + punpckldq mm3,mm2 + punpckhdq mm1,mm2 + ; mm1: zl*a23h+xl*a21h zl*a13h+xl*a11h + ; mm3: zh*a23l+xh*a21l zh*a13l+xh*a11l + ; sum + paddd mm1,mm3 + ; shift the low order dwords down + psrad mm4,16 + ; and the high order dwords up + pslld mm5,16 + ; sum + paddd mm1,mm4 + paddd mm1,mm5 + ; mm1 holding x and y of the result + ; mm0: 0 yh 0 yl + ; mm1: z*a23+x*a21 z*a13+x*a11 + ; mm2: + ; mm3: + ; mm4: + ; mm5: + ; mm6: zl xl zh xh + ; mm7: zh xh zl xl + + ; grab some more of the matrix + movq mm2,[ecx+08h] + movq mm3,mm2 + pand mm3,_mmx_sign_mask + paddd mm3,mm3 + paddd mm2,mm3 ; mm7 not mm2 in optimized version + ; mm2: a12h a12l a31h a31l + + movd mm4,[ecx+20h] + movq mm5,mm4 + pand mm5,_mmx_sign_mask + paddd mm5,mm5 + paddd mm4,mm5 + ; mm4: 0 0 a33h a33l + + ; interleave + movq mm3,mm2 + punpcklwd mm2,mm4 + ; mm2: a33h a31h a33l a31l + psrlq mm3,32 + ; mm3: 0 0 a12h a12l + + ; compute mm2 * mm6/7 + movq mm4,mm2 + pmaddwd mm2,mm7 + pmaddwd mm4,mm6 + ; mm2: a33h*zh+a31h*xh a33l*zl+a31l*xl ****** + ; mm4: a33h*zl+a31h*xl a33l*zh+a31l*xh + movq mm7,mm2 + + ; ****** + psubd mm7,_mmx_one_fixed_hl + + pslld mm2,16 + psrad mm7,16 + paddd mm2,mm4 + paddd mm7,mm4 + psrlq mm2,32 + paddd mm2,mm7 + ; mm2: ? a33*z+a31*x + + + + ; get the rest of the matrix + movq mm5,[ecx+010h] + movq mm6,mm5 + pand mm6,_mmx_sign_mask + paddd mm6,mm6 + paddd mm5,mm6 + ; mm5: a32h a32l a22h a22l + ; mm3: 0 0 a12h a12l + + ; mm0: 0 yh 0 yl + movq mm7,mm0 + psrlq mm0,32 + punpcklwd mm0,mm7 + ; mm0: 0 0 yl yh + punpckldq mm0,mm0 + + ; mm0: yl yh yl yh + movq mm7,mm0 + pmaddwd mm0,mm3 + movq mm6,mm7 + pmaddwd mm7,mm5 + ; mm0: 0 yl*a12h+yh*a12l + ; mm7: yl*a32h+yh*a32l yl*a22h+yh*a22l + ; mm6: yl yh yl yh + punpckldq mm0,mm7 + ; mm0: yl*a22h+yh*a22l yl*a12h+yh*a12l + paddd mm1,mm0 + ; mm1: z*a23+x*a21+yl*a22h+yh*a22l z*a13+x*a11+yl*a12h+yh*a12l + psrlq mm7,32 + paddd mm2,mm7 + ; mm2: ? a33*z+a31*x+yl*a32h+yh*a32l + + + + ; mm5: a32h a32l a22h a22l + ; mm3: 0 0 a12h a12l + ; mm6: yl yh yl yh + + + + ; get all h and l separate + movq mm4,mm3 + punpcklwd mm3,mm5 + ; mm3: a22h a12h a22l a12l + punpckhwd mm5,mm4 + ; mm5: 0 a32h 0 a32l + movq mm4,mm3 + punpckhdq mm3,mm5 + ; mm3: 0 a32h a22h a12h + punpckldq mm4,mm5 + ; mm4: 0 a32l a22l a12l + punpckhwd mm6,mm6 + ; mm6: yl yl yh yh + movq mm0,mm6 + punpckhdq mm6,mm6 + ; mm6: yl yl yl yl + punpckldq mm0,mm0 + ; mm0: yh yh yh yh + pmullw mm3,mm0 + pmulhw mm4,mm6 + ; mm3: 0 a32h*yh a22h*yh a12h*yh + ; mm4: 0 a32l*yl>>16 a22l*yl>>16 a12l*yl>>16 + pxor mm7,mm7 + pcmpgtw mm7,mm4 + paddw mm3,mm7 + + movq mm5,mm4 + punpcklwd mm4,mm3 + punpckhwd mm5,mm3 + paddd mm1,mm4 + paddd mm2,mm5 + + ; ****** + paddd mm1,_mmx_one_hl + paddd mm2,_mmx_one_hl + + movq [eax],mm1 + movd [eax+08h],mm2 + + emms + ret + +else + ; + ; optimized version + + movq mm0,[edx] + + movd mm4,[edx+08h] + movq mm1,mm0 + + movq mm2,[ecx] + movq mm5,mm4 + + pand mm1,_mmx_sign_mask + movq mm3,mm2 + + pand mm5,_mmx_sign_mask + paddd mm1,mm1 + + movq mm6,[ecx+18h] + paddd mm5,mm5 + + pand mm3,_mmx_sign_mask + movq mm7,mm6 + + paddd mm0,mm1 + paddd mm3,mm3 + + pand mm7,_mmx_sign_mask + paddd mm2,mm3 + + movq mm1,mm0 + punpckhwd mm0,mm4 + + paddd mm4,mm5 + paddd mm7,mm7 + + paddd mm6,mm7 + punpcklwd mm1,mm4 + + movq mm3,mm2 + punpckhwd mm2,mm6 + + punpcklwd mm3,mm6 + movq mm4,mm1 + + movq mm6,mm1 + pmaddwd mm4,mm3 + + movq mm7,mm6 + psrlq mm6,32 + + pmaddwd mm1,mm2 + punpckldq mm6,mm7 + + movq store1,mm7 + pmaddwd mm3,mm6 + + movq mm7,[ecx+08h] + pmaddwd mm2,mm6 + + movq mm5,mm4 + punpckldq mm4,mm1 + + psubd mm4,_mmx_one_fixed_hl + punpckhdq mm5,mm1 + + movq mm1,mm7 + psrad mm4,16 + + pand mm1,_mmx_sign_mask + pslld mm5,16 + + paddd mm1,mm1 + paddd mm5,mm4 + + paddd mm7,mm1 + movq mm1,mm3 + + movd mm4,[ecx+20h] + punpckldq mm3,mm2 + + paddd mm3,mm5 + movq mm5,mm4 + + pand mm5,_mmx_sign_mask + punpckhdq mm1,mm2 + + paddd mm1,mm3 + paddd mm5,mm5 + + movq mm2,[ecx+010h] + movq mm3,mm7 + + paddd mm4,mm5 + movq mm5,mm2 + + pand mm2,_mmx_sign_mask + punpcklwd mm7,mm4 + + movq mm4,mm7 + psrlq mm3,32 + + pmaddwd mm7,store1 + paddd mm2,mm2 + + pmaddwd mm4,mm6 + movq mm6,mm0 + + psrlq mm0,32 + paddd mm5,mm2 + + punpcklwd mm0,mm6 + movq mm2,mm7 + + psubd mm7,_mmx_one_fixed_hl + pslld mm2,16 + + psrad mm7,16 + paddd mm2,mm4 + + paddd mm7,mm4 + punpckldq mm0,mm0 + + movq mm6,mm0 + psrlq mm2,32 + + paddd mm2,mm7 + movq mm7,mm6 + + pmaddwd mm0,mm3 + punpckhwd mm7,mm7 + + pmaddwd mm6,mm5 + movq mm4,mm3 + + punpcklwd mm3,mm5 + + punpckhwd mm5,mm4 + movq mm4,mm7 + + punpckldq mm0,mm6 + + paddd mm1,mm0 + punpckhdq mm7,mm7 + + movq mm0,mm3 + punpckldq mm3,mm5 + + pmulhw mm3,mm7 + punpckhdq mm0,mm5 + + punpckldq mm4,mm4 + + pmullw mm0,mm4 + psrlq mm6,32 + + paddd mm2,mm6 + pxor mm6,mm6 + + pcmpgtw mm6,mm3 + movq mm5,mm3 + + paddd mm1,_mmx_one_hl + paddw mm0,mm6 + + paddd mm2,_mmx_one_hl + punpcklwd mm3,mm0 + + paddd mm1,mm3 + punpckhwd mm5,mm0 + + paddd mm2,mm5 + + movq [eax],mm1 + + movd [eax+08h],mm2 + + emms + ret + ; 63 cycles compared with 204 for the C-nonMMX version +endif + + align +_MMXAsm_VectorTransform: +MMXAsm_VectorTransform_: + + movq mm0,[eax] + + movd mm4,[eax+08h] + movq mm1,mm0 + + movq mm2,[edx] + movq mm5,mm4 + + pand mm1,_mmx_sign_mask + movq mm3,mm2 + + pand mm5,_mmx_sign_mask + paddd mm1,mm1 + + movq mm6,[edx+18h] + paddd mm5,mm5 + + pand mm3,_mmx_sign_mask + movq mm7,mm6 + + paddd mm0,mm1 + paddd mm3,mm3 + + pand mm7,_mmx_sign_mask + paddd mm2,mm3 + + movq mm1,mm0 + punpckhwd mm0,mm4 + + paddd mm4,mm5 + paddd mm7,mm7 + + paddd mm6,mm7 + punpcklwd mm1,mm4 + + movq mm3,mm2 + punpckhwd mm2,mm6 + + punpcklwd mm3,mm6 + movq mm4,mm1 + + movq mm6,mm1 + pmaddwd mm4,mm3 + + movq mm7,mm6 + psrlq mm6,32 + + pmaddwd mm1,mm2 + punpckldq mm6,mm7 + + movq store1,mm7 + pmaddwd mm3,mm6 + + movq mm7,[edx+08h] + pmaddwd mm2,mm6 + + movq mm5,mm4 + punpckldq mm4,mm1 + + psubd mm4,_mmx_one_fixed_hl + punpckhdq mm5,mm1 + + movq mm1,mm7 + psrad mm4,16 + + pand mm1,_mmx_sign_mask + pslld mm5,16 + + paddd mm1,mm1 + paddd mm5,mm4 + + paddd mm7,mm1 + movq mm1,mm3 + + movd mm4,[edx+20h] + punpckldq mm3,mm2 + + paddd mm3,mm5 + movq mm5,mm4 + + pand mm5,_mmx_sign_mask + punpckhdq mm1,mm2 + + paddd mm1,mm3 + paddd mm5,mm5 + + movq mm2,[edx+010h] + movq mm3,mm7 + + paddd mm4,mm5 + movq mm5,mm2 + + pand mm2,_mmx_sign_mask + punpcklwd mm7,mm4 + + movq mm4,mm7 + psrlq mm3,32 + + pmaddwd mm7,store1 + paddd mm2,mm2 + + pmaddwd mm4,mm6 + movq mm6,mm0 + + psrlq mm0,32 + paddd mm5,mm2 + + punpcklwd mm0,mm6 + movq mm2,mm7 + + psubd mm7,_mmx_one_fixed_hl + pslld mm2,16 + + psrad mm7,16 + paddd mm2,mm4 + + paddd mm7,mm4 + punpckldq mm0,mm0 + + movq mm6,mm0 + psrlq mm2,32 + + paddd mm2,mm7 + movq mm7,mm6 + + pmaddwd mm0,mm3 + punpckhwd mm7,mm7 + + pmaddwd mm6,mm5 + movq mm4,mm3 + + punpcklwd mm3,mm5 + + punpckhwd mm5,mm4 + movq mm4,mm7 + + punpckldq mm0,mm6 + + paddd mm1,mm0 + punpckhdq mm7,mm7 + + movq mm0,mm3 + punpckldq mm3,mm5 + + pmulhw mm3,mm7 + punpckhdq mm0,mm5 + + punpckldq mm4,mm4 + + pmullw mm0,mm4 + psrlq mm6,32 + + paddd mm2,mm6 + pxor mm6,mm6 + + pcmpgtw mm6,mm3 + movq mm5,mm3 + + paddd mm1,_mmx_one_hl + paddw mm0,mm6 + + paddd mm2,_mmx_one_hl + punpcklwd mm3,mm0 + + paddd mm1,mm3 + punpckhwd mm5,mm0 + + paddd mm2,mm5 + + movq [eax],mm1 + + movd [eax+08h],mm2 + + emms + ret + ; 63 cycles compared with 204 for the C-nonMMX version + + + align +_MMXAsm_VectorTransformedAndAdd: +MMXAsm_VectorTransformedAndAdd_: + + movq mm0,[edx] + + movd mm4,[edx+08h] + movq mm1,mm0 + + movq mm2,[ecx] + movq mm5,mm4 + + pand mm1,_mmx_sign_mask + movq mm3,mm2 + + pand mm5,_mmx_sign_mask + paddd mm1,mm1 + + movq mm6,[ecx+18h] + paddd mm5,mm5 + + pand mm3,_mmx_sign_mask + movq mm7,mm6 + + paddd mm0,mm1 + paddd mm3,mm3 + + pand mm7,_mmx_sign_mask + paddd mm2,mm3 + + movq mm1,mm0 + punpckhwd mm0,mm4 + + paddd mm4,mm5 + paddd mm7,mm7 + + paddd mm6,mm7 + punpcklwd mm1,mm4 + + movq mm3,mm2 + punpckhwd mm2,mm6 + + punpcklwd mm3,mm6 + movq mm4,mm1 + + movq mm6,mm1 + pmaddwd mm4,mm3 + + movq mm7,mm6 + psrlq mm6,32 + + pmaddwd mm1,mm2 + punpckldq mm6,mm7 + + movq store1,mm7 + pmaddwd mm3,mm6 + + movq mm7,[ecx+08h] + pmaddwd mm2,mm6 + + movq mm5,mm4 + punpckldq mm4,mm1 + + psubd mm4,_mmx_one_fixed_hl + punpckhdq mm5,mm1 + + movq mm1,mm7 + psrad mm4,16 + + pand mm1,_mmx_sign_mask + pslld mm5,16 + + paddd mm1,mm1 + paddd mm5,mm4 + + paddd mm7,mm1 + movq mm1,mm3 + + movd mm4,[ecx+20h] + punpckldq mm3,mm2 + + paddd mm3,mm5 + movq mm5,mm4 + + pand mm5,_mmx_sign_mask + punpckhdq mm1,mm2 + + paddd mm1,mm3 + paddd mm5,mm5 + + movq mm2,[ecx+010h] + movq mm3,mm7 + + paddd mm4,mm5 + movq mm5,mm2 + + pand mm2,_mmx_sign_mask + punpcklwd mm7,mm4 + + movq mm4,mm7 + psrlq mm3,32 + + pmaddwd mm7,store1 + paddd mm2,mm2 + + pmaddwd mm4,mm6 + movq mm6,mm0 + + psrlq mm0,32 + paddd mm5,mm2 + + punpcklwd mm0,mm6 + movq mm2,mm7 + + psubd mm7,_mmx_one_fixed_hl + pslld mm2,16 + + psrad mm7,16 + paddd mm2,mm4 + + paddd mm7,mm4 + punpckldq mm0,mm0 + + movq mm6,mm0 + psrlq mm2,32 + + paddd mm2,mm7 + movq mm7,mm6 + + pmaddwd mm0,mm3 + punpckhwd mm7,mm7 + + pmaddwd mm6,mm5 + movq mm4,mm3 + + paddd mm1,_mmx_one_hl + punpcklwd mm3,mm5 + + punpckhwd mm5,mm4 + movq mm4,mm7 + + paddd mm2,_mmx_one_hl + punpckldq mm0,mm6 + + paddd mm1,mm0 + punpckhdq mm7,mm7 + + movq mm0,mm3 + punpckldq mm3,mm5 + + pmulhw mm3,mm7 + punpckhdq mm0,mm5 + + paddd mm1,[ebx] + punpckldq mm4,mm4 + + pmullw mm0,mm4 + psrlq mm6,32 + + paddd mm2,mm6 + pxor mm6,mm6 + + pcmpgtw mm6,mm3 + movq mm5,mm3 + + movd mm4,[ebx+08h] + paddw mm0,mm6 + + paddd mm2,mm4 + punpcklwd mm3,mm0 + + paddd mm1,mm3 + punpckhwd mm5,mm0 + + paddd mm2,mm5 + + movq [eax],mm1 + + movd [eax+08h],mm2 + + emms + ret + ; 63 cycles compared with 204 for the C-nonMMX version + + + align +_MMXAsm_VectorTransformAndAdd: +MMXAsm_VectorTransformAndAdd_: + + movq mm0,[eax] + + movd mm4,[eax+08h] + movq mm1,mm0 + + movq mm2,[edx] + movq mm5,mm4 + + pand mm1,_mmx_sign_mask + movq mm3,mm2 + + pand mm5,_mmx_sign_mask + paddd mm1,mm1 + + movq mm6,[edx+18h] + paddd mm5,mm5 + + pand mm3,_mmx_sign_mask + movq mm7,mm6 + + paddd mm0,mm1 + paddd mm3,mm3 + + pand mm7,_mmx_sign_mask + paddd mm2,mm3 + + movq mm1,mm0 + punpckhwd mm0,mm4 + + paddd mm4,mm5 + paddd mm7,mm7 + + paddd mm6,mm7 + punpcklwd mm1,mm4 + + movq mm3,mm2 + punpckhwd mm2,mm6 + + punpcklwd mm3,mm6 + movq mm4,mm1 + + movq mm6,mm1 + pmaddwd mm4,mm3 + + movq mm7,mm6 + psrlq mm6,32 + + pmaddwd mm1,mm2 + punpckldq mm6,mm7 + + movq store1,mm7 + pmaddwd mm3,mm6 + + movq mm7,[edx+08h] + pmaddwd mm2,mm6 + + movq mm5,mm4 + punpckldq mm4,mm1 + + psubd mm4,_mmx_one_fixed_hl + punpckhdq mm5,mm1 + + movq mm1,mm7 + psrad mm4,16 + + pand mm1,_mmx_sign_mask + pslld mm5,16 + + paddd mm1,mm1 + paddd mm5,mm4 + + paddd mm7,mm1 + movq mm1,mm3 + + movd mm4,[edx+20h] + punpckldq mm3,mm2 + + paddd mm3,mm5 + movq mm5,mm4 + + pand mm5,_mmx_sign_mask + punpckhdq mm1,mm2 + + paddd mm1,mm3 + paddd mm5,mm5 + + movq mm2,[edx+010h] + movq mm3,mm7 + + paddd mm4,mm5 + movq mm5,mm2 + + pand mm2,_mmx_sign_mask + punpcklwd mm7,mm4 + + movq mm4,mm7 + psrlq mm3,32 + + pmaddwd mm7,store1 + paddd mm2,mm2 + + pmaddwd mm4,mm6 + movq mm6,mm0 + + psrlq mm0,32 + paddd mm5,mm2 + + punpcklwd mm0,mm6 + movq mm2,mm7 + + psubd mm7,_mmx_one_fixed_hl + pslld mm2,16 + + psrad mm7,16 + paddd mm2,mm4 + + paddd mm7,mm4 + punpckldq mm0,mm0 + + movq mm6,mm0 + psrlq mm2,32 + + paddd mm2,mm7 + movq mm7,mm6 + + pmaddwd mm0,mm3 + punpckhwd mm7,mm7 + + pmaddwd mm6,mm5 + movq mm4,mm3 + + paddd mm1,_mmx_one_hl + punpcklwd mm3,mm5 + + punpckhwd mm5,mm4 + movq mm4,mm7 + + paddd mm2,_mmx_one_hl + punpckldq mm0,mm6 + + paddd mm1,mm0 + punpckhdq mm7,mm7 + + movq mm0,mm3 + punpckldq mm3,mm5 + + pmulhw mm3,mm7 + punpckhdq mm0,mm5 + + paddd mm1,[ecx] + punpckldq mm4,mm4 + + pmullw mm0,mm4 + psrlq mm6,32 + + paddd mm2,mm6 + pxor mm6,mm6 + + pcmpgtw mm6,mm3 + movq mm5,mm3 + + movd mm4,[ecx+08h] + paddw mm0,mm6 + + paddd mm2,mm4 + punpcklwd mm3,mm0 + + paddd mm1,mm3 + punpckhwd mm5,mm0 + + paddd mm2,mm5 + + movq [eax],mm1 + + movd [eax+08h],mm2 + + emms + ret + ; 63 cycles compared with 204 for the C-nonMMX version + + +_TEXT ENDS + +END + |
