summaryrefslogtreecommitdiff
path: root/3dc/win95/Mmx_math.asm
diff options
context:
space:
mode:
Diffstat (limited to '3dc/win95/Mmx_math.asm')
-rw-r--r--3dc/win95/Mmx_math.asm1272
1 files changed, 0 insertions, 1272 deletions
diff --git a/3dc/win95/Mmx_math.asm b/3dc/win95/Mmx_math.asm
deleted file mode 100644
index ce5cb46..0000000
--- a/3dc/win95/Mmx_math.asm
+++ /dev/null
@@ -1,1272 +0,0 @@
-; want 8-byte alignment really!!
-_DATA SEGMENT DWORD PUBLIC 'DATA'
-; Constants and scratch storage used by the MMX routines in _TEXT. The segment is declared DWORD-aligned, hence the remark above about wanting 8-byte alignment for the QWORDs.
-
- PUBLIC _use_mmx_math
- PUBLIC _mmx_sign_mask
- PUBLIC _mmx_one_fixed_h
-
- align
- _mmx_sign_mask:QWORD 0000800000008000h ; bit 15 of each dword, i.e. the sign bit of the low 16-bit half
- _mmx_one_fixed_h:QWORD 0001000000000000h ; 10000h in the high dword only (subtracted in MMXAsm_VectorDot)
- _mmx_one_fixed_hl:QWORD 0001000000010000h ; 10000h in both dwords (subtracted in the transform routines)
- _mmx_one_hl:QWORD 0000000100000001h ; 1 in both dwords (added back at the end of the transform routines)
- store1:QWORD ? ; scratch spill slot written and re-read by the transform routines; being static, it makes them non-reentrant
- _use_mmx_math:DWORD 1 ; exported flag initialised to 1; not referenced by the routines in this file
-
-
-
-_DATA ENDS
-
-
-
-; want 16-byte alignment really!!
-_TEXT SEGMENT DWORD PUBLIC 'CODE'
- ASSUME cs:_TEXT, ds:_DATA
-; Every routine below is exported under two names that label the same entry point.
-.586
-; Names with a trailing underscore:
- PUBLIC MMXAsm_VectorDot_
- PUBLIC MMXAsm_VectorDot16_
- PUBLIC MMXAsm_VectorTransformed_
- PUBLIC MMXAsm_VectorTransform_
- PUBLIC MMXAsm_VectorTransformedAndAdd_
- PUBLIC MMXAsm_VectorTransformAndAdd_
-; The same entry points with a leading underscore (presumably to match a second compiler's name decoration - confirm with the callers):
- PUBLIC _MMXAsm_VectorDot
- PUBLIC _MMXAsm_VectorDot16
- PUBLIC _MMXAsm_VectorTransformed
- PUBLIC _MMXAsm_VectorTransform
- PUBLIC _MMXAsm_VectorTransformedAndAdd
- PUBLIC _MMXAsm_VectorTransformAndAdd
- align
-_MMXAsm_VectorDot: ; in: eax, edx -> two vectors of three dwords each (x,y,z); out: eax
-MMXAsm_VectorDot_: ; second exported name for the same entry point
-; Dot product of two 3-dword vectors, scaled down by 16 bits: eax ~= (x1*x2 + y1*y2 + z1*z2) >> 16 (presumably 16.16 fixed point - confirm with callers). Uses mm0-mm7 and executes emms before returning.
-if 0
- ; This is the unoptimized version (assembled out by the 'if 0'; kept as a readable reference for the scheduled code in the else branch)
-
- ; get the data
- movq mm0,[edx]
- movq mm1,[eax]
- movd mm2,[edx+08h]
- movd mm3,[eax+08h]
-
-
- ; get it into signed fixed format: where bit 15 of a dword is set, 10000h is added so that value = high*10000h + low with both 16-bit halves read as signed
- movq mm4,mm0
- movq mm5,mm1
- movq mm6,mm2
- movq mm7,mm3
-
- pand mm4,_mmx_sign_mask
- pand mm5,_mmx_sign_mask
- pand mm6,_mmx_sign_mask
- pand mm7,_mmx_sign_mask
-
- paddd mm4,mm4
- paddd mm5,mm5
- paddd mm6,mm6
- paddd mm7,mm7
-
- paddd mm0,mm4
- paddd mm1,mm5
- paddd mm2,mm6
- paddd mm3,mm7
-
- ; at this point we have split all 32 bit values
- ; into 16-bit pairs, high and low, both signed
-
- ; mm0: y1h y1l x1h x1l
- ; mm1: y2h y2l x2h x2l
- ; mm2: 0 0 z1h z1l
- ; mm3: 0 0 z2h z2l
-
- ; swap 1st and 2nd words in mm0,mm1,mm2,mm3 ??
- movq mm4,mm2
- movq mm5,mm3
- punpcklwd mm4,mm0
- ; mm4: x1h z1h x1l z1l
- punpcklwd mm5,mm1
- ; mm5: x2h z2h x2l z2l
- punpckhwd mm2,mm0
- ; mm2: y1h 0 y1l 0
- punpckhwd mm3,mm1
- ; mm3: y2h 0 y2l 0
-
- ; get the high and low products: x1h*x2h, x1l*x2l, etc
- movq mm0,mm2
- pmaddwd mm0,mm3
- ; mm0: y1h*y2h y1l*y2l
- movq mm1,mm4
- pmaddwd mm1,mm5
- ; mm1: x1h*x2h+z1h*z2h x1l*x2l+z1l*z2l
-
- ; exchange dwords in mm3 and mm5
- movq mm6,mm3
- movq mm7,mm5
- psrlq mm3,32
- psrlq mm5,32
- punpckldq mm3,mm6
- punpckldq mm5,mm7
- ; mm5: x2l z2l x2h z2h
- ; mm3: y2l 0 y2h 0
-
- ; compute the products x1h*x2l, x1l*x2h, etc
- pmaddwd mm2,mm3
- ; mm2: y1h*y2l y1l*y2h
- pmaddwd mm4,mm5
- ; mm4: x1h*x2l+z1h*z2l x1l*x2h+z1l*z2h
-
- paddd mm2,mm4
- ; mm2: x1h*x2l+y1h*y2l+z1h*z2l x1l*x2h+y1l*y2h+z1l*z2h
-
- ; get the low order dwords of mm0,mm1
- movq mm3,mm0
- punpckldq mm0,mm1
- ; mm0: x1l*x2l+z1l*z2l y1l*y2l
-
- ; unfortunately, at this point it is possible to have the
- ; wrong value in mm0: if x1l,x2l,z1l,z2l
- ; are all -0x8000, the result should
- ; be +0x80000000, but of course this becomes
- ; -0x80000000
- ; in fact the largest +ve value we could have is
- ; +0x80000000
- ; and the lowest -ve value we could have is
- ; -0x7fff0000
- ; = 0x80010000
- ; so subtracting ONE at this stage gives us a value
- ; which is out by ONE, but twos-complement correct
- psubd mm0,_mmx_one_fixed_h
-
- ; and the high order dwords
- punpckhdq mm1,mm3
- ; mm1: x1h*x2h+z1h*z2h y1h*y2h
- ; in fact it is swapped, but it doesn't matter
-
- ; shift the low order dwords down
- psrad mm0,16
- ; and the high order dwords up
- pslld mm1,16
- ; mm0: x1l*x2l+z1l*z2l>>16 -1 y1l*y2l>>16
- ; mm1: x1h*x2h+z1h*z2h<<16 y1h*y2h<<16
- ;(mm2) x1h*x2l+y1h*y2l+z1h*z2l x1l*x2h+y1l*y2h+z1l*z2h
-
- ; sum up
- paddd mm2,mm0
- paddd mm2,mm1
- movq mm1,mm2
- psrlq mm2,32
- paddd mm1,mm2
- movd eax,mm1
-
- emms
- inc eax
- ret
-
-else
- ; Same arithmetic as the unoptimized version above; the instructions are interleaved in pairs and the lane comments below reuse its notation.
- ; Now the optimized version
-
- movq mm0,[edx] ; mm0 = y1:x1
-
- movd mm2,[edx+08h] ; mm2 = 0:z1
- movq mm4,mm0
-
-
- pand mm4,_mmx_sign_mask ; bit 15 of x1 and y1
- movq mm6,mm2
-
- movq mm1,[eax] ; mm1 = y2:x2
- paddd mm4,mm4 ; 10000h where the low half is negative
-
- movd mm3,[eax+08h] ; mm3 = 0:z2
- movq mm5,mm1
-
- pand mm6,_mmx_sign_mask
- movq mm7,mm3
-
- pand mm5,_mmx_sign_mask
- paddd mm6,mm6
-
- pand mm7,_mmx_sign_mask
- paddd mm5,mm5
-
- paddd mm0,mm4 ; mm0: y1h y1l x1h x1l (signed halves)
- paddd mm2,mm6 ; mm2: 0 0 z1h z1l
-
- paddd mm7,mm7
- movq mm4,mm2
-
- punpcklwd mm4,mm0 ; mm4: x1h z1h x1l z1l
- paddd mm1,mm5 ; mm1: y2h y2l x2h x2l
-
- punpckhwd mm2,mm0 ; mm2: y1h 0 y1l 0
- paddd mm3,mm7 ; mm3: 0 0 z2h z2l
-
- movq mm5,mm3
- punpckhwd mm3,mm1 ; mm3: y2h 0 y2l 0
-
- punpcklwd mm5,mm1 ; mm5: x2h z2h x2l z2l
- movq mm0,mm2
-
- movq mm1,mm4
- pmaddwd mm0,mm3 ; mm0: y1h*y2h y1l*y2l
-
- movq mm6,mm3
- psrlq mm3,32
-
- movq mm7,mm5
- punpckldq mm3,mm6 ; mm3: y2l 0 y2h 0 (dwords exchanged)
-
- pmaddwd mm1,mm5 ; mm1: x1h*x2h+z1h*z2h x1l*x2l+z1l*z2l
- psrlq mm5,32
-
- punpckldq mm5,mm7 ; mm5: x2l z2l x2h z2h (dwords exchanged)
- pmaddwd mm2,mm3 ; mm2: y1h*y2l y1l*y2h
-
- pmaddwd mm4,mm5 ; mm4: x1h*x2l+z1h*z2l x1l*x2h+z1l*z2h
- movq mm3,mm0
-
- ; these instructions won't pair and I have no instructions I can pair them with
- punpckldq mm0,mm1 ; mm0: x1l*x2l+z1l*z2l y1l*y2l
-
- psubd mm0,_mmx_one_fixed_h ; subtract ONE from the x/z low*low sum so it cannot wrap to -0x80000000
- punpckhdq mm1,mm3 ; mm1: the two high*high sums
-
- psrad mm0,16 ; low*low terms shifted down
- paddd mm2,mm4 ; mm2: sum of the high*low cross terms
-
- pslld mm1,16 ; high*high terms shifted up
- paddd mm2,mm0
-
- ; complete pairing is not possible at this stage - there are too many dependencies
- paddd mm2,mm1
-
- movq mm1,mm2
- psrlq mm2,32
-
- paddd mm1,mm2 ; add the two dword lanes together
-
- movd eax,mm1 ; return value, still one too small
-
- emms
-
- inc eax ; undo the ONE subtracted before the shift
- ret
-
-endif
-
- ; This takes 33 cycles, the original C -> nonMMX version takes 80 cycles
-
- align
-_MMXAsm_VectorDot16: ; in: eax, edx -> two vectors of three dwords each (x,y,z); out: eax
-MMXAsm_VectorDot16_: ; second exported name for the same entry point
-; Dot product with every component first packed to a signed 16-bit word (packssdw saturates): eax = x1*x2 + y1*y2 + z1*z2, with no shift applied. Uses mm0 and mm1, executes emms.
- movd mm0,[edx+08h] ; mm0 = 0:z1
-
- packssdw mm0,[edx] ; mm0 words: y1 x1 0 z1
-
- movd mm1,[eax+08h] ; mm1 = 0:z2
-
- packssdw mm1,[eax] ; mm1 words: y2 x2 0 z2
-
- pmaddwd mm0,mm1 ; mm0: x1*x2+y1*y2 z1*z2
-
- movq mm1,mm0
- psrlq mm0,32
-
- paddd mm0,mm1 ; low dword = sum of both lanes
-
- movd eax,mm0
-
- emms
-
- ret
- ; taking 14 cycles but assuming 16bit input vector fields
-
-
- align
-_MMXAsm_VectorTransformed: ; in: eax -> result (3 dwords), edx -> vector (3 dwords), ecx -> matrix (9 dwords)
-MMXAsm_VectorTransformed_: ; second exported name for the same entry point
-; Multiplies the vector at [edx] by the 3x3 matrix at [ecx], each product scaled down by 16 bits, and stores three dwords at [eax]. Uses mm0-mm7; the scheduled version also uses the static scratch qword store1. Executes emms.
-if 0
- ; eax ptr to result
- ; edx ptr to vector xh, xl, yh, yl, zh, zl
- ; ecx ptr to matrix a11h, a11l, a12h, etc
-
- ; unoptimized version
-
- ; NOTE: in the Dot Product there was a problem
- ; of an internal overflow where -32768*-32768 + -32768*-32768 gave 0x80000000
- ; which is -ve in two's complement
- ; the additions and subtractions of ONE to resolve this problem
- ; are marked '******'
-
- movq mm0,[edx]
- movq mm1,mm0
- pand mm1,_mmx_sign_mask
- paddd mm1,mm1
- paddd mm0,mm1
- ; mm0: yh yl xh xl
-
- movq mm2,[ecx]
- movq mm3,mm2
- pand mm3,_mmx_sign_mask
- paddd mm3,mm3
- paddd mm2,mm3
- ; mm2: a21h a21l a11h a11l
-
- movd mm4,[edx+08h]
- movq mm5,mm4
- pand mm5,_mmx_sign_mask
- paddd mm5,mm5
- paddd mm4,mm5
- ; mm4: 0 0 zh zl
-
- movq mm6,[ecx+18h]
- movq mm7,mm6
- pand mm7,_mmx_sign_mask
- paddd mm7,mm7
- paddd mm6,mm7
- ; mm6: a23h a23l a13h a13l
-
- ; interleave
-
- movq mm1,mm0
- punpckhwd mm0,mm4
- ; mm0: 0 yh 0 yl
- punpcklwd mm1,mm4
- ; mm1: zh xh zl xl
-
- movq mm3,mm2
- punpckhwd mm2,mm6
- ; mm2: a23h a21h a23l a21l
- punpcklwd mm3,mm6
- ; mm3: a13h a11h a13l a11l
-
- ; get a13*z, a11*x; a23*z a21*x, high and low products
- movq mm4,mm1
- pmaddwd mm1,mm2
- movq mm6,mm4
- pmaddwd mm4,mm3
- ; mm0: 0 yh 0 yl
- ; mm6: zh xh zl xl
- ; mm2: a23h a21h a23l a21l
- ; mm3: a13h a11h a13l a11l
- ; mm1: zh*a23h+xh*a21h zl*a23l+xl*a21l
- ; mm4: zh*a13h+xh*a11h zl*a13l+xl*a11l
-
- ; exchange dwords in mm6
- movq mm7,mm6
- psrlq mm6,32
- punpckldq mm6,mm7
- ; mm6: zl xl zh xh
- ; mm7: zh xh zl xl
-
- ; get the high-low 'cross' products
- pmaddwd mm2,mm6
- pmaddwd mm3,mm6
- ; mm2: a23h*zl+a21h*xl a23l*zh+a21l*xh
- ; mm3: a13h*zl+a11h*xl a13l*zh+a11l*xh
-
- ; interleave mm1,mm4 and mm2,mm3
- movq mm5,mm4
- punpckldq mm4,mm1
- punpckhdq mm5,mm1
- ; mm4: zl*a23l+xl*a21l zl*a13l+xl*a11l ******
- ; mm5: zh*a23h+xh*a21h zh*a13h+xh*a11h
-
- ; ******
- psubd mm4,_mmx_one_fixed_hl
-
-
- movq mm1,mm3
- punpckldq mm3,mm2
- punpckhdq mm1,mm2
- ; mm1: zl*a23h+xl*a21h zl*a13h+xl*a11h
- ; mm3: zh*a23l+xh*a21l zh*a13l+xh*a11l
- ; sum
- paddd mm1,mm3
- ; shift the low order dwords down
- psrad mm4,16
- ; and the high order dwords up
- pslld mm5,16
- ; sum
- paddd mm1,mm4
- paddd mm1,mm5
- ; mm1 holding x and y of the result
- ; mm0: 0 yh 0 yl
- ; mm1: z*a23+x*a21 z*a13+x*a11
- ; mm2:
- ; mm3:
- ; mm4:
- ; mm5:
- ; mm6: zl xl zh xh
- ; mm7: zh xh zl xl
-
- ; grab some more of the matrix
- movq mm2,[ecx+08h]
- movq mm3,mm2
- pand mm3,_mmx_sign_mask
- paddd mm3,mm3
- paddd mm2,mm3 ; mm7 not mm2 in optimized version
- ; mm2: a12h a12l a31h a31l
-
- movd mm4,[ecx+20h]
- movq mm5,mm4
- pand mm5,_mmx_sign_mask
- paddd mm5,mm5
- paddd mm4,mm5
- ; mm4: 0 0 a33h a33l
-
- ; interleave
- movq mm3,mm2
- punpcklwd mm2,mm4
- ; mm2: a33h a31h a33l a31l
- psrlq mm3,32
- ; mm3: 0 0 a12h a12l
-
- ; compute mm2 * mm6/7
- movq mm4,mm2
- pmaddwd mm2,mm7
- pmaddwd mm4,mm6
- ; mm2: a33h*zh+a31h*xh a33l*zl+a31l*xl ******
- ; mm4: a33h*zl+a31h*xl a33l*zh+a31l*xh
- movq mm7,mm2
-
- ; ******
- psubd mm7,_mmx_one_fixed_hl
-
- pslld mm2,16
- psrad mm7,16
- paddd mm2,mm4
- paddd mm7,mm4
- psrlq mm2,32
- paddd mm2,mm7
- ; mm2: ? a33*z+a31*x
-
-
-
- ; get the rest of the matrix
- movq mm5,[ecx+010h]
- movq mm6,mm5
- pand mm6,_mmx_sign_mask
- paddd mm6,mm6
- paddd mm5,mm6
- ; mm5: a32h a32l a22h a22l
- ; mm3: 0 0 a12h a12l
-
- ; mm0: 0 yh 0 yl
- movq mm7,mm0
- psrlq mm0,32
- punpcklwd mm0,mm7
- ; mm0: 0 0 yl yh
- punpckldq mm0,mm0
-
- ; mm0: yl yh yl yh
- movq mm7,mm0
- pmaddwd mm0,mm3
- movq mm6,mm7
- pmaddwd mm7,mm5
- ; mm0: 0 yl*a12h+yh*a12l
- ; mm7: yl*a32h+yh*a32l yl*a22h+yh*a22l
- ; mm6: yl yh yl yh
- punpckldq mm0,mm7
- ; mm0: yl*a22h+yh*a22l yl*a12h+yh*a12l
- paddd mm1,mm0
- ; mm1: z*a23+x*a21+yl*a22h+yh*a22l z*a13+x*a11+yl*a12h+yh*a12l
- psrlq mm7,32
- paddd mm2,mm7
- ; mm2: ? a33*z+a31*x+yl*a32h+yh*a32l
-
-
-
- ; mm5: a32h a32l a22h a22l
- ; mm3: 0 0 a12h a12l
- ; mm6: yl yh yl yh
-
-
-
- ; get all h and l separate
- movq mm4,mm3
- punpcklwd mm3,mm5
- ; mm3: a22h a12h a22l a12l
- punpckhwd mm5,mm4
- ; mm5: 0 a32h 0 a32l
- movq mm4,mm3
- punpckhdq mm3,mm5
- ; mm3: 0 a32h a22h a12h
- punpckldq mm4,mm5
- ; mm4: 0 a32l a22l a12l
- punpckhwd mm6,mm6
- ; mm6: yl yl yh yh
- movq mm0,mm6
- punpckhdq mm6,mm6
- ; mm6: yl yl yl yl
- punpckldq mm0,mm0
- ; mm0: yh yh yh yh
- pmullw mm3,mm0
- pmulhw mm4,mm6
- ; mm3: 0 a32h*yh a22h*yh a12h*yh
- ; mm4: 0 a32l*yl>>16 a22l*yl>>16 a12l*yl>>16
- pxor mm7,mm7
- pcmpgtw mm7,mm4
- paddw mm3,mm7
-
- movq mm5,mm4
- punpcklwd mm4,mm3
- punpckhwd mm5,mm3
- paddd mm1,mm4
- paddd mm2,mm5
-
- ; ******
- paddd mm1,_mmx_one_hl
- paddd mm2,_mmx_one_hl
-
- movq [eax],mm1
- movd [eax+08h],mm2
-
- emms
- ret
-
-else
- ; Same arithmetic as the unoptimized version above; the instructions are interleaved in pairs and the lane comments below reuse its notation.
- ; optimized version
-
- movq mm0,[edx] ; mm0 = y:x
-
- movd mm4,[edx+08h] ; mm4 = 0:z
- movq mm1,mm0
-
- movq mm2,[ecx] ; mm2 = a21:a11
- movq mm5,mm4
-
- pand mm1,_mmx_sign_mask
- movq mm3,mm2
-
- pand mm5,_mmx_sign_mask
- paddd mm1,mm1
-
- movq mm6,[ecx+18h] ; mm6 = a23:a13
- paddd mm5,mm5
-
- pand mm3,_mmx_sign_mask
- movq mm7,mm6
-
- paddd mm0,mm1 ; mm0: yh yl xh xl
- paddd mm3,mm3
-
- pand mm7,_mmx_sign_mask
- paddd mm2,mm3 ; mm2: a21h a21l a11h a11l
-
- movq mm1,mm0
- punpckhwd mm0,mm4 ; mm0: 0 yh 0 yl (only the zero upper half of mm4 is consumed, so its pending correction does not matter)
-
- paddd mm4,mm5 ; mm4: 0 0 zh zl
- paddd mm7,mm7
-
- paddd mm6,mm7 ; mm6: a23h a23l a13h a13l
- punpcklwd mm1,mm4 ; mm1: zh xh zl xl
-
- movq mm3,mm2
- punpckhwd mm2,mm6 ; mm2: a23h a21h a23l a21l
-
- punpcklwd mm3,mm6 ; mm3: a13h a11h a13l a11l
- movq mm4,mm1
-
- movq mm6,mm1
- pmaddwd mm4,mm3 ; mm4: zh*a13h+xh*a11h zl*a13l+xl*a11l
-
- movq mm7,mm6
- psrlq mm6,32
-
- pmaddwd mm1,mm2 ; mm1: zh*a23h+xh*a21h zl*a23l+xl*a21l
- punpckldq mm6,mm7 ; mm6: zl xl zh xh
-
- movq store1,mm7 ; spill zh xh zl xl; mm7 is reused for the next matrix load
- pmaddwd mm3,mm6 ; mm3: a13h*zl+a11h*xl a13l*zh+a11l*xh
-
- movq mm7,[ecx+08h] ; mm7 = a12:a31
- pmaddwd mm2,mm6 ; mm2: a23h*zl+a21h*xl a23l*zh+a21l*xh
-
- movq mm5,mm4
- punpckldq mm4,mm1 ; mm4: the two low*low sums
-
- psubd mm4,_mmx_one_fixed_hl ; ****** subtract ONE before the arithmetic shift
- punpckhdq mm5,mm1 ; mm5: the two high*high sums
-
- movq mm1,mm7
- psrad mm4,16
-
- pand mm1,_mmx_sign_mask
- pslld mm5,16
-
- paddd mm1,mm1
- paddd mm5,mm4
-
- paddd mm7,mm1 ; mm7: a12h a12l a31h a31l
- movq mm1,mm3
-
- movd mm4,[ecx+20h] ; mm4 = 0:a33
- punpckldq mm3,mm2
-
- paddd mm3,mm5
- movq mm5,mm4
-
- pand mm5,_mmx_sign_mask
- punpckhdq mm1,mm2
-
- paddd mm1,mm3 ; mm1: z*a23+x*a21 z*a13+x*a11 (each one too small until ONE is added back)
- paddd mm5,mm5
-
- movq mm2,[ecx+010h] ; mm2 = a32:a22
- movq mm3,mm7
-
- paddd mm4,mm5 ; mm4: 0 0 a33h a33l
- movq mm5,mm2
-
- pand mm2,_mmx_sign_mask
- punpcklwd mm7,mm4 ; mm7: a33h a31h a33l a31l
-
- movq mm4,mm7
- psrlq mm3,32 ; mm3: 0 0 a12h a12l
-
- pmaddwd mm7,store1 ; mm7: a33h*zh+a31h*xh a33l*zl+a31l*xl
- paddd mm2,mm2
-
- pmaddwd mm4,mm6 ; mm4: a33h*zl+a31h*xl a33l*zh+a31l*xh
- movq mm6,mm0
-
- psrlq mm0,32
- paddd mm5,mm2 ; mm5: a32h a32l a22h a22l
-
- punpcklwd mm0,mm6 ; mm0: 0 0 yl yh
- movq mm2,mm7
-
- psubd mm7,_mmx_one_fixed_hl ; ******
- pslld mm2,16
-
- psrad mm7,16
- paddd mm2,mm4
-
- paddd mm7,mm4
- punpckldq mm0,mm0 ; mm0: yl yh yl yh
-
- movq mm6,mm0
- psrlq mm2,32
-
- paddd mm2,mm7 ; mm2: ? a33*z+a31*x (one too small until ONE is added back)
- movq mm7,mm6
-
- pmaddwd mm0,mm3 ; mm0: 0 yl*a12h+yh*a12l
- punpckhwd mm7,mm7 ; mm7: yl yl yh yh
-
- pmaddwd mm6,mm5 ; mm6: yl*a32h+yh*a32l yl*a22h+yh*a22l
- movq mm4,mm3
-
- punpcklwd mm3,mm5 ; mm3: a22h a12h a22l a12l
-
- punpckhwd mm5,mm4 ; mm5: 0 a32h 0 a32l
- movq mm4,mm7
-
- punpckldq mm0,mm6 ; mm0: yl*a22h+yh*a22l yl*a12h+yh*a12l
-
- paddd mm1,mm0
- punpckhdq mm7,mm7 ; mm7: yl yl yl yl
-
- movq mm0,mm3
- punpckldq mm3,mm5 ; mm3: 0 a32l a22l a12l
-
- pmulhw mm3,mm7 ; mm3: 0 a32l*yl>>16 a22l*yl>>16 a12l*yl>>16
- punpckhdq mm0,mm5 ; mm0: 0 a32h a22h a12h
-
- punpckldq mm4,mm4 ; mm4: yh yh yh yh
-
- pmullw mm0,mm4 ; mm0: 0 a32h*yh a22h*yh a12h*yh (low 16 bits of each)
- psrlq mm6,32
-
- paddd mm2,mm6 ; third component += yl*a32h+yh*a32l
- pxor mm6,mm6
-
- pcmpgtw mm6,mm3 ; mm6: FFFFh in each word where the >>16 term is negative
- movq mm5,mm3
-
- paddd mm1,_mmx_one_hl ; ****** add ONE back
- paddw mm0,mm6 ; carry the sign of the low term into the high word
-
- paddd mm2,_mmx_one_hl ; ******
- punpcklwd mm3,mm0 ; dwords with low word from mm3 and high word from mm0
-
- paddd mm1,mm3
- punpckhwd mm5,mm0
-
- paddd mm2,mm5
-
- movq [eax],mm1 ; store result components 0 and 1
-
- movd [eax+08h],mm2 ; store result component 2
-
- emms
- ret
- ; 63 cycles compared with 204 for the C-nonMMX version
-endif
-
- align
-_MMXAsm_VectorTransform: ; in: eax -> vector (3 dwords, overwritten with the result), edx -> matrix (9 dwords)
-MMXAsm_VectorTransform_: ; second exported name for the same entry point
-; In-place form of MMXAsm_VectorTransformed: the same instruction sequence with the vector read from [eax], the matrix from [edx] and the result written back to [eax]. Lane comments use the notation of that routine's unoptimized listing. Uses mm0-mm7 and the static scratch qword store1; executes emms.
- movq mm0,[eax] ; mm0 = y:x
-
- movd mm4,[eax+08h] ; mm4 = 0:z
- movq mm1,mm0
-
- movq mm2,[edx] ; mm2 = a21:a11
- movq mm5,mm4
-
- pand mm1,_mmx_sign_mask
- movq mm3,mm2
-
- pand mm5,_mmx_sign_mask
- paddd mm1,mm1
-
- movq mm6,[edx+18h] ; mm6 = a23:a13
- paddd mm5,mm5
-
- pand mm3,_mmx_sign_mask
- movq mm7,mm6
-
- paddd mm0,mm1 ; mm0: yh yl xh xl
- paddd mm3,mm3
-
- pand mm7,_mmx_sign_mask
- paddd mm2,mm3 ; mm2: a21h a21l a11h a11l
-
- movq mm1,mm0
- punpckhwd mm0,mm4 ; mm0: 0 yh 0 yl (only the zero upper half of mm4 is consumed, so its pending correction does not matter)
-
- paddd mm4,mm5 ; mm4: 0 0 zh zl
- paddd mm7,mm7
-
- paddd mm6,mm7 ; mm6: a23h a23l a13h a13l
- punpcklwd mm1,mm4 ; mm1: zh xh zl xl
-
- movq mm3,mm2
- punpckhwd mm2,mm6 ; mm2: a23h a21h a23l a21l
-
- punpcklwd mm3,mm6 ; mm3: a13h a11h a13l a11l
- movq mm4,mm1
-
- movq mm6,mm1
- pmaddwd mm4,mm3 ; mm4: zh*a13h+xh*a11h zl*a13l+xl*a11l
-
- movq mm7,mm6
- psrlq mm6,32
-
- pmaddwd mm1,mm2 ; mm1: zh*a23h+xh*a21h zl*a23l+xl*a21l
- punpckldq mm6,mm7 ; mm6: zl xl zh xh
-
- movq store1,mm7 ; spill zh xh zl xl; mm7 is reused for the next matrix load
- pmaddwd mm3,mm6 ; mm3: a13h*zl+a11h*xl a13l*zh+a11l*xh
-
- movq mm7,[edx+08h] ; mm7 = a12:a31
- pmaddwd mm2,mm6 ; mm2: a23h*zl+a21h*xl a23l*zh+a21l*xh
-
- movq mm5,mm4
- punpckldq mm4,mm1 ; mm4: the two low*low sums
-
- psubd mm4,_mmx_one_fixed_hl ; subtract ONE before the arithmetic shift (overflow guard)
- punpckhdq mm5,mm1 ; mm5: the two high*high sums
-
- movq mm1,mm7
- psrad mm4,16
-
- pand mm1,_mmx_sign_mask
- pslld mm5,16
-
- paddd mm1,mm1
- paddd mm5,mm4
-
- paddd mm7,mm1 ; mm7: a12h a12l a31h a31l
- movq mm1,mm3
-
- movd mm4,[edx+20h] ; mm4 = 0:a33
- punpckldq mm3,mm2
-
- paddd mm3,mm5
- movq mm5,mm4
-
- pand mm5,_mmx_sign_mask
- punpckhdq mm1,mm2
-
- paddd mm1,mm3 ; mm1: z*a23+x*a21 z*a13+x*a11 (each one too small until ONE is added back)
- paddd mm5,mm5
-
- movq mm2,[edx+010h] ; mm2 = a32:a22
- movq mm3,mm7
-
- paddd mm4,mm5 ; mm4: 0 0 a33h a33l
- movq mm5,mm2
-
- pand mm2,_mmx_sign_mask
- punpcklwd mm7,mm4 ; mm7: a33h a31h a33l a31l
-
- movq mm4,mm7
- psrlq mm3,32 ; mm3: 0 0 a12h a12l
-
- pmaddwd mm7,store1 ; mm7: a33h*zh+a31h*xh a33l*zl+a31l*xl
- paddd mm2,mm2
-
- pmaddwd mm4,mm6 ; mm4: a33h*zl+a31h*xl a33l*zh+a31l*xh
- movq mm6,mm0
-
- psrlq mm0,32
- paddd mm5,mm2 ; mm5: a32h a32l a22h a22l
-
- punpcklwd mm0,mm6 ; mm0: 0 0 yl yh
- movq mm2,mm7
-
- psubd mm7,_mmx_one_fixed_hl ; subtract ONE before the arithmetic shift (overflow guard)
- pslld mm2,16
-
- psrad mm7,16
- paddd mm2,mm4
-
- paddd mm7,mm4
- punpckldq mm0,mm0 ; mm0: yl yh yl yh
-
- movq mm6,mm0
- psrlq mm2,32
-
- paddd mm2,mm7 ; mm2: ? a33*z+a31*x (one too small until ONE is added back)
- movq mm7,mm6
-
- pmaddwd mm0,mm3 ; mm0: 0 yl*a12h+yh*a12l
- punpckhwd mm7,mm7 ; mm7: yl yl yh yh
-
- pmaddwd mm6,mm5 ; mm6: yl*a32h+yh*a32l yl*a22h+yh*a22l
- movq mm4,mm3
-
- punpcklwd mm3,mm5 ; mm3: a22h a12h a22l a12l
-
- punpckhwd mm5,mm4 ; mm5: 0 a32h 0 a32l
- movq mm4,mm7
-
- punpckldq mm0,mm6 ; mm0: yl*a22h+yh*a22l yl*a12h+yh*a12l
-
- paddd mm1,mm0
- punpckhdq mm7,mm7 ; mm7: yl yl yl yl
-
- movq mm0,mm3
- punpckldq mm3,mm5 ; mm3: 0 a32l a22l a12l
-
- pmulhw mm3,mm7 ; mm3: 0 a32l*yl>>16 a22l*yl>>16 a12l*yl>>16
- punpckhdq mm0,mm5 ; mm0: 0 a32h a22h a12h
-
- punpckldq mm4,mm4 ; mm4: yh yh yh yh
-
- pmullw mm0,mm4 ; mm0: 0 a32h*yh a22h*yh a12h*yh (low 16 bits of each)
- psrlq mm6,32
-
- paddd mm2,mm6 ; third component += yl*a32h+yh*a32l
- pxor mm6,mm6
-
- pcmpgtw mm6,mm3 ; mm6: FFFFh in each word where the >>16 term is negative
- movq mm5,mm3
-
- paddd mm1,_mmx_one_hl ; add ONE back
- paddw mm0,mm6 ; carry the sign of the low term into the high word
-
- paddd mm2,_mmx_one_hl ; add ONE back
- punpcklwd mm3,mm0 ; dwords with low word from mm3 and high word from mm0
-
- paddd mm1,mm3
- punpckhwd mm5,mm0
-
- paddd mm2,mm5
-
- movq [eax],mm1 ; overwrite components 0 and 1 of the input vector
-
- movd [eax+08h],mm2 ; overwrite component 2
-
- emms
- ret
- ; 63 cycles compared with 204 for the C-nonMMX version
-
-
- align
-_MMXAsm_VectorTransformedAndAdd: ; in: eax -> result (3 dwords), edx -> vector, ecx -> matrix (9 dwords), ebx -> vector added to the product
-MMXAsm_VectorTransformedAndAdd_: ; second exported name for the same entry point
-; Same computation as MMXAsm_VectorTransformed, then the three dwords at [ebx] are added before the result is stored at [eax]. Lane comments use the notation of that routine's unoptimized listing. Uses mm0-mm7 and the static scratch qword store1; executes emms.
- movq mm0,[edx] ; mm0 = y:x
-
- movd mm4,[edx+08h] ; mm4 = 0:z
- movq mm1,mm0
-
- movq mm2,[ecx] ; mm2 = a21:a11
- movq mm5,mm4
-
- pand mm1,_mmx_sign_mask
- movq mm3,mm2
-
- pand mm5,_mmx_sign_mask
- paddd mm1,mm1
-
- movq mm6,[ecx+18h] ; mm6 = a23:a13
- paddd mm5,mm5
-
- pand mm3,_mmx_sign_mask
- movq mm7,mm6
-
- paddd mm0,mm1 ; mm0: yh yl xh xl
- paddd mm3,mm3
-
- pand mm7,_mmx_sign_mask
- paddd mm2,mm3 ; mm2: a21h a21l a11h a11l
-
- movq mm1,mm0
- punpckhwd mm0,mm4 ; mm0: 0 yh 0 yl (only the zero upper half of mm4 is consumed, so its pending correction does not matter)
-
- paddd mm4,mm5 ; mm4: 0 0 zh zl
- paddd mm7,mm7
-
- paddd mm6,mm7 ; mm6: a23h a23l a13h a13l
- punpcklwd mm1,mm4 ; mm1: zh xh zl xl
-
- movq mm3,mm2
- punpckhwd mm2,mm6 ; mm2: a23h a21h a23l a21l
-
- punpcklwd mm3,mm6 ; mm3: a13h a11h a13l a11l
- movq mm4,mm1
-
- movq mm6,mm1
- pmaddwd mm4,mm3 ; mm4: zh*a13h+xh*a11h zl*a13l+xl*a11l
-
- movq mm7,mm6
- psrlq mm6,32
-
- pmaddwd mm1,mm2 ; mm1: zh*a23h+xh*a21h zl*a23l+xl*a21l
- punpckldq mm6,mm7 ; mm6: zl xl zh xh
-
- movq store1,mm7 ; spill zh xh zl xl; mm7 is reused for the next matrix load
- pmaddwd mm3,mm6 ; mm3: a13h*zl+a11h*xl a13l*zh+a11l*xh
-
- movq mm7,[ecx+08h] ; mm7 = a12:a31
- pmaddwd mm2,mm6 ; mm2: a23h*zl+a21h*xl a23l*zh+a21l*xh
-
- movq mm5,mm4
- punpckldq mm4,mm1 ; mm4: the two low*low sums
-
- psubd mm4,_mmx_one_fixed_hl ; subtract ONE before the arithmetic shift (overflow guard)
- punpckhdq mm5,mm1 ; mm5: the two high*high sums
-
- movq mm1,mm7
- psrad mm4,16
-
- pand mm1,_mmx_sign_mask
- pslld mm5,16
-
- paddd mm1,mm1
- paddd mm5,mm4
-
- paddd mm7,mm1 ; mm7: a12h a12l a31h a31l
- movq mm1,mm3
-
- movd mm4,[ecx+20h] ; mm4 = 0:a33
- punpckldq mm3,mm2
-
- paddd mm3,mm5
- movq mm5,mm4
-
- pand mm5,_mmx_sign_mask
- punpckhdq mm1,mm2
-
- paddd mm1,mm3 ; mm1: z*a23+x*a21 z*a13+x*a11 (each one too small until ONE is added back)
- paddd mm5,mm5
-
- movq mm2,[ecx+010h] ; mm2 = a32:a22
- movq mm3,mm7
-
- paddd mm4,mm5 ; mm4: 0 0 a33h a33l
- movq mm5,mm2
-
- pand mm2,_mmx_sign_mask
- punpcklwd mm7,mm4 ; mm7: a33h a31h a33l a31l
-
- movq mm4,mm7
- psrlq mm3,32 ; mm3: 0 0 a12h a12l
-
- pmaddwd mm7,store1 ; mm7: a33h*zh+a31h*xh a33l*zl+a31l*xl
- paddd mm2,mm2
-
- pmaddwd mm4,mm6 ; mm4: a33h*zl+a31h*xl a33l*zh+a31l*xh
- movq mm6,mm0
-
- psrlq mm0,32
- paddd mm5,mm2 ; mm5: a32h a32l a22h a22l
-
- punpcklwd mm0,mm6 ; mm0: 0 0 yl yh
- movq mm2,mm7
-
- psubd mm7,_mmx_one_fixed_hl ; subtract ONE before the arithmetic shift (overflow guard)
- pslld mm2,16
-
- psrad mm7,16
- paddd mm2,mm4
-
- paddd mm7,mm4
- punpckldq mm0,mm0 ; mm0: yl yh yl yh
-
- movq mm6,mm0
- psrlq mm2,32
-
- paddd mm2,mm7 ; mm2: ? a33*z+a31*x (one too small until ONE is added back)
- movq mm7,mm6
-
- pmaddwd mm0,mm3 ; mm0: 0 yl*a12h+yh*a12l
- punpckhwd mm7,mm7 ; mm7: yl yl yh yh
-
- pmaddwd mm6,mm5 ; mm6: yl*a32h+yh*a32l yl*a22h+yh*a22l
- movq mm4,mm3
-
- paddd mm1,_mmx_one_hl ; add ONE back to components 0 and 1
- punpcklwd mm3,mm5 ; mm3: a22h a12h a22l a12l
-
- punpckhwd mm5,mm4 ; mm5: 0 a32h 0 a32l
- movq mm4,mm7
-
- paddd mm2,_mmx_one_hl ; add ONE back to component 2
- punpckldq mm0,mm6 ; mm0: yl*a22h+yh*a22l yl*a12h+yh*a12l
-
- paddd mm1,mm0
- punpckhdq mm7,mm7 ; mm7: yl yl yl yl
-
- movq mm0,mm3
- punpckldq mm3,mm5 ; mm3: 0 a32l a22l a12l
-
- pmulhw mm3,mm7 ; mm3: 0 a32l*yl>>16 a22l*yl>>16 a12l*yl>>16
- punpckhdq mm0,mm5 ; mm0: 0 a32h a22h a12h
-
- paddd mm1,[ebx] ; add components 0 and 1 of the vector at [ebx]
- punpckldq mm4,mm4 ; mm4: yh yh yh yh
-
- pmullw mm0,mm4 ; mm0: 0 a32h*yh a22h*yh a12h*yh (low 16 bits of each)
- psrlq mm6,32
-
- paddd mm2,mm6 ; third component += yl*a32h+yh*a32l
- pxor mm6,mm6
-
- pcmpgtw mm6,mm3 ; mm6: FFFFh in each word where the >>16 term is negative
- movq mm5,mm3
-
- movd mm4,[ebx+08h] ; component 2 of the vector at [ebx] (mm4 is free after the pmullw)
- paddw mm0,mm6 ; carry the sign of the low term into the high word
-
- paddd mm2,mm4
- punpcklwd mm3,mm0 ; dwords with low word from mm3 and high word from mm0
-
- paddd mm1,mm3
- punpckhwd mm5,mm0
-
- paddd mm2,mm5
-
- movq [eax],mm1 ; store result components 0 and 1
-
- movd [eax+08h],mm2 ; store result component 2
-
- emms
- ret
- ; 63 cycles compared with 204 for the C-nonMMX version
-
-
- align
-_MMXAsm_VectorTransformAndAdd: ; in: eax -> vector (3 dwords, overwritten with the result), edx -> matrix (9 dwords), ecx -> vector added to the product
-MMXAsm_VectorTransformAndAdd_: ; second exported name for the same entry point
-; In-place form of MMXAsm_VectorTransformedAndAdd: vector read from [eax], matrix from [edx], the three dwords at [ecx] added, result written back to [eax]. Lane comments use the notation of the unoptimized MMXAsm_VectorTransformed listing. Uses mm0-mm7 and the static scratch qword store1; executes emms.
- movq mm0,[eax] ; mm0 = y:x
-
- movd mm4,[eax+08h] ; mm4 = 0:z
- movq mm1,mm0
-
- movq mm2,[edx] ; mm2 = a21:a11
- movq mm5,mm4
-
- pand mm1,_mmx_sign_mask
- movq mm3,mm2
-
- pand mm5,_mmx_sign_mask
- paddd mm1,mm1
-
- movq mm6,[edx+18h] ; mm6 = a23:a13
- paddd mm5,mm5
-
- pand mm3,_mmx_sign_mask
- movq mm7,mm6
-
- paddd mm0,mm1 ; mm0: yh yl xh xl
- paddd mm3,mm3
-
- pand mm7,_mmx_sign_mask
- paddd mm2,mm3 ; mm2: a21h a21l a11h a11l
-
- movq mm1,mm0
- punpckhwd mm0,mm4 ; mm0: 0 yh 0 yl (only the zero upper half of mm4 is consumed, so its pending correction does not matter)
-
- paddd mm4,mm5 ; mm4: 0 0 zh zl
- paddd mm7,mm7
-
- paddd mm6,mm7 ; mm6: a23h a23l a13h a13l
- punpcklwd mm1,mm4 ; mm1: zh xh zl xl
-
- movq mm3,mm2
- punpckhwd mm2,mm6 ; mm2: a23h a21h a23l a21l
-
- punpcklwd mm3,mm6 ; mm3: a13h a11h a13l a11l
- movq mm4,mm1
-
- movq mm6,mm1
- pmaddwd mm4,mm3 ; mm4: zh*a13h+xh*a11h zl*a13l+xl*a11l
-
- movq mm7,mm6
- psrlq mm6,32
-
- pmaddwd mm1,mm2 ; mm1: zh*a23h+xh*a21h zl*a23l+xl*a21l
- punpckldq mm6,mm7 ; mm6: zl xl zh xh
-
- movq store1,mm7 ; spill zh xh zl xl; mm7 is reused for the next matrix load
- pmaddwd mm3,mm6 ; mm3: a13h*zl+a11h*xl a13l*zh+a11l*xh
-
- movq mm7,[edx+08h] ; mm7 = a12:a31
- pmaddwd mm2,mm6 ; mm2: a23h*zl+a21h*xl a23l*zh+a21l*xh
-
- movq mm5,mm4
- punpckldq mm4,mm1 ; mm4: the two low*low sums
-
- psubd mm4,_mmx_one_fixed_hl ; subtract ONE before the arithmetic shift (overflow guard)
- punpckhdq mm5,mm1 ; mm5: the two high*high sums
-
- movq mm1,mm7
- psrad mm4,16
-
- pand mm1,_mmx_sign_mask
- pslld mm5,16
-
- paddd mm1,mm1
- paddd mm5,mm4
-
- paddd mm7,mm1 ; mm7: a12h a12l a31h a31l
- movq mm1,mm3
-
- movd mm4,[edx+20h] ; mm4 = 0:a33
- punpckldq mm3,mm2
-
- paddd mm3,mm5
- movq mm5,mm4
-
- pand mm5,_mmx_sign_mask
- punpckhdq mm1,mm2
-
- paddd mm1,mm3 ; mm1: z*a23+x*a21 z*a13+x*a11 (each one too small until ONE is added back)
- paddd mm5,mm5
-
- movq mm2,[edx+010h] ; mm2 = a32:a22
- movq mm3,mm7
-
- paddd mm4,mm5 ; mm4: 0 0 a33h a33l
- movq mm5,mm2
-
- pand mm2,_mmx_sign_mask
- punpcklwd mm7,mm4 ; mm7: a33h a31h a33l a31l
-
- movq mm4,mm7
- psrlq mm3,32 ; mm3: 0 0 a12h a12l
-
- pmaddwd mm7,store1 ; mm7: a33h*zh+a31h*xh a33l*zl+a31l*xl
- paddd mm2,mm2
-
- pmaddwd mm4,mm6 ; mm4: a33h*zl+a31h*xl a33l*zh+a31l*xh
- movq mm6,mm0
-
- psrlq mm0,32
- paddd mm5,mm2 ; mm5: a32h a32l a22h a22l
-
- punpcklwd mm0,mm6 ; mm0: 0 0 yl yh
- movq mm2,mm7
-
- psubd mm7,_mmx_one_fixed_hl ; subtract ONE before the arithmetic shift (overflow guard)
- pslld mm2,16
-
- psrad mm7,16
- paddd mm2,mm4
-
- paddd mm7,mm4
- punpckldq mm0,mm0 ; mm0: yl yh yl yh
-
- movq mm6,mm0
- psrlq mm2,32
-
- paddd mm2,mm7 ; mm2: ? a33*z+a31*x (one too small until ONE is added back)
- movq mm7,mm6
-
- pmaddwd mm0,mm3 ; mm0: 0 yl*a12h+yh*a12l
- punpckhwd mm7,mm7 ; mm7: yl yl yh yh
-
- pmaddwd mm6,mm5 ; mm6: yl*a32h+yh*a32l yl*a22h+yh*a22l
- movq mm4,mm3
-
- paddd mm1,_mmx_one_hl ; add ONE back to components 0 and 1
- punpcklwd mm3,mm5 ; mm3: a22h a12h a22l a12l
-
- punpckhwd mm5,mm4 ; mm5: 0 a32h 0 a32l
- movq mm4,mm7
-
- paddd mm2,_mmx_one_hl ; add ONE back to component 2
- punpckldq mm0,mm6 ; mm0: yl*a22h+yh*a22l yl*a12h+yh*a12l
-
- paddd mm1,mm0
- punpckhdq mm7,mm7 ; mm7: yl yl yl yl
-
- movq mm0,mm3
- punpckldq mm3,mm5 ; mm3: 0 a32l a22l a12l
-
- pmulhw mm3,mm7 ; mm3: 0 a32l*yl>>16 a22l*yl>>16 a12l*yl>>16
- punpckhdq mm0,mm5 ; mm0: 0 a32h a22h a12h
-
- paddd mm1,[ecx] ; add components 0 and 1 of the vector at [ecx]
- punpckldq mm4,mm4 ; mm4: yh yh yh yh
-
- pmullw mm0,mm4 ; mm0: 0 a32h*yh a22h*yh a12h*yh (low 16 bits of each)
- psrlq mm6,32
-
- paddd mm2,mm6 ; third component += yl*a32h+yh*a32l
- pxor mm6,mm6
-
- pcmpgtw mm6,mm3 ; mm6: FFFFh in each word where the >>16 term is negative
- movq mm5,mm3
-
- movd mm4,[ecx+08h] ; component 2 of the vector at [ecx] (mm4 is free after the pmullw)
- paddw mm0,mm6 ; carry the sign of the low term into the high word
-
- paddd mm2,mm4
- punpcklwd mm3,mm0 ; dwords with low word from mm3 and high word from mm0
-
- paddd mm1,mm3
- punpckhwd mm5,mm0
-
- paddd mm2,mm5
-
- movq [eax],mm1 ; overwrite components 0 and 1 of the input vector
-
- movd [eax+08h],mm2 ; overwrite component 2
-
- emms
- ret
- ; 63 cycles compared with 204 for the C-nonMMX version
-
-
-_TEXT ENDS
-
-END
-