@@ -214,30 +214,30 @@ $L$inner_enter:
214214
215215 xor r14 , r14
216216 mov rax , QWORD [ rsp ]
217- lea rsi ,[ rsp ]
218217 mov r15 , r9
219- jmp NEAR $ L $ sub
218+
220219ALIGN 16
221220$ L $ sub: sbb rax , QWORD [ r14 * 8 + rcx ]
222221 mov QWORD [ r14 * 8 + rdi ], rax
223- mov rax , QWORD [ 8 + r14 * 8 + rsi ]
222+ mov rax , QWORD [ 8 + r14 * 8 + rsp ]
224223 lea r14 ,[ 1 + r14 ]
225224 dec r15
226225 jnz NEAR $ L $ sub
227226
228227 sbb rax , 0
228+ mov rbx ,- 1
229+ xor rbx , rax
229230 xor r14 , r14
230- and rsi , rax
231- not rax
232- mov rcx , rdi
233- and rcx , rax
234231 mov r15 , r9
235- or rsi , rcx
236- ALIGN 16
232+
237233$ L $ copy:
238- mov rax , QWORD [ r14 * 8 + rsi ]
239- mov QWORD [ r14 * 8 + rsp ], r14
240- mov QWORD [ r14 * 8 + rdi ], rax
234+ mov rcx , QWORD [ r14 * 8 + rdi ]
235+ mov rdx , QWORD [ r14 * 8 + rsp ]
236+ and rcx , rbx
237+ and rdx , rax
238+ mov QWORD [ r14 * 8 + rsp ], r9
239+ or rdx , rcx
240+ mov QWORD [ r14 * 8 + rdi ], rdx
241241 lea r14 ,[ 1 + r14 ]
242242 sub r15 , 1
243243 jnz NEAR $ L $ copy
@@ -605,20 +605,18 @@ $L$inner4x:
605605 cmp r14 , r9
606606 jb NEAR $ L $ outer4x
607607 mov rdi , QWORD [ 16 + r9 * 8 + rsp ]
608+ lea r15 ,[ (( - 4 )) + r9 ]
608609 mov rax , QWORD [ rsp ]
609- pxor xmm0 , xmm0
610610 mov rdx , QWORD [ 8 + rsp ]
611- shr r9 , 2
611+ shr r15 , 2
612612 lea rsi ,[ rsp ]
613613 xor r14 , r14
614614
615615 sub rax , QWORD [ rcx ]
616616 mov rbx , QWORD [ 16 + rsi ]
617617 mov rbp , QWORD [ 24 + rsi ]
618618 sbb rdx , QWORD [ 8 + rcx ]
619- lea r15 ,[ (( - 1 )) + r9 ]
620- jmp NEAR $ L $ sub4x
621- ALIGN 16
619+
622620$ L $ sub4x:
623621 mov QWORD [ r14 * 8 + rdi ], rax
624622 mov QWORD [ 8 + r14 * 8 + rdi ], rdx
@@ -645,34 +643,35 @@ $L$sub4x:
645643
646644 sbb rax , 0
647645 mov QWORD [ 24 + r14 * 8 + rdi ], rbp
648- xor r14 , r14
649- and rsi , rax
650- not rax
651- mov rcx , rdi
652- and rcx , rax
653- lea r15 ,[ (( - 1 )) + r9 ]
654- or rsi , rcx
655-
656- movdqu xmm1 , XMMWORD [ rsi ]
657- movdqa XMMWORD [ rsp ], xmm0
658- movdqu XMMWORD [ rdi ], xmm1
646+ pxor xmm0 , xmm0
647+ DB 102 , 72 , 15 , 110 , 224
648+ pcmpeqd xmm5 , xmm5
649+ pshufd xmm4 , xmm4 , 0
650+ mov r15 , r9
651+ pxor xmm5 , xmm4
652+ shr r15 , 2
653+ xor eax , eax
654+
659655 jmp NEAR $ L $ copy4x
660656ALIGN 16
661657$ L $ copy4x:
662- movdqu xmm2 , XMMWORD [ 16 + r14 * 1 + rsi ]
663- movdqu xmm1 , XMMWORD [ 32 + r14 * 1 + rsi ]
664- movdqa XMMWORD [ 16 + r14 * 1 + rsp ], xmm0
665- movdqu XMMWORD [ 16 + r14 * 1 + rdi ], xmm2
666- movdqa XMMWORD [ 32 + r14 * 1 + rsp ], xmm0
667- movdqu XMMWORD [ 32 + r14 * 1 + rdi ], xmm1
668- lea r14 ,[ 32 + r14 ]
658+ movdqa xmm1 , XMMWORD [ rax * 1 + rsp ]
659+ movdqu xmm2 , XMMWORD [ rax * 1 + rdi ]
660+ pand xmm1 , xmm4
661+ pand xmm2 , xmm5
662+ movdqa xmm3 , XMMWORD [ 16 + rax * 1 + rsp ]
663+ movdqa XMMWORD [ rax * 1 + rsp ], xmm0
664+ por xmm1 , xmm2
665+ movdqu xmm2 , XMMWORD [ 16 + rax * 1 + rdi ]
666+ movdqu XMMWORD [ rax * 1 + rdi ], xmm1
667+ pand xmm3 , xmm4
668+ pand xmm2 , xmm5
669+ movdqa XMMWORD [ 16 + rax * 1 + rsp ], xmm0
670+ por xmm3 , xmm2
671+ movdqu XMMWORD [ 16 + rax * 1 + rdi ], xmm3
672+ lea rax ,[ 32 + rax ]
669673 dec r15
670674 jnz NEAR $ L $ copy4x
671-
672- shl r9 , 2
673- movdqu xmm2 , XMMWORD [ 16 + r14 * 1 + rsi ]
674- movdqa XMMWORD [ 16 + r14 * 1 + rsp ], xmm0
675- movdqu XMMWORD [ 16 + r14 * 1 + rdi ], xmm2
676675 mov rsi , QWORD [ 8 + r9 * 8 + rsp ]
677676 mov rax , 1
678677 mov r15 , QWORD [ (( - 48 )) + rsi ]
0 commit comments