LibC: Add SSE2 non-temporal memset and memcpy

Also clean up the other assembly routines by using local labels to omit
them from the assembled program's symbol table.
2026-04-11 00:45:52 +03:00
parent d11160d2f7
commit 2a9dad2dd8
5 changed files with 254 additions and 44 deletions
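
The change rests on one observation: for buffers much larger than the cache, rep movsb / rep stosb evict useful data on the way through, while movntdq stores stream past the cache via write-combining buffers (and therefore need a trailing sfence). A minimal C model of the new memcpy behavior, assuming 16-byte blocks instead of the assembly's 64-byte unrolled loop and byte loops in place of rep movsb (memcpy_model is an illustrative name):

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h> /* SSE2: _mm_stream_si128, _mm_sfence */

#define NT_THRESHOLD (32 * 1024) /* mirrors: .set nt_threshold, 32 * 1024 */

static void *memcpy_model(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;
    if (n >= NT_THRESHOLD) {
        while ((uintptr_t)d & 15) { /* head: align the destination */
            *d++ = *s++;
            n--;
        }
        while (n >= 16) { /* body: cache-bypassing stores */
            _mm_stream_si128((__m128i *)d, _mm_loadu_si128((const __m128i *)s));
            d += 16;
            s += 16;
            n -= 16;
        }
        _mm_sfence(); /* order the streaming stores before returning */
    }
    while (n--) /* tail, or the whole copy when n < NT_THRESHOLD */
        *d++ = *s++;
    return dst;
}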


@@ -1,48 +1,177 @@
.set nt_threshold, 32 * 1024
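# copies/fills of at least this size take the cache-bypassing non-temporal path below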
.align 16
.global memcpy
memcpy:
cmpq $nt_threshold, %rdx
jae .Lmemcpy_nt
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
ret
.Lmemcpy_nt:
movq %rdi, %rax
testq $63, %rdi
jz .Lmemcpy_nt_aligned
# NOTE: this unconditionally copies 64 bytes to align the destination to a 64-byte boundary,
# but as the nt branch is only taken for huge buffers, it doesn't add much overhead
movdqu 0(%rsi), %xmm0
movdqu 16(%rsi), %xmm1
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
movdqu %xmm0, 0(%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
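# rcx = misalignment of rdi; advancing rdi/rsi by (64 - rcx) lands rdi on
# the next 64-byte boundary, and rdx shrinks by the bytes already copied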
movq %rdi, %rcx
andq $63, %rcx
leaq -64(%rdx, %rcx), %rdx
negq %rcx
leaq 64(%rdi, %rcx), %rdi
leaq 64(%rsi, %rcx), %rsi
.Lmemcpy_nt_aligned:
movq %rdx, %rcx
shrq $6, %rdx
.align 16
.Lmemcpy_nt_loop:
prefetchnta 256(%rsi)
prefetchnta 32+256(%rsi)
movdqu 0(%rsi), %xmm0
movdqu 16(%rsi), %xmm1
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
movntdq %xmm0, 0(%rdi)
movntdq %xmm1, 16(%rdi)
movntdq %xmm2, 32(%rdi)
movntdq %xmm3, 48(%rdi)
addq $64, %rdi
addq $64, %rsi
subq $1, %rdx
jnz .Lmemcpy_nt_loop
andq $63, %rcx
rep movsb
sfence
ret
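
The head-alignment arithmetic (the leaq/negq sequence above) is easier to audit in C; this is an equivalent rendering, assuming the branch is entered only with a misaligned destination and a length of at least 64 (nt_align_head is an illustrative name):

#include <stddef.h>
#include <stdint.h>

static void nt_align_head(unsigned char **dst, const unsigned char **src, size_t *len)
{
    size_t mis = (uintptr_t)*dst & 63; /* movq %rdi, %rcx ; andq $63, %rcx */
    *len += mis - 64;                  /* leaq -64(%rdx, %rcx), %rdx */
    *dst += 64 - mis;                  /* negq %rcx ; leaq 64(%rdi, %rcx), %rdi */
    *src += 64 - mis;                  /* leaq 64(%rsi, %rcx), %rsi */
}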
.align 16
.global memmove
memmove:
cmpq %rdi, %rsi
jae memcpy
cmpq $nt_threshold, %rdx
jb .Lmemmove_small
leaq (%rdi, %rdx), %rax
cmpq %rax, %rsi
jae .Lmemcpy_nt
leaq (%rsi, %rdx), %rax
cmpq %rax, %rdi
jae .Lmemcpy_nt
.Lmemmove_small:
movq %rdi, %rax
movq %rdx, %rcx
cmpq %rdi, %rsi
jb .Lmemmove_backwards
rep movsb
ret
.Lmemmove_backwards:
leaq -1(%rdi, %rdx), %rdi
leaq -1(%rsi, %rdx), %rsi
movq %rdx, %rcx
std
rep movsb
cld
ret
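
memmove may reuse the forward non-temporal copy only when the two ranges are disjoint; the pair of bounds checks above implements this predicate (a sketch; no_overlap is an illustrative name, and the pointer comparisons model the assembly's unsigned compares):

#include <stdbool.h>
#include <stddef.h>

static bool no_overlap(const unsigned char *dst, const unsigned char *src, size_t n)
{
    return src >= dst + n || dst >= src + n;
}

Note that this point is only reached when src < dst, so only the dst >= src + n test can actually pass; overlapping moves with dst above src fall through to the backwards byte copy.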
.align 16
.global memset
memset:
cmpq $nt_threshold, %rdx
jae .Lmemset_nt
movq %rdi, %r8
movq %rdx, %rcx
movzbl %sil, %eax
rep stosb
movq %r8, %rax
ret
.Lmemset_nt:
movq %rdi, %rax
#if defined(__SSE2__)
movzbl %sil, %esi
imul $0x01010101, %esi
movd %esi, %xmm0
pshufd $0, %xmm0, %xmm0
testq $63, %rdi
jz .Lmemset_nt_aligned
# NOTE: this unconditionally writes 64 bytes to align the destination to a 64-byte boundary,
# but as the nt branch is only taken for huge buffers, it doesn't add much overhead
movdqu %xmm0, 0(%rdi)
movdqu %xmm0, 16(%rdi)
movdqu %xmm0, 32(%rdi)
movdqu %xmm0, 48(%rdi)
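# same 64-byte alignment arithmetic as in .Lmemcpy_nt, but only rdi advances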
movq %rdi, %rcx
andq $63, %rcx
leaq -64(%rdx, %rcx), %rdx
negq %rcx
leaq 64(%rdi, %rcx), %rdi
.Lmemset_nt_aligned:
movq %rdx, %rcx
shrq $6, %rdx
.align 16
.Lmemset_nt_loop:
movntdq %xmm0, 0(%rdi)
movntdq %xmm0, 16(%rdi)
movntdq %xmm0, 32(%rdi)
movntdq %xmm0, 48(%rdi)
addq $64, %rdi
subq $1, %rdx
jnz .Lmemset_nt_loop
andq $63, %rcx
jnz .Lmemset_nt_bytes
sfence
ret
.Lmemset_nt_bytes:
movq %rax, %rdx
movzbl %sil, %eax
rep stosb
movq %rdx, %rax
sfence
ret
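
The fill byte is splatted across all 16 SSE lanes with the multiply-by-0x01010101 trick; in intrinsics this is roughly (broadcast_byte is an illustrative name):

#include <stdint.h>
#include <emmintrin.h>

static __m128i broadcast_byte(uint8_t c)
{
    /* imul $0x01010101 replicates the byte into every byte of a dword;
       movd + pshufd $0 then replicates that dword across the register */
    uint32_t four = (uint32_t)c * 0x01010101u;
    return _mm_shuffle_epi32(_mm_cvtsi32_si128((int32_t)four), 0);
}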
.align 16
.global memchr
memchr:
testq %rdx, %rdx
jz .Lmemchr_no_match
movzbl %sil, %esi
imul $0x01010101, %esi
movd %esi, %xmm0
pshufd $0, %xmm0, %xmm0
movq %rdi, %rcx
andq $15, %rcx
jz .Lmemchr_loop
movq %rdi, %rsi
subq %rcx, %rsi
@@ -50,40 +179,42 @@ memchr:
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %eax
shrl %cl, %eax
jnz .Lmemchr_match
leaq 16(%rsi), %rdi
addq %rcx, %rdx
subq $16, %rdx
jbe .Lmemchr_no_match
.align 16
.Lmemchr_loop:
movdqa (%rdi), %xmm1
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %eax
testl %eax, %eax
jnz .Lmemchr_match
addq $16, %rdi
subq $16, %rdx
ja .Lmemchr_loop
.Lmemchr_no_match:
xorq %rax, %rax
ret
.Lmemchr_match:
bsfl %eax, %eax
cmpq %rdx, %rax
jae .Lmemchr_no_match
addq %rdi, %rax
ret
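
The scan follows the usual SSE2 pattern: compare 16 bytes at once, compress the per-byte results into a 16-bit mask, and let bsf pick the first hit. One step of the loop in intrinsics (a sketch, not the committed code; memchr_step is an illustrative name):

#include <emmintrin.h>

/* bit i of the result is set iff ((const unsigned char *)p)[i] equals the needle byte */
static int memchr_step(const void *p, __m128i needle16)
{
    __m128i eq = _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), needle16); /* movdqa + pcmpeqb */
    return _mm_movemask_epi8(eq);                                              /* pmovmskb */
}

The unaligned head is handled by loading from the aligned-down address and shifting the mask right by the misalignment (the shrl %cl, %eax above), which discards any bytes that precede the buffer.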
.align 16
.global memcmp
memcmp:
testq %rdx, %rdx
jz .Lmemcmp_equal
movq %rdi, %rax
movq %rsi, %rcx
@@ -93,7 +224,7 @@ memcmp:
cmovaq %rcx, %rax
testq %rax, %rax
jz .Lmemcmp_loop
movq $16, %rcx
subq %rax, %rcx
@@ -103,44 +234,46 @@ memcmp:
subq %rcx, %rdx
.Lmemcmp_align_loop:
movzbl (%rdi), %eax
movzbl (%rsi), %r8d
subl %r8d, %eax
jnz .Lmemcmp_return
addq $1, %rdi
addq $1, %rsi
subq $1, %rcx
jnz .Lmemcmp_align_loop
.align 16
.Lmemcmp_loop:
movdqu (%rdi), %xmm0
movdqu (%rsi), %xmm1
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %eax
xorl $0xFFFF, %eax
jnz .Lmemcmp_differ
addq $16, %rdi
addq $16, %rsi
subq $16, %rdx
ja .Lmemcmp_loop
.Lmemcmp_equal:
xorl %eax, %eax
.Lmemcmp_return:
ret
.Lmemcmp_differ:
bsfl %eax, %ecx
cmpq %rdx, %rcx
jae .Lmemcmp_equal
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %edx
subl %edx, %eax
ret
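
In the main loop, pcmpeqb + pmovmskb produces 0xFFFF when two 16-byte blocks are equal, so xorl $0xFFFF turns it into a difference mask. Roughly, in C (a sketch; first_diff is an illustrative name and __builtin_ctz stands in for bsfl):

#include <emmintrin.h>

/* index of the first differing byte within a 16-byte block, or -1 if equal */
static int first_diff(const void *a, const void *b)
{
    __m128i ea = _mm_loadu_si128((const __m128i *)a); /* movdqu */
    __m128i eb = _mm_loadu_si128((const __m128i *)b);
    int diff = _mm_movemask_epi8(_mm_cmpeq_epi8(ea, eb)) ^ 0xFFFF;
    return diff ? __builtin_ctz(diff) : -1;
}

The cmpq %rdx, %rcx guard after bsfl is what makes the over-read of the final block safe: a difference found beyond the requested length is ignored and the buffers compare equal.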
.align 16
.global strlen
strlen:
movq %rdi, %rsi
@@ -149,7 +282,7 @@ strlen:
movq %rsi, %rcx
andq $15, %rcx
jz .Lstrlen_loop
movq %rsi, %rdx
subq %rcx, %rdx
@@ -157,24 +290,23 @@ strlen:
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %eax
shrl %cl, %eax
jnz .Lstrlen_null_found
leaq 16(%rdx), %rsi
.align 16
.Lstrlen_loop:
movdqa (%rsi), %xmm1
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %eax
testl %eax, %eax
jnz .Lstrlen_null_found
addq $16, %rsi
jmp .Lstrlen_loop
.Lstrlen_null_found:
bsfl %eax, %eax
addq %rsi, %rax
subq %rdi, %rax
ret
#endif
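
strlen is the same scan specialized to a zero needle. A C rendering of the approach (a sketch; strlen_sketch is an illustrative name, and it relies on aligned 16-byte loads never crossing a page boundary, so reading a few bytes before the string is safe):

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>

static size_t strlen_sketch(const char *s)
{
    const __m128i zero = _mm_setzero_si128();
    uintptr_t mis = (uintptr_t)s & 15;
    const char *p = s - mis; /* align down to 16 bytes */
    /* first block: shift out the bytes that precede the string */
    int mask = _mm_movemask_epi8(
        _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero)) >> mis;
    if (mask)
        return (size_t)__builtin_ctz(mask); /* bsfl */
    for (;;) {
        p += 16;
        mask = _mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
        if (mask)
            return (size_t)(p - s) + (size_t)__builtin_ctz(mask);
    }
}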