LibC: Add SSE2 non-temporal memset and memcpy

Also clean up the other assembly by using local (.L) labels so they are
omitted from the assembled program's symbol table.
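
In GNU as, a label that begins with .L is a local label: the assembler
resolves it internally and omits it from the object file's symbol table, so
it never reaches the linked program. A minimal sketch of the difference:

.global memcpy       # global label: kept in the symbol table for the linker
memcpy:
.Lmemcpy_nt:         # .L label: resolved at assembly time, emitted nowhere
    ret
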
2026-04-11 00:45:52 +03:00
parent d11160d2f7
commit 2a9dad2dd8
5 changed files with 254 additions and 44 deletions

View File

@@ -168,10 +168,7 @@ set(BAN_SOURCES
 set(KLIBC_SOURCES
     klibc/ctype.cpp
     klibc/string.cpp
-    # Ehhh don't do this but for now libc uses the same stuff kernel can use
-    # This won't work after libc starts using sse implemetations tho
-    ../userspace/libraries/LibC/arch/${BANAN_ARCH}/string.S
+    klibc/arch/${BANAN_ARCH}/string.S
 )
 set(LIBDEFLATE_SOURCE

View File

@@ -0,0 +1,47 @@
.align 16
.global memcpy
memcpy:
    xchgl 4(%esp), %edi
    xchgl 8(%esp), %esi
    movl 12(%esp), %ecx
    movl %edi, %edx
    rep movsb
    movl 4(%esp), %edi
    movl 8(%esp), %esi
    movl %edx, %eax
    ret

.align 16
.global memmove
memmove:
    xchgl 4(%esp), %edi
    xchgl 8(%esp), %esi
    movl 12(%esp), %ecx
    movl %edi, %edx
    cmpl %edi, %esi
    jb .memmove_slow
    rep movsb
.memmove_done:
    movl 4(%esp), %edi
    movl 8(%esp), %esi
    movl %edx, %eax
    ret
.memmove_slow:
    leal -1(%edi, %ecx), %edi
    leal -1(%esi, %ecx), %esi
    std
    rep movsb
    cld
    jmp .memmove_done

.align 16
.global memset
memset:
    xchgl 4(%esp), %edi
    movl 8(%esp), %eax
    movl 12(%esp), %ecx
    movl %edi, %edx
    rep stosb
    movl 4(%esp), %edi
    movl %edx, %eax
    ret
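
A note on the i686 routines above: cdecl passes all arguments on the stack
and requires %esi/%edi to be preserved across calls, so each xchgl does
double duty, loading an argument into the register while parking the
caller's value in that argument's now-free stack slot, to be reloaded before
ret. A minimal sketch of the idiom, with a made-up routine name:

.global copy_bytes           # hypothetical example, not part of the commit
copy_bytes:
    xchgl 4(%esp), %edi      # %edi = dest; caller's %edi saved in the arg slot
    xchgl 8(%esp), %esi      # %esi = src;  caller's %esi saved in the arg slot
    movl 12(%esp), %ecx      # %ecx = count (caller-saved, free to clobber)
    rep movsb                # copy %ecx bytes from (%esi) to (%edi)
    movl 4(%esp), %edi       # reload caller's %edi
    movl 8(%esp), %esi       # reload caller's %esi
    ret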

View File

@@ -0,0 +1,31 @@
.align 16
.global memcpy
memcpy:
    movq %rdi, %rax
    movq %rdx, %rcx
    rep movsb
    ret

.align 16
.global memmove
memmove:
    cmpq %rdi, %rsi
    jae memcpy
    movq %rdi, %rax
    leaq -1(%rdi, %rdx), %rdi
    leaq -1(%rsi, %rdx), %rsi
    movq %rdx, %rcx
    std
    rep movsb
    cld
    ret

.align 16
.global memset
memset:
    movq %rdi, %r8
    movb %sil, %al
    movq %rdx, %rcx
    rep stosb
    movq %r8, %rax
    ret
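
The x86_64 versions are this short because the System V ABI hands memcpy its
arguments in %rdi (dest), %rsi (src) and %rdx (count), which is exactly what
rep movsb consumes, except that the count must first be moved into %rcx. A
minimal caller sketch (the buffer and caller names here are made up):

.bss
dst: .skip 4096
src: .skip 4096

.text
.global demo                 # hypothetical caller, for illustration only
demo:
    leaq dst(%rip), %rdi     # 1st argument: destination
    leaq src(%rip), %rsi     # 2nd argument: source
    movq $4096, %rdx         # 3rd argument: byte count
    call memcpy              # returns dest in %rax
    ret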

View File

@@ -1,3 +1,4 @@
+.align 16
 .global memcpy
 memcpy:
     xchgl 4(%esp), %edi
@@ -10,6 +11,7 @@ memcpy:
     movl %edx, %eax
     ret
 
+.align 16
 .global memmove
 memmove:
     xchgl 4(%esp), %edi
@@ -32,6 +34,7 @@ memmove:
     cld
     jmp .memmove_done
 
+.align 16
 .global memset
 memset:
     xchgl 4(%esp), %edi
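
The only change to this old i686 file is the .align 16 padding added in
front of each entry point (the x86_64 file below gets the same treatment,
plus .align 16 before its hot loops). On x86 ELF targets, .align 16 pads the
text section with NOPs so that the next label lands on a 16-byte boundary, a
common convention that keeps entry points and loop heads from straddling
instruction-fetch blocks:

.align 16            # assembler inserts NOP padding up to a 16-byte boundary
.global memset       # so memset's first instruction starts the fetch block
memset:
    ret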

View File

@@ -1,48 +1,177 @@
+.set nt_threshold, 32 * 1024
+
+.align 16
 .global memcpy
 memcpy:
+    cmpq $nt_threshold, %rdx
+    jae .Lmemcpy_nt
     movq %rdi, %rax
     movq %rdx, %rcx
     rep movsb
     ret
+.Lmemcpy_nt:
+    movq %rdi, %rax
+    testq $63, %rdi
+    jz .Lmemcpy_nt_aligned
+    # NOTE: this unconditionally copies 64 bytes to align to 64 byte boundary
+    # but as nt branch is only taken for huge buffers, it doesnt add much overhead
+    movdqu 0(%rsi), %xmm0
+    movdqu 16(%rsi), %xmm1
+    movdqu 32(%rsi), %xmm2
+    movdqu 48(%rsi), %xmm3
+    movdqu %xmm0, 0(%rdi)
+    movdqu %xmm1, 16(%rdi)
+    movdqu %xmm2, 32(%rdi)
+    movdqu %xmm3, 48(%rdi)
+    movq %rdi, %rcx
+    andq $63, %rcx
+    leaq -64(%rdx, %rcx), %rdx
+    negq %rcx
+    leaq 64(%rdi, %rcx), %rdi
+    leaq 64(%rsi, %rcx), %rsi
+.Lmemcpy_nt_aligned:
+    movq %rdx, %rcx
+    shrq $6, %rdx
+.align 16
+.Lmemcpy_nt_loop:
+    prefetchnta 256(%rsi)
+    prefetchnta 32+256(%rsi)
+    movdqu 0(%rsi), %xmm0
+    movdqu 16(%rsi), %xmm1
+    movdqu 32(%rsi), %xmm2
+    movdqu 48(%rsi), %xmm3
+    movntdq %xmm0, 0(%rdi)
+    movntdq %xmm1, 16(%rdi)
+    movntdq %xmm2, 32(%rdi)
+    movntdq %xmm3, 48(%rdi)
+    addq $64, %rdi
+    addq $64, %rsi
+    subq $1, %rdx
+    jnz .Lmemcpy_nt_loop
+    andq $63, %rcx
+    rep movsb
+    sfence
+    ret
 
+.align 16
 .global memmove
 memmove:
-    cmpq %rdi, %rsi
-    jae memcpy
+    cmpq $nt_threshold, %rdx
+    jb .Lmemmove_small
+    leaq (%rdi, %rdx), %rax
+    cmpq %rax, %rsi
+    jae .Lmemcpy_nt
+    leaq (%rsi, %rdx), %rax
+    cmpq %rax, %rdi
+    jae .Lmemcpy_nt
+.Lmemmove_small:
     movq %rdi, %rax
+    movq %rdx, %rcx
+    cmpq %rdi, %rsi
+    jb .Lmemmove_backwards
+    rep movsb
+    ret
+.Lmemmove_backwards:
     leaq -1(%rdi, %rdx), %rdi
     leaq -1(%rsi, %rdx), %rsi
-    movq %rdx, %rcx
     std
     rep movsb
     cld
     ret
 
+.align 16
 .global memset
 memset:
+    cmpq $nt_threshold, %rdx
+    jae .Lmemset_nt
     movq %rdi, %r8
-    movb %sil, %al
     movq %rdx, %rcx
+    movzbl %sil, %eax
     rep stosb
     movq %r8, %rax
     ret
-#if defined(__SSE2__)
+.Lmemset_nt:
+    movq %rdi, %rax
+    movzbl %sil, %esi
+    imul $0x01010101, %esi
+    movd %esi, %xmm0
+    pshufd $0, %xmm0, %xmm0
+    testq $63, %rdi
+    jz .Lmemset_nt_aligned
+    # NOTE: this unconditionally writes 64 bytes to align to 64 byte boundary
+    # but as nt branch is only taken for huge buffers, it doesnt add much overhead
+    movdqu %xmm0, 0(%rdi)
+    movdqu %xmm0, 16(%rdi)
+    movdqu %xmm0, 32(%rdi)
+    movdqu %xmm0, 48(%rdi)
+    movq %rdi, %rcx
+    andq $63, %rcx
+    leaq -64(%rdx, %rcx), %rdx
+    negq %rcx
+    leaq 64(%rdi, %rcx), %rdi
+.Lmemset_nt_aligned:
+    movq %rdx, %rcx
+    shrq $6, %rdx
+.align 16
+.Lmemset_nt_loop:
+    movntdq %xmm0, 0(%rdi)
+    movntdq %xmm0, 16(%rdi)
+    movntdq %xmm0, 32(%rdi)
+    movntdq %xmm0, 48(%rdi)
+    addq $64, %rdi
+    subq $1, %rdx
+    jnz .Lmemset_nt_loop
+    andq $63, %rcx
+    jnz .Lmemset_nt_bytes
+    sfence
+    ret
+.Lmemset_nt_bytes:
+    movq %rax, %rdx
+    movzbl %sil, %eax
+    rep stosb
+    movq %rdx, %rax
+    sfence
+    ret
 
+.align 16
 .global memchr
 memchr:
     testq %rdx, %rdx
-    jz .memchr_no_match
+    jz .Lmemchr_no_match
+    movzbl %sil, %esi
+    imul $0x01010101, %esi
     movd %esi, %xmm0
-    punpcklbw %xmm0, %xmm0
-    punpcklwd %xmm0, %xmm0
     pshufd $0, %xmm0, %xmm0
     movq %rdi, %rcx
     andq $15, %rcx
-    jz .memchr_loop
+    jz .Lmemchr_loop
     movq %rdi, %rsi
     subq %rcx, %rsi
@@ -50,40 +179,42 @@ memchr:
     pcmpeqb %xmm0, %xmm1
     pmovmskb %xmm1, %eax
     shrl %cl, %eax
-    jnz .memchr_match
+    jnz .Lmemchr_match
     leaq 16(%rsi), %rdi
     addq %rcx, %rdx
     subq $16, %rdx
-    jbe .memchr_no_match
-.memchr_loop:
+    jbe .Lmemchr_no_match
+.align 16
+.Lmemchr_loop:
     movdqa (%rdi), %xmm1
     pcmpeqb %xmm0, %xmm1
     pmovmskb %xmm1, %eax
     testl %eax, %eax
-    jnz .memchr_match
+    jnz .Lmemchr_match
     addq $16, %rdi
     subq $16, %rdx
-    ja .memchr_loop
-.memchr_no_match:
+    ja .Lmemchr_loop
+.Lmemchr_no_match:
     xorq %rax, %rax
     ret
-.memchr_match:
+.Lmemchr_match:
     bsfl %eax, %eax
     cmpq %rdx, %rax
-    jae .memchr_no_match
+    jae .Lmemchr_no_match
     addq %rdi, %rax
     ret
 
+.align 16
 .global memcmp
 memcmp:
     testq %rdx, %rdx
-    jz .memcmp_equal
+    jz .Lmemcmp_equal
     movq %rdi, %rax
     movq %rsi, %rcx
@@ -93,7 +224,7 @@ memcmp:
     cmovaq %rcx, %rax
     testq %rax, %rax
-    jz .memcmp_loop
+    jz .Lmemcmp_loop
     movq $16, %rcx
     subq %rax, %rcx
@@ -103,44 +234,46 @@ memcmp:
     subq %rcx, %rdx
-.memcmp_align_loop:
+.Lmemcmp_align_loop:
     movzbl (%rdi), %eax
     movzbl (%rsi), %r8d
     subl %r8d, %eax
-    jnz .memcmp_return
-    incq %rdi
-    incq %rsi
-    decq %rcx
-    jnz .memcmp_align_loop
-.memcmp_loop:
+    jnz .Lmemcmp_return
+    addq $1, %rdi
+    addq $1, %rsi
+    subq $1, %rcx
+    jnz .Lmemcmp_align_loop
+.align 16
+.Lmemcmp_loop:
     movdqu (%rdi), %xmm0
     movdqu (%rsi), %xmm1
     pcmpeqb %xmm0, %xmm1
     pmovmskb %xmm1, %eax
     xorl $0xFFFF, %eax
-    jnz .memcmp_differ
+    jnz .Lmemcmp_differ
     addq $16, %rdi
     addq $16, %rsi
     subq $16, %rdx
-    ja .memcmp_loop
-.memcmp_equal:
+    ja .Lmemcmp_loop
+.Lmemcmp_equal:
     xorl %eax, %eax
-.memcmp_return:
+.Lmemcmp_return:
     ret
-.memcmp_differ:
+.Lmemcmp_differ:
     bsfl %eax, %ecx
     cmpq %rdx, %rcx
-    jae .memcmp_equal
+    jae .Lmemcmp_equal
     movzbl (%rdi, %rcx), %eax
     movzbl (%rsi, %rcx), %edx
     subl %edx, %eax
     ret
 
+.align 16
 .global strlen
 strlen:
     movq %rdi, %rsi
@@ -149,7 +282,7 @@ strlen:
     movq %rsi, %rcx
     andq $15, %rcx
-    jz .strlen_loop
+    jz .Lstrlen_loop
     movq %rsi, %rdx
     subq %rcx, %rdx
@@ -157,24 +290,23 @@ strlen:
     pcmpeqb %xmm0, %xmm1
     pmovmskb %xmm1, %eax
     shrl %cl, %eax
-    jnz .strlen_null_found
+    jnz .Lstrlen_null_found
     leaq 16(%rdx), %rsi
-.strlen_loop:
+.align 16
+.Lstrlen_loop:
     movdqa (%rsi), %xmm1
     pcmpeqb %xmm0, %xmm1
     pmovmskb %xmm1, %eax
     testl %eax, %eax
-    jnz .strlen_null_found
+    jnz .Lstrlen_null_found
     addq $16, %rsi
-    jmp .strlen_loop
-.strlen_null_found:
+    jmp .Lstrlen_loop
+.Lstrlen_null_found:
     bsfl %eax, %eax
     addq %rsi, %rax
     subq %rdi, %rax
     ret
-#endif
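
For readers verifying the NOTE-d head-alignment trick in the .Lmemcpy_nt and
.Lmemset_nt paths above, the arithmetic checks out with concrete numbers. A
worked sketch, assuming %rdi = 0x1010 on entry (so %rdi & 63 = 16) and
%rdx = 32768:

    movq %rdi, %rcx             # rcx = 0x1010
    andq $63, %rcx              # rcx = 16, the offset into the 64-byte line
    leaq -64(%rdx, %rcx), %rdx  # rdx = 32768 + 16 - 64 = 32720,
                                # the count minus the 48 head bytes (64 - 16)
    negq %rcx                   # rcx = -16
    leaq 64(%rdi, %rcx), %rdi   # rdi = 0x1010 - 16 + 64 = 0x1040, 64-byte aligned

The 64 unaligned bytes stored up front cover those 48 head bytes; the aligned
loop's first 16 output bytes simply overwrite the tail of the head store. The
byte splat used by memset and memchr is the same kind of arithmetic: movzbl
zero-extends the fill byte (say 0xAB) to 0x000000AB, imul $0x01010101
replicates it into 0xABABABAB, and pshufd $0 broadcasts that dword across
%xmm0, yielding sixteen copies of the byte.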