From 82d5d9ba58c5edd54b1c4916ab595dd431f085e6 Mon Sep 17 00:00:00 2001 From: Bananymous Date: Thu, 2 Apr 2026 15:30:31 +0300 Subject: [PATCH] LibC: Write memchr, memcmp and strlen with sse --- userspace/libraries/LibC/arch/i686/string.S | 43 ----- userspace/libraries/LibC/arch/x86_64/string.S | 178 +++++++++++++++--- 2 files changed, 148 insertions(+), 73 deletions(-) diff --git a/userspace/libraries/LibC/arch/i686/string.S b/userspace/libraries/LibC/arch/i686/string.S index d14adf86..d9d60ed6 100644 --- a/userspace/libraries/LibC/arch/i686/string.S +++ b/userspace/libraries/LibC/arch/i686/string.S @@ -1,35 +1,3 @@ -.global memchr -memchr: - xchgl 4(%esp), %edi - movl 8(%esp), %eax - movl 12(%esp), %ecx - movl $1, %edx - cmpl $1, %ecx # clear ZF if count is zero - repne scasb - cmovel %edi, %edx - leal -1(%edx), %eax - movl 4(%esp), %edi - ret - -.global memcmp -memcmp: - xchgl 4(%esp), %edi - xchgl 8(%esp), %esi - movl 12(%esp), %ecx - testl %ecx, %ecx # set ZF if count is zero - repe cmpsb - jne .memcmp_not_equal - xorl %eax, %eax - jmp .memcmp_done - .memcmp_not_equal: - movzbl -1(%edi), %eax - movzbl -1(%esi), %ecx - subl %ecx, %eax - .memcmp_done: - movl 4(%esp), %edi - movl 8(%esp), %esi - ret - .global memcpy memcpy: xchgl 4(%esp), %edi @@ -74,14 +42,3 @@ memset: movl 4(%esp), %edi movl %edx, %eax ret - -.global strlen -strlen: - xchgl 4(%esp), %edi - xorb %al, %al - movl $-1, %ecx - repne scasb - movl 4(%esp), %edi - movl $-2, %eax - subl %ecx, %eax - ret diff --git a/userspace/libraries/LibC/arch/x86_64/string.S b/userspace/libraries/LibC/arch/x86_64/string.S index e0ffa58e..4b985ab4 100644 --- a/userspace/libraries/LibC/arch/x86_64/string.S +++ b/userspace/libraries/LibC/arch/x86_64/string.S @@ -1,28 +1,3 @@ -.global memchr -memchr: - movb %sil, %al - movq %rdx, %rcx - movq $1, %rdx - cmpq $1, %rcx # clear ZF if count is zero - repne scasb - cmoveq %rdi, %rdx - leaq -1(%rdx), %rax - ret - -.global memcmp -memcmp: - movq %rdx, %rcx - testq %rcx, %rcx # set 
#if defined(__SSE2__)

#-----------------------------------------------------------------------
# void *memchr(const void *s, int c, size_t n)
# ABI:   SysV AMD64 — rdi = s, esi = c, rdx = n
# Out:   rax = pointer to first occurrence of (unsigned char)c, or NULL
# Clobb: rcx, rdx, rsi, xmm0, xmm1, flags
# Safety: every vector load is a 16-byte-ALIGNED movdqa, so any bytes
# read past s+n stay inside the same aligned line and can never cross
# into an unmapped page.
#-----------------------------------------------------------------------
.global memchr
memchr:
	testq %rdx, %rdx                # n == 0 -> nothing can match
	jz .memchr_no_match

	# broadcast the search byte into all 16 lanes of xmm0
	movd %esi, %xmm0
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
	pshufd $0, %xmm0, %xmm0

	movq %rdi, %rcx
	andq $15, %rcx                  # rcx = misalignment of s
	jz .memchr_loop

	# head: scan the aligned line containing s, then shift away the
	# rcx match bits that belong to bytes before s
	movq %rdi, %rsi
	subq %rcx, %rsi                 # rsi = s rounded down to 16
	movdqa (%rsi), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	shrl %cl, %eax                  # bit i now = "match at s + i"
	jnz .memchr_match               # rdi/rdx still original s/n here

	leaq 16(%rsi), %rdi             # continue at the next aligned line

	addq %rcx, %rdx                 # n -= (16 - misalignment), i.e. the
	subq $16, %rdx                  # bytes the head already covered
	jbe .memchr_no_match

	.memchr_loop:
	movdqa (%rdi), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	testl %eax, %eax
	jnz .memchr_match

	addq $16, %rdi
	subq $16, %rdx
	ja .memchr_loop                 # fall through when n is exhausted

	.memchr_no_match:
	xorq %rax, %rax
	ret

	.memchr_match:
	bsfl %eax, %eax                 # index of first set match bit
	cmpq %rdx, %rax
	jae .memchr_no_match            # match lies at or past s + n
	addq %rdi, %rax
	ret

#-----------------------------------------------------------------------
# int memcmp(const void *s1, const void *s2, size_t n)
# ABI:   SysV AMD64 — rdi = s1, rsi = s2, rdx = n
# Out:   eax = <0 / 0 / >0 for the first differing byte (unsigned)
# Clobb: rcx, rdx, rsi, rdi, xmm0, xmm1, flags
# Safety: the SSE loop only runs while at least 16 bytes remain and the
# tail is compared byte by byte, so no load ever touches memory past
# s1+n / s2+n.  (The previous version issued a 16-byte movdqu on
# possibly-unaligned pointers even when fewer than 16 bytes were left,
# which could read across a page boundary and fault.)
#-----------------------------------------------------------------------
.global memcmp
memcmp:
	cmpq $16, %rdx
	jb .memcmp_tail                 # short buffers: plain byte loop

	.memcmp_loop:
	movdqu (%rdi), %xmm0
	movdqu (%rsi), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	xorl $0xFFFF, %eax              # set bits = differing positions
	jnz .memcmp_differ

	addq $16, %rdi
	addq $16, %rsi
	subq $16, %rdx
	cmpq $16, %rdx
	jae .memcmp_loop

	.memcmp_tail:
	testq %rdx, %rdx
	jz .memcmp_equal

	.memcmp_tail_loop:
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	subl %ecx, %eax
	jnz .memcmp_return

	incq %rdi
	incq %rsi
	decq %rdx
	jnz .memcmp_tail_loop

	.memcmp_equal:
	xorl %eax, %eax
	.memcmp_return:
	ret

	.memcmp_differ:
	bsfl %eax, %ecx                 # first differing index; < 16 <= n,
	movzbl (%rdi, %rcx), %eax       # so it is always inside the buffers
	movzbl (%rsi, %rcx), %edx
	subl %edx, %eax
	ret
#-----------------------------------------------------------------------
# size_t strlen(const char *s)
# ABI:   SysV AMD64 — rdi = s
# Out:   rax = number of bytes before the first NUL
# Clobb: rcx, rdx, rsi, eax, xmm0, xmm1, flags
# Safety: every vector load is a 16-byte-ALIGNED movdqa, so bytes read
# past the terminating NUL stay inside the same aligned line and can
# never cross into an unmapped page.
#-----------------------------------------------------------------------
.global strlen
strlen:
	movq %rdi, %rsi                 # rsi = scan cursor, rdi keeps s

	pxor %xmm0, %xmm0               # xmm0 = 16 zero bytes to compare with

	movq %rsi, %rcx
	andq $15, %rcx                  # rcx = misalignment of s
	jz .strlen_loop

	# head: scan the aligned line containing s, then shift away the
	# rcx result bits that belong to bytes before s
	movq %rsi, %rdx
	subq %rcx, %rdx                 # rdx = s rounded down to 16
	movdqa (%rdx), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	shrl %cl, %eax                  # bit i now = "NUL at s + i"
	jnz .strlen_null_found          # rsi still = s here

	leaq 16(%rdx), %rsi             # continue at the next aligned line

	.strlen_loop:
	movdqa (%rsi), %xmm1
	pcmpeqb %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	testl %eax, %eax
	jnz .strlen_null_found

	addq $16, %rsi
	jmp .strlen_loop                # NUL must exist; loop until found

	.strlen_null_found:
	bsfl %eax, %eax                 # offset of NUL within current chunk
	addq %rsi, %rax                 # absolute address of the NUL
	subq %rdi, %rax                 # length = &NUL - s
	ret

#endif