diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index 309e231b..4ad7502a 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -168,10 +168,7 @@ set(BAN_SOURCES
 set(KLIBC_SOURCES
 	klibc/ctype.cpp
 	klibc/string.cpp
-
-	# Ehhh don't do this but for now libc uses the same stuff kernel can use
-	# This won't work after libc starts using sse implemetations tho
-	../userspace/libraries/LibC/arch/${BANAN_ARCH}/string.S
+	klibc/arch/${BANAN_ARCH}/string.S
 )
 
 set(LIBDEFLATE_SOURCE
diff --git a/kernel/klibc/arch/i686/string.S b/kernel/klibc/arch/i686/string.S
new file mode 100644
index 00000000..38f08a7a
--- /dev/null
+++ b/kernel/klibc/arch/i686/string.S
@@ -0,0 +1,47 @@
+.align 16
+.global memcpy
+memcpy:
+	xchgl 4(%esp), %edi
+	xchgl 8(%esp), %esi
+	movl 12(%esp), %ecx
+	movl %edi, %edx
+	rep movsb
+	movl 4(%esp), %edi
+	movl 8(%esp), %esi
+	movl %edx, %eax
+	ret
+
+.align 16
+.global memmove
+memmove:
+	xchgl 4(%esp), %edi
+	xchgl 8(%esp), %esi
+	movl 12(%esp), %ecx
+	movl %edi, %edx
+	cmpl %edi, %esi
+	jb .memmove_slow
+	rep movsb
+ .memmove_done:
+	movl 4(%esp), %edi
+	movl 8(%esp), %esi
+	movl %edx, %eax
+	ret
+ .memmove_slow:
+	leal -1(%edi, %ecx), %edi
+	leal -1(%esi, %ecx), %esi
+	std
+	rep movsb
+	cld
+	jmp .memmove_done
+
+.align 16
+.global memset
+memset:
+	xchgl 4(%esp), %edi
+	movl 8(%esp), %eax
+	movl 12(%esp), %ecx
+	movl %edi, %edx
+	rep stosb
+	movl 4(%esp), %edi
+	movl %edx, %eax
+	ret
diff --git a/kernel/klibc/arch/x86_64/string.S b/kernel/klibc/arch/x86_64/string.S
new file mode 100644
index 00000000..042d7b3e
--- /dev/null
+++ b/kernel/klibc/arch/x86_64/string.S
@@ -0,0 +1,31 @@
+.align 16
+.global memcpy
+memcpy:
+	movq %rdi, %rax
+	movq %rdx, %rcx
+	rep movsb
+	ret
+
+.align 16
+.global memmove
+memmove:
+	cmpq %rdi, %rsi
+	jae memcpy
+	movq %rdi, %rax
+	leaq -1(%rdi, %rdx), %rdi
+	leaq -1(%rsi, %rdx), %rsi
+	movq %rdx, %rcx
+	std
+	rep movsb
+	cld
+	ret
+
+.align 16
+.global memset
+memset:
+	movq %rdi, %r8
+	movb %sil, %al
+	movq %rdx, %rcx
+	rep stosb
+	movq %r8, %rax
+	ret
diff --git a/userspace/libraries/LibC/arch/i686/string.S b/userspace/libraries/LibC/arch/i686/string.S
index d9d60ed6..38f08a7a 100644
--- a/userspace/libraries/LibC/arch/i686/string.S
+++ b/userspace/libraries/LibC/arch/i686/string.S
@@ -1,3 +1,4 @@
+.align 16
 .global memcpy
 memcpy:
 	xchgl 4(%esp), %edi
@@ -10,6 +11,7 @@ memcpy:
 	movl %edx, %eax
 	ret
 
+.align 16
 .global memmove
 memmove:
 	xchgl 4(%esp), %edi
@@ -32,6 +34,7 @@ memmove:
 	cld
 	jmp .memmove_done
 
+.align 16
 .global memset
 memset:
 	xchgl 4(%esp), %edi
diff --git a/userspace/libraries/LibC/arch/x86_64/string.S b/userspace/libraries/LibC/arch/x86_64/string.S
index 4b985ab4..3eae301c 100644
--- a/userspace/libraries/LibC/arch/x86_64/string.S
+++ b/userspace/libraries/LibC/arch/x86_64/string.S
@@ -1,48 +1,177 @@
+.set nt_threshold, 32 * 1024
+
+.align 16
 .global memcpy
 memcpy:
+	cmpq $nt_threshold, %rdx
+	jae .Lmemcpy_nt
 	movq %rdi, %rax
 	movq %rdx, %rcx
 	rep movsb
 	ret
+ .Lmemcpy_nt:
+	movq %rdi, %rax
+
+	testq $63, %rdi
+	jz .Lmemcpy_nt_aligned
+
+	# NOTE: this unconditionally copies 64 bytes to align to 64 byte boundary
+	#       but as nt branch is only taken for huge buffers, it doesnt add much overhead
+
+	movdqu 0(%rsi), %xmm0
+	movdqu 16(%rsi), %xmm1
+	movdqu 32(%rsi), %xmm2
+	movdqu 48(%rsi), %xmm3
+
+	movdqu %xmm0, 0(%rdi)
+	movdqu %xmm1, 16(%rdi)
+	movdqu %xmm2, 32(%rdi)
+	movdqu %xmm3, 48(%rdi)
+
+	movq %rdi, %rcx
+	andq $63, %rcx
+	leaq -64(%rdx, %rcx), %rdx
+
+	negq %rcx
+	leaq 64(%rdi, %rcx), %rdi
+	leaq 64(%rsi, %rcx), %rsi
+
+ .Lmemcpy_nt_aligned:
+	movq %rdx, %rcx
+	shrq $6, %rdx
+
+	.align 16
+ .Lmemcpy_nt_loop:
+	prefetchnta 256(%rsi)
+	prefetchnta 32+256(%rsi)
+
+	movdqu 0(%rsi), %xmm0
+	movdqu 16(%rsi), %xmm1
+	movdqu 32(%rsi), %xmm2
+	movdqu 48(%rsi), %xmm3
+
+	movntdq %xmm0, 0(%rdi)
+	movntdq %xmm1, 16(%rdi)
+	movntdq %xmm2, 32(%rdi)
+	movntdq %xmm3, 48(%rdi)
+
+	addq $64, %rdi
+	addq $64, %rsi
+	subq $1, %rdx
+	jnz .Lmemcpy_nt_loop
+
+	andq $63, %rcx
+	rep movsb
+	sfence
+	ret
+
 
+.align 16
 .global memmove
 memmove:
-	cmpq %rdi, %rsi
-	jae memcpy
+	cmpq $nt_threshold, %rdx
+	jb .Lmemmove_small
+	leaq (%rdi, %rdx), %rax
+	cmpq %rax, %rsi
+	jae .Lmemcpy_nt
+	leaq (%rsi, %rdx), %rax
+	cmpq %rax, %rdi
+	jae .Lmemcpy_nt
+ .Lmemmove_small:
 	movq %rdi, %rax
+	movq %rdx, %rcx
+	cmpq %rdi, %rsi
+	jb .Lmemmove_backwards
+	rep movsb
+	ret
+ .Lmemmove_backwards:
 	leaq -1(%rdi, %rdx), %rdi
 	leaq -1(%rsi, %rdx), %rsi
-	movq %rdx, %rcx
 	std
 	rep movsb
 	cld
 	ret
 
+.align 16
 .global memset
 memset:
+	cmpq $nt_threshold, %rdx
+	jae .Lmemset_nt
 	movq %rdi, %r8
-	movb %sil, %al
 	movq %rdx, %rcx
+	movzbl %sil, %eax
 	rep stosb
 	movq %r8, %rax
 	ret
+ .Lmemset_nt:
+	movq %rdi, %rax
 
-#if defined(__SSE2__)
+	movzbl %sil, %esi
+	imul $0x01010101, %esi
+	movd %esi, %xmm0
+	pshufd $0, %xmm0, %xmm0
 
+	testq $63, %rdi
+	jz .Lmemset_nt_aligned
+
+	# NOTE: this unconditionally writes 64 bytes to align to 64 byte boundary
+	#       but as nt branch is only taken for huge buffers, it doesnt add much overhead
+
+	movdqu %xmm0, 0(%rdi)
+	movdqu %xmm0, 16(%rdi)
+	movdqu %xmm0, 32(%rdi)
+	movdqu %xmm0, 48(%rdi)
+
+	movq %rdi, %rcx
+	andq $63, %rcx
+	leaq -64(%rdx, %rcx), %rdx
+
+	negq %rcx
+	leaq 64(%rdi, %rcx), %rdi
+
+ .Lmemset_nt_aligned:
+	movq %rdx, %rcx
+	shrq $6, %rdx
+
+	.align 16
+ .Lmemset_nt_loop:
+	movntdq %xmm0, 0(%rdi)
+	movntdq %xmm0, 16(%rdi)
+	movntdq %xmm0, 32(%rdi)
+	movntdq %xmm0, 48(%rdi)
+
+	addq $64, %rdi
+	subq $1, %rdx
+	jnz .Lmemset_nt_loop
+
+	andq $63, %rcx
+	jnz .Lmemset_nt_bytes
+	sfence
+	ret
+
+ .Lmemset_nt_bytes:
+	movq %rax, %rdx
+	movzbl %sil, %eax
+	rep stosb
+	movq %rdx, %rax
+	sfence
+	ret
+
+.align 16
 .global memchr
 memchr:
 	testq %rdx, %rdx
-	jz .memchr_no_match
+	jz .Lmemchr_no_match
 
+	movzbl %sil, %esi
+	imul $0x01010101, %esi
 	movd %esi, %xmm0
-	punpcklbw %xmm0, %xmm0
-	punpcklwd %xmm0, %xmm0
 	pshufd $0, %xmm0, %xmm0
 
 	movq %rdi, %rcx
 	andq $15, %rcx
-	jz .memchr_loop
+	jz .Lmemchr_loop
 
 	movq %rdi, %rsi
 	subq %rcx, %rsi
 
@@ -50,40 +179,42 @@ memchr:
 	pcmpeqb %xmm0, %xmm1
 	pmovmskb %xmm1, %eax
 	shrl %cl, %eax
-	jnz .memchr_match
+	jnz .Lmemchr_match
 
 	leaq 16(%rsi), %rdi
 	addq %rcx, %rdx
 	subq $16, %rdx
-	jbe .memchr_no_match
+	jbe .Lmemchr_no_match
 
- .memchr_loop:
+	.align 16
+ .Lmemchr_loop:
 	movdqa (%rdi), %xmm1
 	pcmpeqb %xmm0, %xmm1
 	pmovmskb %xmm1, %eax
 	testl %eax, %eax
-	jnz .memchr_match
+	jnz .Lmemchr_match
 
 	addq $16, %rdi
 	subq $16, %rdx
-	ja .memchr_loop
+	ja .Lmemchr_loop
 
- .memchr_no_match:
+ .Lmemchr_no_match:
 	xorq %rax, %rax
 	ret
 
- .memchr_match:
+ .Lmemchr_match:
 	bsfl %eax, %eax
 	cmpq %rdx, %rax
-	jae .memchr_no_match
+	jae .Lmemchr_no_match
 
 	addq %rdi, %rax
 	ret
 
+.align 16
 .global memcmp
 memcmp:
 	testq %rdx, %rdx
-	jz .memcmp_equal
+	jz .Lmemcmp_equal
 
 	movq %rdi, %rax
 	movq %rsi, %rcx
@@ -93,7 +224,7 @@
 	cmovaq %rcx, %rax
 
 	testq %rax, %rax
-	jz .memcmp_loop
+	jz .Lmemcmp_loop
 
 	movq $16, %rcx
 	subq %rax, %rcx
@@ -103,44 +234,46 @@
 
 	subq %rcx, %rdx
 
- .memcmp_align_loop:
+ .Lmemcmp_align_loop:
 	movzbl (%rdi), %eax
 	movzbl (%rsi), %r8d
 	subl %r8d, %eax
-	jnz .memcmp_return
+	jnz .Lmemcmp_return
 
-	incq %rdi
-	incq %rsi
-	decq %rcx
-	jnz .memcmp_align_loop
+	addq $1, %rdi
+	addq $1, %rsi
+	subq $1, %rcx
+	jnz .Lmemcmp_align_loop
 
- .memcmp_loop:
+	.align 16
+ .Lmemcmp_loop:
 	movdqu (%rdi), %xmm0
 	movdqu (%rsi), %xmm1
 	pcmpeqb %xmm0, %xmm1
 	pmovmskb %xmm1, %eax
 	xorl $0xFFFF, %eax
-	jnz .memcmp_differ
+	jnz .Lmemcmp_differ
 
 	addq $16, %rdi
 	addq $16, %rsi
 	subq $16, %rdx
-	ja .memcmp_loop
+	ja .Lmemcmp_loop
 
- .memcmp_equal:
+ .Lmemcmp_equal:
 	xorl %eax, %eax
- .memcmp_return:
+ .Lmemcmp_return:
 	ret
 
- .memcmp_differ:
+ .Lmemcmp_differ:
 	bsfl %eax, %ecx
 	cmpq %rdx, %rcx
-	jae .memcmp_equal
+	jae .Lmemcmp_equal
 	movzbl (%rdi, %rcx), %eax
 	movzbl (%rsi, %rcx), %edx
 	subl %edx, %eax
 	ret
 
+.align 16
 .global strlen
 strlen:
 	movq %rdi, %rsi
@@ -149,7 +282,7 @@
 
 	movq %rsi, %rcx
 	andq $15, %rcx
-	jz .strlen_loop
+	jz .Lstrlen_loop
 
 	movq %rsi, %rdx
 	subq %rcx, %rdx
@@ -157,24 +290,23 @@
 	pcmpeqb %xmm0, %xmm1
 	pmovmskb %xmm1, %eax
 	shrl %cl, %eax
-	jnz .strlen_null_found
+	jnz .Lstrlen_null_found
 
 	leaq 16(%rdx), %rsi
 
- .strlen_loop:
+	.align 16
+ .Lstrlen_loop:
 	movdqa (%rsi), %xmm1
 	pcmpeqb %xmm0, %xmm1
 	pmovmskb %xmm1, %eax
 	testl %eax, %eax
-	jnz .strlen_null_found
+	jnz .Lstrlen_null_found
 
 	addq $16, %rsi
-	jmp .strlen_loop
+	jmp .Lstrlen_loop
 
- .strlen_null_found:
+ .Lstrlen_null_found:
 	bsfl %eax, %eax
 	addq %rsi, %rax
 	subq %rdi, %rax
 	ret
-
-#endif
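
For review, here is a rough C-with-SSE2-intrinsics sketch of what the new `.Lmemcpy_nt` path above does. It is illustrative only and not part of the patch; `memcpy_nt_sketch` and the C-level structure are made up for this note, and only the 32 KiB threshold, the 64-byte alignment fixup, and the streaming stores are taken from the assembly.

```c
// Illustrative sketch of the non-temporal copy path, assuming SSE2 and x86_64.
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

#define NT_THRESHOLD (32 * 1024) /* mirrors `.set nt_threshold, 32 * 1024` */

void* memcpy_nt_sketch(void* dst, const void* src, size_t n)
{
	uint8_t* d = (uint8_t*)dst;
	const uint8_t* s = (const uint8_t*)src;

	if (n < NT_THRESHOLD) {
		// small copies stay on the plain path (`rep movsb` in the assembly)
		for (size_t i = 0; i < n; i++)
			d[i] = s[i];
		return dst;
	}

	if ((uintptr_t)d & 63) {
		// unconditionally copy one 64-byte block, then step past the misalignment
		// so the destination is 64-byte aligned for the movntdq stores
		for (size_t i = 0; i < 64; i += 16)
			_mm_storeu_si128((__m128i*)(d + i), _mm_loadu_si128((const __m128i*)(s + i)));
		size_t adjust = 64 - ((uintptr_t)d & 63);
		d += adjust;
		s += adjust;
		n -= adjust;
	}

	// stream whole 64-byte blocks with non-temporal stores so a huge copy
	// does not evict everything else from the cache
	for (size_t blocks = n / 64; blocks != 0; blocks--) {
		_mm_prefetch((const char*)(s + 256), _MM_HINT_NTA);
		for (size_t i = 0; i < 64; i += 16)
			_mm_stream_si128((__m128i*)(d + i), _mm_loadu_si128((const __m128i*)(s + i)));
		d += 64;
		s += 64;
	}

	// copy the tail bytes and fence so the non-temporal stores are ordered
	for (size_t i = 0; i < (n & 63); i++)
		d[i] = s[i];
	_mm_sfence();
	return dst;
}
```

Below the threshold the cache-friendly `rep movsb` path is still the better choice, which is why the streaming path only kicks in for buffers where cache pollution actually hurts.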
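The reworked memmove only needs the backwards copy when the two ranges actually overlap; large non-overlapping moves fall through to the non-temporal memcpy path. Roughly, and again only as an illustration (`memmove_sketch` is a hypothetical name, and plain `memcpy` stands in for the `.Lmemcpy_nt` fall-through):

```c
// Illustrative sketch of the new memmove dispatch logic.
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define NT_THRESHOLD (32 * 1024)

void* memmove_sketch(void* dst, const void* src, size_t n)
{
	uint8_t* d = (uint8_t*)dst;
	const uint8_t* s = (const uint8_t*)src;

	// large buffers that do not overlap can take the non-temporal memcpy path
	if (n >= NT_THRESHOLD
		&& ((uintptr_t)s >= (uintptr_t)d + n || (uintptr_t)d >= (uintptr_t)s + n))
		return memcpy(dst, src, n);

	if ((uintptr_t)s >= (uintptr_t)d) {
		// forward copy is safe when the source does not start before the destination
		for (size_t i = 0; i < n; i++)
			d[i] = s[i];
	} else {
		// otherwise copy backwards so overlapping bytes are read before being overwritten
		for (size_t i = n; i > 0; i--)
			d[i - 1] = s[i - 1];
	}
	return dst;
}
```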