From 3b23458eccd9d96377c9434d212f07977edd8b30 Mon Sep 17 00:00:00 2001 From: Bananymous Date: Wed, 7 Aug 2024 19:00:05 +0300 Subject: [PATCH] LibC: Start work on locales This patch adds 2 locales, POSIX locale and UTF8 locale. functions `mbstowcs()` and `strcoll()` use locales to do conversions and comparison respectively. --- .../LibC/include/bits/types/locale_t.h | 2 +- userspace/libraries/LibC/include/locale.h | 2 + userspace/libraries/LibC/locale.cpp | 117 ++++++++++++++++-- userspace/libraries/LibC/stdlib.cpp | 37 ++++++ userspace/libraries/LibC/string.cpp | 38 +++++- 5 files changed, 183 insertions(+), 13 deletions(-) diff --git a/userspace/libraries/LibC/include/bits/types/locale_t.h b/userspace/libraries/LibC/include/bits/types/locale_t.h index a19779c0d6..5be9a9c693 100644 --- a/userspace/libraries/LibC/include/bits/types/locale_t.h +++ b/userspace/libraries/LibC/include/bits/types/locale_t.h @@ -9,7 +9,7 @@ __BEGIN_DECLS #ifndef __locale_t_defined #define __locale_t_defined 1 - typedef int locale_t; + typedef enum { LOCALE_INVALID, LOCALE_POSIX, LOCALE_UTF8 } locale_t; #endif __END_DECLS diff --git a/userspace/libraries/LibC/include/locale.h b/userspace/libraries/LibC/include/locale.h index 52c930bd6c..6b48188474 100644 --- a/userspace/libraries/LibC/include/locale.h +++ b/userspace/libraries/LibC/include/locale.h @@ -68,6 +68,8 @@ locale_t newlocale(int category_mask, const char* locale, locale_t base); char* setlocale(int category, const char* locale); locale_t uselocale(locale_t newloc); +locale_t __getlocale(int category); + __END_DECLS #endif diff --git a/userspace/libraries/LibC/locale.cpp b/userspace/libraries/LibC/locale.cpp index c4ba8bbbb3..2fd67a5455 100644 --- a/userspace/libraries/LibC/locale.cpp +++ b/userspace/libraries/LibC/locale.cpp @@ -1,15 +1,112 @@ +#include + #include +#include #include -// FIXME: Actually support locales -char* setlocale(int category, const char* locale) -{ - (void)category; +static locale_t 
s_current_locales[LC_ALL] { + LOCALE_POSIX, + LOCALE_POSIX, + LOCALE_POSIX, + LOCALE_POSIX, + LOCALE_POSIX, + LOCALE_POSIX, +}; +static_assert(LC_ALL == 6); - static char s_locale[] = "C"; - if (locale == nullptr) - return s_locale; - if (strcmp(locale, "") == 0 || strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0) - return s_locale; - return nullptr; +static locale_t str_to_locale(const char* locale) +{ + if (*locale == '\0') + return LOCALE_UTF8; + + if (strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0) /* "POSIX" is accepted as an alias of "C", as the removed implementation did */ + return LOCALE_POSIX; + if (strcmp(locale, "C.UTF8") == 0) + return LOCALE_UTF8; + return LOCALE_INVALID; +} + +static const char* locale_to_str(locale_t locale) +{ + if (locale == LOCALE_POSIX) + return "C"; + if (locale == LOCALE_UTF8) + return "C.UTF8"; + ASSERT_NOT_REACHED(); +} + +char* setlocale(int category, const char* locale_str) +{ + static char s_locale_buffer[128]; + + if (locale_str == nullptr) + { + switch (category) + { + case LC_COLLATE: + case LC_CTYPE: + case LC_MESSAGES: + case LC_MONETARY: + case LC_NUMERIC: + case LC_TIME: + strcpy(s_locale_buffer, locale_to_str(s_current_locales[category])); + break; + case LC_ALL: + sprintf(s_locale_buffer, "%s;%s;%s;%s;%s;%s", + locale_to_str(s_current_locales[0]), + locale_to_str(s_current_locales[1]), + locale_to_str(s_current_locales[2]), + locale_to_str(s_current_locales[3]), + locale_to_str(s_current_locales[4]), + locale_to_str(s_current_locales[5]) + ); + break; + default: + return nullptr; + } + + return s_locale_buffer; + } + + locale_t locale = str_to_locale(locale_str); + if (locale == LOCALE_INVALID) + return nullptr; + + switch (category) + { + case LC_COLLATE: + case LC_CTYPE: + case LC_MESSAGES: + case LC_MONETARY: + case LC_NUMERIC: + case LC_TIME: + s_current_locales[category] = locale; + break; + case LC_ALL: + for (auto& current : s_current_locales) + current = locale; + break; + default: + return nullptr; + } + + strcpy(s_locale_buffer, locale_to_str(locale)); + 
return s_locale_buffer; +} + + +locale_t __getlocale(int category) +{ + switch (category) + { + case LC_COLLATE: + case LC_CTYPE: + case LC_MESSAGES: + case LC_MONETARY: + case LC_NUMERIC: + case LC_TIME: + return s_current_locales[category]; + default: + return LOCALE_INVALID; + } } diff --git a/userspace/libraries/LibC/stdlib.cpp b/userspace/libraries/LibC/stdlib.cpp index f659825269..8a11b025af 100644 --- a/userspace/libraries/LibC/stdlib.cpp +++ b/userspace/libraries/LibC/stdlib.cpp @@ -1,8 +1,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -512,6 +514,41 @@ int putenv(char* string) return 0; } +size_t mbstowcs(wchar_t* __restrict pwcs, const char* __restrict s, size_t n) +{ + auto* us = reinterpret_cast<const unsigned char*>(s); + + size_t len = 0; /* NOTE(review): pwcs is written to without a null check; POSIX's count-only mode (pwcs == nullptr) is not implemented here */ + + switch (__getlocale(LC_CTYPE)) + { + case LOCALE_INVALID: + ASSERT_NOT_REACHED(); + case LOCALE_POSIX: + while (*us && len < n) + pwcs[len++] = *us++; + break; + case LOCALE_UTF8: + while (*us && len < n) + { + auto wch = BAN::UTF8::to_codepoint(us); + if (wch == BAN::UTF8::invalid) + { + errno = EILSEQ; + return -1; + } + pwcs[len++] = wch; + us += BAN::UTF8::byte_length(*us); + } + break; + } + + if (len < n) + pwcs[len] = 0; + + return len; +} + void* bsearch(const void* key, const void* base, size_t nel, size_t width, int (*compar)(const void*, const void*)) { if (nel == 0) diff --git a/userspace/libraries/LibC/string.cpp b/userspace/libraries/LibC/string.cpp index 3aa100e294..853735db48 100644 --- a/userspace/libraries/LibC/string.cpp +++ b/userspace/libraries/LibC/string.cpp @@ -1,4 +1,8 @@ +#include +#include + #include +#include #include #include #include @@ -138,8 +142,38 @@ char* strncat(char* __restrict__ dest, const char* __restrict__ src, size_t n) int strcoll(const char* s1, const char* s2) { - // FIXME: support locales - return strcmp(s1, s2); + switch (__getlocale(LC_COLLATE)) + { + case LOCALE_INVALID: + ASSERT_NOT_REACHED(); + case LOCALE_POSIX: + return 
strcmp(s1, s2); + case LOCALE_UTF8: + { + const unsigned char* u1 = (unsigned char*)s1; + const unsigned char* u2 = (unsigned char*)s2; + if (!*u1 || !*u2) + return *u1 - *u2; + + wchar_t wc1, wc2; + while (*u1 && *u2) + { + wc1 = BAN::UTF8::to_codepoint(u1); + wc2 = BAN::UTF8::to_codepoint(u2); + if (wc1 == (wchar_t)BAN::UTF8::invalid || wc2 == (wchar_t)BAN::UTF8::invalid) + { + errno = EINVAL; + return -1; + } + if (wc1 != wc2) + return wc1 - wc2; /* the first differing code point decides the order */ + u1 += BAN::UTF8::byte_length(*u1); + u2 += BAN::UTF8::byte_length(*u2); + } + return *u1 - *u2; /* all shared code points matched: the string that ended first (a prefix) sorts first, 0 if both ended */ + } + } + ASSERT_NOT_REACHED(); } char* strdup(const char* str)