diff --git a/userspace/libraries/LibC/CMakeLists.txt b/userspace/libraries/LibC/CMakeLists.txt index e280df63..9f27cf45 100644 --- a/userspace/libraries/LibC/CMakeLists.txt +++ b/userspace/libraries/LibC/CMakeLists.txt @@ -29,6 +29,7 @@ set(LIBC_SOURCES printf_impl.cpp pthread.cpp pwd.cpp + regex.cpp scanf_impl.cpp sched.cpp semaphore.cpp diff --git a/userspace/libraries/LibC/include/regex.h b/userspace/libraries/LibC/include/regex.h index f84badf8..8d244d07 100644 --- a/userspace/libraries/LibC/include/regex.h +++ b/userspace/libraries/LibC/include/regex.h @@ -10,9 +10,47 @@ __BEGIN_DECLS #define __need_size_t #include +#include +#include + +typedef enum +{ + _re_literal, + _re_group, + _re_anchor_begin, + _re_anchor_end, + _re_group_tag1, + _re_group_tag2, +} _regex_elem_e; + +typedef struct +{ + int min; + int max; +} _regex_qualifier_t; + +typedef struct _regex_elem_t +{ + _regex_elem_e type; + _regex_qualifier_t qualifier; + union + { + uint32_t literal[0x100 / 32]; + size_t group_tag; + struct { + struct _regex_elem_t* elements; + size_t elements_len; + size_t index; + } group; + } as; +} _regex_elem_t; + typedef struct { size_t re_nsub; + _regex_elem_t* _compiled; + size_t _compiled_len; + int _cflags; } regex_t; typedef __PTRDIFF_TYPE__ regoff_t; @@ -23,30 +61,31 @@ typedef struct regoff_t rm_eo; /* Byte offset from start of string of the first character after the end of substring. */ } regmatch_t; -#define REG_EXTENDED 0x01 -#define REG_ICASE 0x02 -#define REG_NOSUB 0x04 -#define REG_NEWLINE 0x80 +#define REG_EXTENDED 0x1 +#define REG_ICASE 0x2 +#define REG_NOSUB 0x4 +#define REG_NEWLINE 0x8 -#define REG_NOTBOL 0x0001 -#define REG_NOTEOL 0x0002 -#define REG_NOMATCH 0x0004 -#define REG_BADPAT 0x0008 -#define REG_ECOLLATE 0x0010 -#define REG_ECTYPE 0x0020 -#define REG_EESCAPE 0x0040 -#define REG_ESUBREG 0x0080 -#define REG_EBRACK 0x0100 -#define REG_EPAREN 0x0200 -#define REG_EBRACE 0x0400 -#define REG_BADBR 0x0800 -#define REG_ERANGE 0x1000 -#define REG_ESPACE 0x2000 -#define REG_BADRPT 0x4000 +#define REG_NOTBOL 0x1 +#define REG_NOTEOL 0x2 + +#define REG_NOMATCH 1 +#define REG_BADPAT 2 +#define REG_ECOLLATE 3 +#define REG_ECTYPE 4 +#define REG_EESCAPE 5 +#define REG_ESUBREG 6 +#define REG_EBRACK 7 +#define REG_EPAREN 8 +#define REG_EBRACE 9 +#define REG_BADBR 10 +#define REG_ERANGE 11 +#define REG_ESPACE 12 +#define REG_BADRPT 13 int regcomp(regex_t* __restrict preg, const char* __restrict pattern, int cflags); size_t regerror(int errcode, const regex_t* __restrict preg, char* __restrict errbuf, size_t errbuf_size); -int regexec(const regex_t* __restrict preg, const char* __restrict string, size_t nmatch, regmatch_t pmatch[__restrict], int eflags); +int regexec(const regex_t* __restrict preg, const char* __restrict string, size_t nmatch, regmatch_t pmatch[], int eflags); void regfree(regex_t* preg); __END_DECLS diff --git a/userspace/libraries/LibC/regex.cpp b/userspace/libraries/LibC/regex.cpp new file mode 100644 index 00000000..1551be3c --- /dev/null +++ b/userspace/libraries/LibC/regex.cpp @@ -0,0 +1,607 @@ +#include +#include +#include +#include +#include +#include + +static void regfree_element(_regex_elem_t* element) +{ + switch (element->type) + { + case _re_anchor_begin: + case _re_anchor_end: + case _re_group_tag1: + case _re_group_tag2: + case _re_literal: + break; + case _re_group: + for (size_t i = 0; i < element->as.group.elements_len; i++) + regfree_element(&element->as.group.elements[i]); + free(element->as.group.elements); + break; + } +} + +static bool append_element(_regex_elem_t* group, _regex_elem_t new_element) +{ + assert(group->type == _re_group); + + _regex_elem_t* new_elements = static_cast<_regex_elem_t*>(realloc( + group->as.group.elements, + (group->as.group.elements_len + 1) * sizeof(_regex_elem_t) + )); + if (new_elements == nullptr) + return false; + + group->as.group.elements = new_elements; + group->as.group.elements[group->as.group.elements_len] = new_element; + group->as.group.elements_len++; + return true; +} + +static constexpr void literal_init(uint32_t literal[8]) +{ + for (size_t i = 0; i < 8; i++) + literal[i] = 0; +} + +static constexpr void literal_add(uint32_t literal[8], char ch) +{ + const uint8_t u8 = ch; + literal[u8 / 32] |= static_cast(1) << (u8 % 32); +} + +static constexpr void literal_invert(uint32_t literal[8]) +{ + for (size_t i = 0; i < 8; i++) + literal[i] ^= 0xFFFFFFFF; +} + +static constexpr bool literal_has(const uint32_t literal[8], char ch) +{ + const uint8_t u8 = ch; + return literal[u8 / 32] & (static_cast(1) << (u8 % 32)); +} + +static size_t parse_bracket_expression(const char* expression, uint32_t literal[8], bool is_case_insensitive) +{ + assert(*expression == '['); + + literal_init(literal); + + size_t index = 1; + + const bool is_inverse = (expression[index] == '^'); + if (is_inverse) + index++; + + if (expression[index] == ']') + { + literal_add(literal, ']'); + index++; + } + + const auto add_character = + [literal, is_case_insensitive](uint8_t ch) + { + if (!is_case_insensitive) + literal_add(literal, ch); + else + { + literal_add(literal, toupper(ch)); + literal_add(literal, tolower(ch)); + } + }; + + while (expression[index] && expression[index] != ']') + { + if (expression[index] == '[') + { + bool sequence = false; + switch (expression[index + 1]) + { + case '.': case ':': case '=': + sequence = true; + break; + } + + if (sequence) + { + const char target = expression[index + 1]; + index += 2; + + size_t len = 0; + while (expression[index + len] && expression[index + len] != target && expression[index + len + 1] != ']') + len++; + if (expression[index + len] == '\0') + return -REG_EBRACK; + + switch (target) + { + case '.': + fprintf(stddbg, "regcomp: collating elements are not supported\n"); + return -REG_ECOLLATE; + case '=': + fprintf(stddbg, "regcomp: equivalence classes are not supported\n"); + if (len == 1) + add_character(expression[index]); + break; + case ':': + int (*func)(int) = nullptr; + if (false); +#define CHECK_CHARACTER_CLASS(name) else if (strncmp(#name, expression + index, len) == 0) func = is##name + CHECK_CHARACTER_CLASS(alnum); + CHECK_CHARACTER_CLASS(alpha); + CHECK_CHARACTER_CLASS(blank); + CHECK_CHARACTER_CLASS(cntrl); + CHECK_CHARACTER_CLASS(digit); + CHECK_CHARACTER_CLASS(graph); + CHECK_CHARACTER_CLASS(lower); + CHECK_CHARACTER_CLASS(print); + CHECK_CHARACTER_CLASS(punct); + CHECK_CHARACTER_CLASS(space); + CHECK_CHARACTER_CLASS(upper); + CHECK_CHARACTER_CLASS(xdigit); +#undef CHECK_CHARACTER_CLASS + if (func == nullptr) + { + fprintf(stddbg, "regcomp: unrecognized character class '%.*s'\n", static_cast(len), expression); + return -REG_ECTYPE; + } + for (int ch = 0; ch < 0x100; ch++) + if (func(ch)) + add_character(ch); + break; + } + + index += len + 2; + continue; + } + } + + if (expression[index + 1] == '-' && expression[index + 2] != ']') + { + const int ch_s = expression[index]; + const int ch_e = expression[index + 2]; + for (int ch = ch_s; ch <= ch_e; ch++) + add_character(ch); + index += 3; + continue; + } + + add_character(expression[index]); + index++; + } + + if (expression[index] != ']') + return -REG_EBRACK; + + if (is_inverse) + literal_invert(literal); + return index + 1; +} + +int regcomp(regex_t* __restrict preg, const char* __restrict pattern, int cflags) +{ +#define REGCOMP_ERROR(err) do { error = err; goto regcomp_error; } while (false) + int error = 0; + + const bool is_extended = (cflags & REG_EXTENDED); + const bool is_case_insensitive = (cflags & REG_ICASE); + + const auto can_add_qualifier = + [](const _regex_elem_t& elem) -> bool + { + if (elem.type == _re_anchor_begin || elem.type == _re_anchor_end) + return false; + return elem.qualifier.min == 1 && elem.qualifier.max == 1; + }; + + size_t stack_len = 1; + _regex_elem_t** stack = static_cast<_regex_elem_t**>(malloc(sizeof(_regex_elem_t*))); + if (stack == nullptr) + REGCOMP_ERROR(REG_ESPACE); + + stack[0] = static_cast<_regex_elem_t*>(malloc(sizeof(_regex_elem_t))); + if (stack[0] == nullptr) + REGCOMP_ERROR(REG_ESPACE); + stack[0][0] = { _re_group, { 1, 1 }, { .group = { nullptr, 0, 0 } } }; + + *preg = { + .re_nsub = 0, + ._compiled = nullptr, + ._compiled_len = 0, + ._cflags = cflags, + }; + + for (size_t i = 0; pattern[i]; i++) + { + auto* current = stack[stack_len - 1]; + assert(current->type == _re_group); + + switch (pattern[i]) + { + case '.': + if (!append_element(current, { _re_literal, { 1, 1 }, {} })) + REGCOMP_ERROR(REG_ESPACE); + literal_init(current->as.group.elements[current->as.group.elements_len - 1].as.literal); + literal_add(current->as.group.elements[current->as.group.elements_len - 1].as.literal, '\n'); + literal_invert(current->as.group.elements[current->as.group.elements_len - 1].as.literal); + break; + case '*': + if (current->as.group.elements_len == 0 || current->as.group.elements[current->as.group.elements_len - 1].type == _re_anchor_begin) + goto case_default; // valid in BRE, UB in ERE + if (!can_add_qualifier(current->as.group.elements[current->as.group.elements_len - 1])) + REGCOMP_ERROR(REG_BADRPT); + current->as.group.elements[current->as.group.elements_len - 1].qualifier = { 0, RE_DUP_MAX }; + break; + case '?': + if (!can_add_qualifier(current->as.group.elements[current->as.group.elements_len - 1])) + REGCOMP_ERROR(REG_BADRPT); + current->as.group.elements[current->as.group.elements_len - 1].qualifier = { 0, 1 }; + break; + case '+': + if (!can_add_qualifier(current->as.group.elements[current->as.group.elements_len - 1])) + REGCOMP_ERROR(REG_BADRPT); + current->as.group.elements[current->as.group.elements_len - 1].qualifier = { 1, RE_DUP_MAX }; + break; + case '(': + if (!is_extended) + goto case_default; + case_openparen: + { + _regex_elem_t** new_stack = static_cast<_regex_elem_t**>(realloc(stack, (stack_len + 1) * sizeof(_regex_elem_t*))); + _regex_elem_t* new_element = static_cast<_regex_elem_t*>(malloc(sizeof(_regex_elem_t))); + if (new_stack == nullptr || new_element == nullptr) + { + free(new_stack); + free(new_element); + REGCOMP_ERROR(REG_ESPACE); + } + stack = new_stack; + stack[stack_len] = new_element; + stack[stack_len][0] = { _re_group, { 1, 1 }, { .group = { nullptr, 0, ++preg->re_nsub }} }; + stack_len++; + break; + } + case ')': + if (!is_extended) + goto case_default; + case_closeparen: + { + if (stack_len <= 1) + REGCOMP_ERROR(REG_EPAREN); + auto* prev = stack[stack_len - 2]; + _regex_elem_t* new_elements = static_cast<_regex_elem_t*>(realloc( + prev->as.group.elements, + (prev->as.group.elements_len + 1) * sizeof(_regex_elem_t) + )); + if (new_elements == nullptr) + REGCOMP_ERROR(REG_ESPACE); + prev->as.group.elements = new_elements; + prev->as.group.elements[prev->as.group.elements_len] = *current; + prev->as.group.elements_len++; + free(current); + stack_len--; + break; + } + case '[': + { + if (!append_element(current, { _re_literal, { 1, 1 }, {} })) + REGCOMP_ERROR(REG_ESPACE); + ssize_t consumed = parse_bracket_expression(pattern + i, current->as.group.elements[current->as.group.elements_len - 1].as.literal, is_case_insensitive); + if (consumed <= 0) + REGCOMP_ERROR(-consumed); + i += consumed - 1; + break; + } + case '{': + if (!is_extended) + goto case_default; + case_openbrace: + if (!can_add_qualifier(current->as.group.elements[current->as.group.elements_len - 1])) + REGCOMP_ERROR(REG_BADRPT); + { + i++; + int min = 0, max = RE_DUP_MAX; + bool found_comma = false; + while (pattern[i] && pattern[i] != (is_extended ? '}' : '\\')) + { + if (pattern[i] == ',') + { + if (found_comma) + REGCOMP_ERROR(REG_BADPAT); + found_comma = true; + if (isdigit(pattern[i + 1])) + max = 0; + } + else if (isdigit(pattern[i])) + { + auto& val = found_comma ? max : min; + val = (val * 10) + (pattern[i] - '0'); + if (val > RE_DUP_MAX) + REGCOMP_ERROR(REG_ERANGE); + } + else + { + REGCOMP_ERROR(REG_BADPAT); + } + i++; + } + if (pattern[i] == '\0') + REGCOMP_ERROR(REG_EBRACE); + if (!is_extended && pattern[++i] != '}') + REGCOMP_ERROR(REG_BADPAT); + if (!found_comma) + max = min; + if (max < min) + REGCOMP_ERROR(REG_ERANGE); + current->as.group.elements[current->as.group.elements_len - 1].qualifier = { min, max }; + break; + } + case '|': + if (!is_extended) + goto case_default; + fprintf(stddbg, "regcomp: alternations are not supported"); + REGCOMP_ERROR(REG_BADPAT); + case '^': + if (!is_extended && i != 0) + goto case_default; + if (!append_element(current, { _re_anchor_begin, { 1, 1 }, {} })) + REGCOMP_ERROR(REG_ESPACE); + break; + case '$': + if (!is_extended && pattern[i + 1] != '\0') + goto case_default; + if (!append_element(current, { _re_anchor_end, { 1, 1 }, {} })) + REGCOMP_ERROR(REG_ESPACE); + break; + case '\\': + i++; + if (pattern[i] == '\0') + REGCOMP_ERROR(REG_EESCAPE); + if (is_extended) + goto case_default; + switch (pattern[i]) + { + case '(': goto case_openparen; + case ')': goto case_closeparen; + case '{': goto case_openbrace; + case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + fprintf(stddbg, "regcomp: back-references are not supported"); + REGCOMP_ERROR(REG_BADPAT); + } + [[fallthrough]]; + default: + case_default: + if (!append_element(current, { _re_literal, { 1, 1 }, {} })) + REGCOMP_ERROR(REG_ESPACE); + literal_init(current->as.group.elements[current->as.group.elements_len - 1].as.literal); + if (!is_case_insensitive) + literal_add(current->as.group.elements[current->as.group.elements_len - 1].as.literal, pattern[i]); + else + { + literal_add(current->as.group.elements[current->as.group.elements_len - 1].as.literal, toupper(pattern[i])); + literal_add(current->as.group.elements[current->as.group.elements_len - 1].as.literal, tolower(pattern[i])); + } + break; + } + } + + if (stack_len != 1) + REGCOMP_ERROR(REG_EPAREN); + + preg->_compiled = stack[0]->as.group.elements; + preg->_compiled_len = stack[0]->as.group.elements_len; + + free(stack[0]); + free(stack); + + return 0; + +regcomp_error: + for (size_t i = 0; i < stack_len; i++) + regfree_element(stack[i]); + free(stack); + return error; + +#undef REGCOMP_ERROR +} + +struct regexec_result_t +{ + bool matched; + bool out_of_memory; + size_t length; +}; + +static regexec_result_t match_expr(const _regex_elem_t* elements, size_t elements_len, const char* string, size_t length, size_t nmatch, regmatch_t pmatch[], int eflags) +{ + if (elements_len == 0) + return { true, false, length }; + + // match fixed length elements + if (elements[0].qualifier.min == elements[0].qualifier.max) + { + int qualifier = elements[0].qualifier.min; + if (qualifier == 0) + return match_expr(elements + 1, elements_len - 1, string, length, nmatch, pmatch, eflags); + + switch (elements[0].type) + { + case _re_anchor_begin: + if ((eflags & REG_NOTBOL) || length > 0) + return { false, false, 0 }; + return match_expr(elements + 1, elements_len - 1, string, length, nmatch, pmatch, eflags); + + case _re_anchor_end: + if ((eflags & REG_NOTEOL) || string[length]) + return { false, false, 0 }; + return match_expr(elements + 1, elements_len - 1, string, length, nmatch, pmatch, eflags); + + case _re_group_tag1: + if (const size_t index = elements[0].as.group_tag; index < nmatch) + pmatch[index].rm_so = length; + return match_expr(elements + 1, elements_len - 1, string, length, nmatch, pmatch, eflags); + + case _re_group_tag2: + if (const size_t index = elements[0].as.group_tag; index < nmatch) + pmatch[index].rm_eo = length; + return match_expr(elements + 1, elements_len - 1, string, length, nmatch, pmatch, eflags); + + case _re_literal: + for (ssize_t i = 0; i < qualifier; i++) + { + if (string[length + i] == '\0') + return { false, false, 0 }; + if (!literal_has(elements[0].as.literal, string[length + i])) + return { false, false, 0 }; + } + return match_expr(elements + 1, elements_len - 1, string, length + qualifier, nmatch, pmatch, eflags); + + case _re_group: + const size_t attempt_len = qualifier * elements[0].as.group.elements_len + 2 + elements_len - 1; + _regex_elem_t* attempt = static_cast<_regex_elem_t*>(malloc(attempt_len * sizeof(_regex_elem_t))); + if (attempt == nullptr) + return { false, true, 0 }; + + size_t index = 0; + for (ssize_t i = 0; i < qualifier; i++) + { + if (i + 1 == qualifier) + attempt[index++] = { _re_group_tag1, { 1, 1 }, { .group_tag = elements[0].as.group.index } }; + for (size_t j = 0; j < elements[0].as.group.elements_len; j++) + attempt[index++] = elements[0].as.group.elements[j]; + if (i + 1 == qualifier) + attempt[index++] = { _re_group_tag2, { 1, 1 }, { .group_tag = elements[0].as.group.index } }; + } + for (size_t i = 1; i < elements_len; i++) + attempt[index++] = elements[i]; + + const auto result = match_expr(attempt, attempt_len, string, length, nmatch, pmatch, eflags); + free(attempt); + + if (const size_t index = elements[0].as.group.index; !result.matched && index < nmatch) + pmatch[index] = { -1, -1 }; + return result; + } + } + + // binary search upper bound for qualifier + int qualifier_min = elements[0].qualifier.min - 1; + int qualifier_max = elements[0].qualifier.max; + while (qualifier_min < qualifier_max) + { + const int qualifier = (qualifier_min + qualifier_max) / 2 + 1; + + const _regex_elem_t dummy { + .type = elements[0].type, + .qualifier = { qualifier, qualifier }, + .as = elements[0].as + }; + + const auto result = match_expr(&dummy, 1, string, length, 0, nullptr, eflags); + if (result.out_of_memory) + return regexec_result_t { false, true, 0 }; + + if (result.matched) + qualifier_min = qualifier; + else + qualifier_max = qualifier - 1; + } + + // greedy match longest working qualifier + for (int qualifier = qualifier_min; qualifier >= elements[0].qualifier.min; qualifier--) + { + const _regex_elem_t dummy { + .type = elements[0].type, + .qualifier = { qualifier, qualifier }, + .as = elements[0].as + }; + + _regex_elem_t* attempt = static_cast<_regex_elem_t*>(malloc(elements_len * sizeof(_regex_elem_t))); + if (attempt == nullptr) + return regexec_result_t { false, true, 0 }; + + attempt[0] = dummy; + for (size_t i = 1; i < elements_len; i++) + attempt[i] = elements[i]; + + const auto result = match_expr(attempt, elements_len, string, length, nmatch, pmatch, eflags); + free(attempt); + + if (result.out_of_memory || result.matched) + return result; + } + + return regexec_result_t { false, false, 0 }; +} + +int regexec(const regex_t* __restrict preg, const char* __restrict string, size_t nmatch, regmatch_t pmatch[], int eflags) +{ + if (preg->_cflags & REG_NOSUB) + nmatch = 0; + + if (preg->_cflags & REG_NEWLINE) + fprintf(stddbg, "regexec REG_NEWLINE is not supported"); + + for (regoff_t off = 0; string[off]; off++, eflags |= REG_NOTBOL) + { + for (size_t i = 0; i < nmatch; i++) + pmatch[i] = { -1, -1 }; + + const auto [matched, out_of_memory, length] = match_expr(preg->_compiled, preg->_compiled_len, string + off, 0, nmatch, pmatch, eflags); + if (out_of_memory) + return REG_ESPACE; + if (!matched) + continue; + + if (nmatch > 0) + pmatch[0] = { off, off + static_cast(length) }; + for (size_t i = 1; i < nmatch; i++) + { + if (pmatch[i].rm_so == -1) + continue; + pmatch[i].rm_so += off; + pmatch[i].rm_eo += off; + } + + return 0; + } + + return REG_NOMATCH; +} + +void regfree(regex_t* preg) +{ + for (size_t i = 0; i < preg->_compiled_len; i++) + regfree_element(&preg->_compiled[i]); + free(preg->_compiled); +} + +size_t regerror(int errcode, const regex_t* __restrict preg, char* __restrict errbuf, size_t errbuf_size) +{ + (void)preg; + + const char* message = "Unknown error."; + switch (errcode) + { + case REG_NOMATCH: message = "regexec() failed to match."; break; + case REG_BADPAT: message = "Invalid regular expression."; break; + case REG_ECOLLATE: message = "Invalid collating element referenced."; break; + case REG_ECTYPE: message = "Invalid character class type referenced."; break; + case REG_EESCAPE: message = "Trailing character in pattern."; break; + case REG_ESUBREG: message = "Number in \\digit invalid or in error."; break; + case REG_EBRACK: message = "\"[]\" imbalance."; break; + case REG_EPAREN: message = "\"\\(\\)\" or \"()\" imbalance."; break; + case REG_EBRACE: message = "\"\\{\\}\" imbalance."; break; + case REG_BADBR: message = "Content of \"\\{\\}\" invalid: not a number, number too large, more than two numbers, first larger than second."; break; + case REG_ERANGE: message = "Invalid endpoint in range expression."; break; + case REG_ESPACE: message = "Out of memory."; break; + case REG_BADRPT: message = "'?', '*', or '+' not preceded by valid regular expression."; break; + } + + strncpy(errbuf, message, errbuf_size); + return strlen(message); +}