2023-02-22 21:45:26 +02:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
|
2023-03-23 11:13:14 +02:00
|
|
|
// Minimal UTF-8 encode/decode helpers.
namespace BAN::UTF8
{

	// Sentinel returned by to_codepoint() when the input is not well-formed UTF-8.
	// 0xFFFFFFFF lies far above the last Unicode code point (0x10FFFF), so it can
	// never collide with a successfully decoded value.
	inline constexpr uint32_t invalid = 0xFFFFFFFF;

	// Returns how many bytes the UTF-8 sequence introduced by `first_byte` occupies
	// (1-4), or 0 when `first_byte` cannot start a sequence (a continuation byte
	// 10xxxxxx or a byte of the form 11111xxx).
	//
	// Only the leading bit pattern is inspected, so lead bytes that can never occur
	// in well-formed UTF-8 (0xC0, 0xC1, 0xF5-0xF7) still report 2 or 4; the decoded
	// value of such a sequence is rejected by to_codepoint().
	constexpr uint32_t byte_length(uint8_t first_byte)
	{
		if ((first_byte & 0x80) == 0x00)
			return 1; // 0xxxxxxx
		if ((first_byte & 0xE0) == 0xC0)
			return 2; // 110xxxxx
		if ((first_byte & 0xF0) == 0xE0)
			return 3; // 1110xxxx
		if ((first_byte & 0xF8) == 0xF0)
			return 4; // 11110xxx
		return 0;
	}

	// Decodes the single UTF-8 sequence starting at `bytes` and returns its code
	// point, or UTF8::invalid when the sequence is malformed:
	//   - the first byte is not a valid lead byte,
	//   - a continuation byte does not have the form 10xxxxxx,
	//   - the value is encoded with more bytes than necessary (overlong form),
	//   - the value is a UTF-16 surrogate (U+D800..U+DFFF) or exceeds U+10FFFF.
	//
	// At most byte_length(bytes[0]) bytes are read, and reading stops at the first
	// byte that is not a continuation byte, so a NUL terminated buffer is never
	// read past its terminator.
	constexpr uint32_t to_codepoint(const uint8_t* bytes)
	{
		const uint32_t length = byte_length(bytes[0]);
		if (length == 0)
			return UTF8::invalid;
		if (length == 1)
			return bytes[0];

		// A lead byte of an N byte sequence carries (7 - N) payload bits.
		uint32_t codepoint = bytes[0] & (0x7F >> length);
		for (uint32_t i = 1; i < length; i++)
		{
			if ((bytes[i] & 0xC0) != 0x80)
				return UTF8::invalid;
			codepoint = (codepoint << 6) | (bytes[i] & 0x3F);
		}

		// Smallest value that genuinely needs `length` bytes; anything below it
		// is an overlong encoding and must not be accepted.
		constexpr uint32_t smallest_for_length[] = { 0, 0, 0x80, 0x800, 0x10000 };
		if (codepoint < smallest_for_length[length])
			return UTF8::invalid;

		// Surrogates are reserved for UTF-16 and are not encodable in UTF-8.
		if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
			return UTF8::invalid;

		// Four byte sequences can express values beyond the Unicode range.
		if (codepoint > 0x10FFFF)
			return UTF8::invalid;

		return codepoint;
	}

	// Encodes `count` code points from `codepoints` as UTF-8 into `out`.
	//
	// Returns true when every code point was encoded. Returns false as soon as a
	// value that is not a Unicode scalar value is met (negative, a surrogate
	// U+D800..U+DFFF, or above U+10FFFF); everything encoded before that value has
	// already been written to `out`.
	//
	// No NUL terminator is appended. Each code point takes between 1 and 4 bytes,
	// so `out` must have room for up to 4 * count bytes.
	template<typename T>
	constexpr bool from_codepoints(const T* codepoints, size_t count, char* out)
	{
		for (size_t i = 0; i < count; i++)
		{
			// Widening to an unsigned 64 bit value turns negative inputs of a
			// signed T into huge numbers, so they end up in the rejecting branch
			// instead of being emitted as a single raw byte.
			const uint64_t codepoint = static_cast<uint64_t>(codepoints[i]);

			if (codepoint < 0x80)
			{
				*out++ = static_cast<char>(codepoint);
			}
			else if (codepoint < 0x800)
			{
				*out++ = static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F));
				*out++ = static_cast<char>(0x80 | ((codepoint >> 0) & 0x3F));
			}
			else if (codepoint < 0x10000)
			{
				// Surrogates have no UTF-8 representation.
				if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
					return false;
				*out++ = static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F));
				*out++ = static_cast<char>(0x80 | ((codepoint >>  6) & 0x3F));
				*out++ = static_cast<char>(0x80 | ((codepoint >>  0) & 0x3F));
			}
			else if (codepoint < 0x110000)
			{
				*out++ = static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07));
				*out++ = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
				*out++ = static_cast<char>(0x80 | ((codepoint >>  6) & 0x3F));
				*out++ = static_cast<char>(0x80 | ((codepoint >>  0) & 0x3F));
			}
			else
			{
				return false;
			}
		}

		return true;
	}

}
|