From 22e45278a2da93e3480840496acd0a919600db29 Mon Sep 17 00:00:00 2001
From: Bananymous <bananymousosq@gmail.com>
Date: Mon, 20 Mar 2023 14:52:42 +0200
Subject: [PATCH] Kernel: Fix PC Screen font parsing

I had misread the format and the parsing code was incorrect. I also
changed fonts to store Unicode codepoints as 32-bit integers, so
every character can be represented.
---
 BAN/include/BAN/UTF8.h       | 40 ++++++++++-----
 kernel/include/kernel/Font.h |  6 +--
 kernel/kernel/Font.cpp       | 96 ++++++++++++++++++------------------
 3 files changed, 81 insertions(+), 61 deletions(-)
diff --git a/BAN/include/BAN/UTF8.h b/BAN/include/BAN/UTF8.h
index b122a68b43..d712a39546 100644
--- a/BAN/include/BAN/UTF8.h
+++ b/BAN/include/BAN/UTF8.h
@@ -6,23 +6,41 @@
 namespace BAN
 {
 
-	static constexpr uint16_t utf8_to_codepoint(uint8_t* bytes, size_t count)
+	namespace UTF8
 	{
-		if (count > 3)
-			return 0xFFFF;
+		static constexpr uint32_t invalid = 0xFFFFFFFF;
+	}
 
-		for (size_t i = 1; i < count; i++)
+	static constexpr uint32_t utf8_byte_length(uint8_t first_byte)
+	{
+		if ((first_byte & 0x80) == 0x00)
+			return 1;
+		if ((first_byte & 0xE0) == 0xC0)
+			return 2;
+		if ((first_byte & 0xF0) == 0xE0)
+			return 3;
+		if ((first_byte & 0xF8) == 0xF0)
+			return 4;
+		return 0;
+	}
+
+	static constexpr uint32_t utf8_to_codepoint(uint8_t* bytes)
+	{
+		uint32_t length = utf8_byte_length(bytes[0]);
+
+		for (uint32_t i = 1; i < length; i++)
 			if ((bytes[i] & 0xC0) != 0x80)
-				return 0xFFFF;
-
-		switch (count)
+				return UTF8::invalid;
+		
+		switch (length)
 		{
-			case 1: return   bytes[0];
-			case 2: return ((bytes[0] & 0x1F) <<  6) |  (bytes[1] & 0x3F);
-			case 3: return ((bytes[0] & 0x1F) << 12) | ((bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F);
+			case 1: return ((bytes[0] & 0x80) != 0x00) ? UTF8::invalid :   bytes[0];
+			case 2: return ((bytes[0] & 0xE0) != 0xC0) ? UTF8::invalid : ((bytes[0] & 0x1F) <<  6) |  (bytes[1] & 0x3F);
+			case 3: return ((bytes[0] & 0xF0) != 0xE0) ? UTF8::invalid : ((bytes[0] & 0x0F) << 12) | ((bytes[1] & 0x3F) <<  6) |  (bytes[2] & 0x3F);
+			case 4: return ((bytes[0] & 0xF8) != 0xF0) ? UTF8::invalid : ((bytes[0] & 0x07) << 18) | ((bytes[1] & 0x3F) << 12) | ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
 		}
 
-		return 0xFFFF;
+		return UTF8::invalid;
 	}
 
 }
\ No newline at end of file
diff --git a/kernel/include/kernel/Font.h b/kernel/include/kernel/Font.h
index 89470c7749..bb10264be4 100644
--- a/kernel/include/kernel/Font.h
+++ b/kernel/include/kernel/Font.h
@@ -17,15 +17,15 @@ namespace Kernel
 		uint32_t height() const { return m_height; }
 		uint32_t pitch() const { return m_pitch; }
 
-		bool has_glyph(uint16_t) const;
-		const uint8_t* glyph(uint16_t) const;
+		bool has_glyph(uint32_t) const;
+		const uint8_t* glyph(uint32_t) const;
 
 	private:
 		static BAN::ErrorOr<Font> parse_psf1(const BAN::Span<uint8_t>);
 		static BAN::ErrorOr<Font> parse_psf2(const BAN::Span<uint8_t>);
 
 	private:
-		BAN::HashMap<uint16_t, uint32_t> m_glyph_offsets;
+		BAN::HashMap<uint32_t, uint32_t> m_glyph_offsets;
 		BAN::Vector<uint8_t> m_glyph_data;
 		uint32_t m_width = 0;
 		uint32_t m_height = 0;
diff --git a/kernel/kernel/Font.cpp b/kernel/kernel/Font.cpp
index 9f79cea59e..043aaa9513 100644
--- a/kernel/kernel/Font.cpp
+++ b/kernel/kernel/Font.cpp
@@ -73,17 +73,16 @@ namespace Kernel
 		TRY(glyph_data.resize(glyph_data_size));
 		memcpy(glyph_data.data(), font_data.data() + sizeof(PSF1Header), glyph_data_size);
 
-		BAN::HashMap<uint16_t, uint32_t> glyph_offsets;
+		BAN::HashMap<uint32_t, uint32_t> glyph_offsets;
 		TRY(glyph_offsets.reserve(glyph_count));
 
-		bool unsupported_utf = false;
 		bool codepoint_redef = false;
+		bool codepoint_sequence = false;
 
 		if (header->magic & (PSF1_MODE_HASTAB | PSF1_MODE_SEQ))
 		{
 			uint32_t current_index = sizeof(PSF1Header) + glyph_data_size;
 
-			bool in_sequence = false;
 			uint32_t glyph_index = 0;
 			while (current_index < font_data.size())
 			{
@@ -91,17 +90,16 @@ namespace Kernel
 				uint16_t hi = font_data[current_index + 1];
 				uint16_t codepoint = (hi << 8) | lo;
 
-				if (codepoint == 0xFFFF)
+				if (codepoint == 0xFFFE)
+				{
+					codepoint_sequence = true;
+					break;
+				}
+				else if (codepoint == 0xFFFF)
 				{
 					glyph_index++;
-					in_sequence = false;
 				}
-				else if (codepoint == 0xFFFE)
-				{
-					in_sequence = true;
-					unsupported_utf = true;
-				}
-				else if (!in_sequence)
+				else
 				{
 					if (glyph_offsets.contains(codepoint))
 						codepoint_redef = true;
@@ -111,9 +109,6 @@ namespace Kernel
 
 				current_index += 2;
 			}
-
-			if (glyph_index != glyph_count)
-				return BAN::Error::from_c_string("Font did not contain unicode entry for all glyphs");
 		}
 		else
 		{
@@ -121,10 +116,10 @@ namespace Kernel
 				TRY(glyph_offsets.insert(i, i * glyph_size));
 		}
 
-		if (unsupported_utf)
-			dwarnln("Font contains invalid/unsupported UTF-8 codepoint(s)");
 		if (codepoint_redef)
 			dwarnln("Font contsins multiple definitions for same codepoint(s)");
+		if (codepoint_sequence)
+			dwarnln("Font contains codepoint sequences (not supported)");
 
 		Font result;
 		result.m_glyph_offsets = BAN::move(glyph_offsets);
@@ -171,11 +166,12 @@ namespace Kernel
 		TRY(glyph_data.resize(glyph_data_size));
 		memcpy(glyph_data.data(), font_data.data() + header.header_size, glyph_data_size);
 
-		BAN::HashMap<uint16_t, uint32_t> glyph_offsets;
+		BAN::HashMap<uint32_t, uint32_t> glyph_offsets;
 		TRY(glyph_offsets.reserve(400));
 
-		bool unsupported_utf = false;
+		bool invalid_utf = false;
 		bool codepoint_redef = false;
+		bool codepoint_sequence = false;
 
 		uint8_t bytes[4] {};
 		uint32_t byte_index = 0;
@@ -186,32 +182,44 @@ namespace Kernel
 			{
 				uint8_t byte = font_data[i];
 
-				if ((byte >> 1) == 0x7F)
+				if (byte == 0xFE)
 				{
-					if (byte_index <= 4)
+					codepoint_sequence = true;
+					break;
+				}
+				else if (byte == 0xFF)
+				{
+					if (byte_index)
 					{
-						uint16_t codepoint = BAN::utf8_to_codepoint(bytes, byte_index);
-						if (codepoint == 0xFFFF)
-							unsupported_utf = true;
+						invalid_utf = true;
+						byte_index = 0;
+					}
+					glyph_index++;
+				}
+				else
+				{
+					ASSERT(byte_index < 4);
+					bytes[byte_index++] = byte;
+					uint32_t len = BAN::utf8_byte_length(bytes[0]);
+
+					if (len == 0)
+					{
+						invalid_utf = true;
+						byte_index = 0;
+					}
+					else if (len == byte_index)
+					{
+						uint32_t codepoint = BAN::utf8_to_codepoint(bytes);
+						if (codepoint == BAN::UTF8::invalid)
+							invalid_utf = true;
 						else if (glyph_offsets.contains(codepoint))
 							codepoint_redef = true;
 						else
 							TRY(glyph_offsets.insert(codepoint, glyph_index * header.glyph_size));
+						byte_index = 0;
 					}
-					byte_index = 0;
-					if (byte == 0xFF)
-						glyph_index++;
-				}
-				else
-				{
-					if (byte_index < 4)
-						bytes[byte_index++] = byte;
-					else
-						unsupported_utf = true;
 				}
 			}
-			if (glyph_index != header.glyph_count)
-				return BAN::Error::from_c_string("Font did not contain unicode entry for all glyphs");
 		}
 		else
 		{
@@ -219,18 +227,12 @@ namespace Kernel
 				TRY(glyph_offsets.insert(i, i * header.glyph_size));	
 		}
 
-		// Manually add space (empty) character if it is not present
-		if (!glyph_offsets.contains(' '))
-		{
-			TRY(glyph_data.resize(glyph_data_size + header.glyph_size));
-			memset(glyph_data.data() + glyph_data_size, 0, header.glyph_size);
-			TRY(glyph_offsets.insert(' ', glyph_data_size));
-		}
-
-		if (unsupported_utf)
-			dwarnln("Font contains invalid/unsupported UTF-8 codepoint(s)");
+		if (invalid_utf)
+			dwarnln("Font contains invalid UTF-8 codepoint(s)");
 		if (codepoint_redef)
 			dwarnln("Font contsins multiple definitions for same codepoint(s)");
+		if (codepoint_sequence)
+			dwarnln("Font contains codepoint sequences (not supported)");
 
 		Font result;
 		result.m_glyph_offsets = BAN::move(glyph_offsets);
@@ -241,12 +243,12 @@ namespace Kernel
 		return result;
 	}
 	
-	bool Font::has_glyph(uint16_t codepoint) const
+	bool Font::has_glyph(uint32_t codepoint) const
 	{
 		return m_glyph_offsets.contains(codepoint);
 	}
 
-	const uint8_t* Font::glyph(uint16_t codepoint) const
+	const uint8_t* Font::glyph(uint32_t codepoint) const
 	{
 		return m_glyph_data.data() + m_glyph_offsets[codepoint];
 	}