Kernel: Rewrite paging and AP initialization

The initial paging step now only prepares the fast page for the heap;
the actual page table initialization happens after the heap is
initialized, so x86_64 never has to depend on kmalloc for pages.

Processor stacks are now also allocated through the PMM/VMM instead of
identity-mapped kmalloc memory.
2026-05-02 15:45:08 +03:00
parent 1602b195c5
commit d2b9b49cb0
11 changed files with 370 additions and 487 deletions
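
For context before the diff: the core of this change is that paging structures are now allocated straight from the physical page allocator and edited through the higher-half direct map (HHDM) at 0xFFFF800000000000, instead of through kmalloc's identity mapping. A minimal sketch of that pattern, with names taken from the diff below; take_free_page() is a stand-in for Heap::get().take_free_page(), and error handling is elided:

#include <stdint.h>
#include <string.h>

using paddr_t = uintptr_t; // physical address (x86_64)
using vaddr_t = uintptr_t; // virtual address

constexpr vaddr_t s_hhdm_offset = 0xFFFF800000000000; // HHDM base, from the diff
constexpr size_t PAGE_SIZE = 4096;

paddr_t take_free_page(); // stand-in for Heap::get().take_free_page()

// Physical -> virtual: every usable physical page has an alias in the
// higher-half direct map, so paging structures can be edited directly.
uint64_t* P2V(paddr_t paddr)
{
    return reinterpret_cast<uint64_t*>(paddr + s_hhdm_offset);
}

// Virtual -> physical, for addresses inside the HHDM window.
paddr_t V2P(vaddr_t vaddr)
{
    return vaddr - s_hhdm_offset;
}

// Take a page from the physical allocator and zero it through the HHDM;
// this is what the new allocate_zeroed_page_aligned_page() boils down to.
paddr_t allocate_zeroed_page_aligned_page()
{
    const paddr_t paddr = take_free_page();
    memset(P2V(paddr), 0, PAGE_SIZE);
    return paddr;
}
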


@@ -2,7 +2,6 @@
#include <kernel/CPUID.h>
#include <kernel/Lock/SpinLock.h>
#include <kernel/Memory/Heap.h>
#include <kernel/Memory/kmalloc.h>
#include <kernel/Memory/PageTable.h>
extern uint8_t g_kernel_start[];
@@ -17,13 +16,15 @@ extern uint8_t g_kernel_writable_end[];
extern uint8_t g_userspace_start[];
extern uint8_t g_userspace_end[];
extern uint64_t g_boot_fast_page_pt[];
namespace Kernel
{
SpinLock PageTable::s_fast_page_lock;
static constexpr vaddr_t s_hhdm_offset = 0xFFFF800000000000;
static bool s_is_post_heap_done = false;
static bool s_is_initialized = false;
constexpr uint64_t s_page_flag_mask = 0x8000000000000FFF;
constexpr uint64_t s_page_addr_mask = ~s_page_flag_mask;
@@ -35,6 +36,8 @@ namespace Kernel
static paddr_t s_global_pml4_entries[512] { 0 };
static uint64_t* s_fast_page_pt { nullptr };
static constexpr inline bool is_canonical(uintptr_t addr)
{
constexpr uintptr_t mask = 0xFFFF800000000000;
@@ -54,68 +57,27 @@ namespace Kernel
return addr;
}
struct FuncsKmalloc
static paddr_t allocate_zeroed_page_aligned_page()
{
static paddr_t allocate_zeroed_page_aligned_page()
{
void* page = kmalloc(PAGE_SIZE, PAGE_SIZE, true);
ASSERT(page);
memset(page, 0, PAGE_SIZE);
return kmalloc_paddr_of(reinterpret_cast<vaddr_t>(page)).value();
}
const paddr_t paddr = Heap::get().take_free_page();
ASSERT(paddr);
memset(reinterpret_cast<void*>(paddr + s_hhdm_offset), 0, PAGE_SIZE);
return paddr;
}
static void unallocate_page(paddr_t paddr)
{
kfree(reinterpret_cast<void*>(kmalloc_vaddr_of(paddr).value()));
}
static paddr_t V2P(vaddr_t vaddr)
{
return vaddr - KERNEL_OFFSET + g_boot_info.kernel_paddr;
}
static uint64_t* P2V(paddr_t paddr)
{
return reinterpret_cast<uint64_t*>(paddr - g_boot_info.kernel_paddr + KERNEL_OFFSET);
}
};
struct FuncsHHDM
static void unallocate_page(paddr_t paddr)
{
static paddr_t allocate_zeroed_page_aligned_page()
{
const paddr_t paddr = Heap::get().take_free_page();
ASSERT(paddr);
memset(reinterpret_cast<void*>(paddr + s_hhdm_offset), 0, PAGE_SIZE);
return paddr;
}
Heap::get().release_page(paddr);
}
static void unallocate_page(paddr_t paddr)
{
Heap::get().release_page(paddr);
}
static uint64_t* P2V(paddr_t paddr)
{
ASSERT(paddr != 0);
ASSERT(!BAN::Math::will_addition_overflow(paddr, s_hhdm_offset));
return reinterpret_cast<uint64_t*>(paddr + s_hhdm_offset);
}
static paddr_t V2P(vaddr_t vaddr)
{
ASSERT(vaddr >= s_hhdm_offset);
ASSERT(vaddr < KERNEL_OFFSET);
return vaddr - s_hhdm_offset;
}
static uint64_t* P2V(paddr_t paddr)
{
ASSERT(paddr != 0);
ASSERT(!BAN::Math::will_addition_overflow(paddr, s_hhdm_offset));
return reinterpret_cast<uint64_t*>(paddr + s_hhdm_offset);
}
};
static paddr_t (*allocate_zeroed_page_aligned_page)() = &FuncsKmalloc::allocate_zeroed_page_aligned_page;
static void (*unallocate_page)(paddr_t) = &FuncsKmalloc::unallocate_page;
static paddr_t (*V2P)(vaddr_t) = &FuncsKmalloc::V2P;
static uint64_t* (*P2V)(paddr_t) = &FuncsKmalloc::P2V;
static inline PageTable::flags_t parse_flags(uint64_t entry)
static PageTable::flags_t parse_flags(uint64_t entry)
{
using Flags = PageTable::Flags;
@@ -137,7 +99,7 @@ namespace Kernel
// 0: 4 KiB
// 1: 2 MiB
// 2: 1 GiB
static void init_map_hhdm_page(paddr_t pml4, paddr_t paddr, uint8_t page_size)
static void map_hhdm_page(paddr_t pml4, paddr_t paddr, uint8_t page_size)
{
ASSERT(0 <= page_size && page_size <= 2);
@@ -184,7 +146,7 @@ namespace Kernel
const uint64_t noexec_flag = s_has_nxe ? (static_cast<uint64_t>(1) << 63) : 0;
const paddr_t pdpt = get_or_allocate_entry(pml4, pml4e, noexec_flag);
s_global_pml4_entries[pml4e] = pdpt | hhdm_flags;
s_global_pml4_entries[pml4e] = pdpt | hhdm_flags | noexec_flag;
paddr_t lowest_paddr = pdpt;
uint16_t lowest_entry = pdpte;
@@ -207,23 +169,11 @@ namespace Kernel
});
}
static void init_map_hhdm(paddr_t pml4)
static void initialize_hhdm(paddr_t pml4)
{
for (const auto& entry : g_boot_info.memory_map_entries)
{
bool should_map = false;
switch (entry.type)
{
case MemoryMapEntry::Type::Available:
should_map = true;
break;
case MemoryMapEntry::Type::ACPIReclaim:
case MemoryMapEntry::Type::ACPINVS:
case MemoryMapEntry::Type::Reserved:
should_map = false;
break;
}
if (!should_map)
if (entry.type != MemoryMapEntry::Type::Available)
continue;
constexpr size_t one_gib = 1024 * 1024 * 1024;
@@ -235,156 +185,39 @@ namespace Kernel
{
if (s_has_gib && paddr % one_gib == 0 && paddr + one_gib <= entry_end)
{
init_map_hhdm_page(pml4, paddr, 2);
map_hhdm_page(pml4, paddr, 2);
paddr += one_gib;
}
else if (paddr % two_mib == 0 && paddr + two_mib <= entry_end)
{
init_map_hhdm_page(pml4, paddr, 1);
map_hhdm_page(pml4, paddr, 1);
paddr += two_mib;
}
else
{
init_map_hhdm_page(pml4, paddr, 0);
map_hhdm_page(pml4, paddr, 0);
paddr += PAGE_SIZE;
}
}
}
}
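
The nested loop above greedily maps each available region with the largest page size whose alignment and remaining length permit it: 1 GiB when supported, then 2 MiB, then 4 KiB. A compact restatement of that selection policy, mirroring the conditions in the diff:

#include <stdint.h>
#include <stddef.h>

using paddr_t = uintptr_t;

constexpr size_t PAGE_SIZE = 4096;
constexpr size_t two_mib = 2 * 1024 * 1024;
constexpr size_t one_gib = 1024 * 1024 * 1024;

// Bytes to map at paddr: 1 GiB if supported, aligned, and fully inside
// the entry; else 2 MiB on the same terms; else a single 4 KiB page.
constexpr size_t hhdm_step(paddr_t paddr, paddr_t entry_end, bool has_gib)
{
    if (has_gib && paddr % one_gib == 0 && paddr + one_gib <= entry_end)
        return one_gib;
    if (paddr % two_mib == 0 && paddr + two_mib <= entry_end)
        return two_mib;
    return PAGE_SIZE;
}

// e.g. a region [0x200000, 0x600000) becomes two 2 MiB mappings
static_assert(hhdm_step(0x200000, 0x600000, false) == two_mib);
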
static paddr_t copy_page_from_kmalloc_to_heap(paddr_t kmalloc_paddr)
void PageTable::initialize_fast_page()
{
const paddr_t heap_paddr = Heap::get().take_free_page();
ASSERT(heap_paddr);
const vaddr_t kmalloc_vaddr = kmalloc_vaddr_of(kmalloc_paddr).value();
PageTable::with_fast_page(heap_paddr, [kmalloc_vaddr] {
memcpy(PageTable::fast_page_as_ptr(), reinterpret_cast<void*>(kmalloc_vaddr), PAGE_SIZE);
});
return heap_paddr;
s_fast_page_pt = g_boot_fast_page_pt;
}
static void copy_paging_structure_to_heap(uint64_t* old_table, uint64_t* new_table, int depth)
{
if (depth == 0)
return;
constexpr uint64_t page_flag_mask = 0x8000000000000FFF;
constexpr uint64_t page_addr_mask = ~page_flag_mask;
for (uint16_t index = 0; index < 512; index++)
{
const uint64_t old_entry = old_table[index];
if (old_entry == 0)
{
new_table[index] = 0;
continue;
}
const paddr_t old_paddr = old_entry & page_addr_mask;
const paddr_t new_paddr = copy_page_from_kmalloc_to_heap(old_paddr);
new_table[index] = new_paddr | (old_entry & page_flag_mask);
uint64_t* next_old_table = reinterpret_cast<uint64_t*>(old_paddr + s_hhdm_offset);
uint64_t* next_new_table = reinterpret_cast<uint64_t*>(new_paddr + s_hhdm_offset);
copy_paging_structure_to_heap(next_old_table, next_new_table, depth - 1);
}
}
static void free_kmalloc_paging_structure(uint64_t* table, int depth)
{
if (depth == 0)
return;
constexpr uint64_t page_flag_mask = 0x8000000000000FFF;
constexpr uint64_t page_addr_mask = ~page_flag_mask;
for (uint16_t index = 0; index < 512; index++)
{
const uint64_t entry = table[index];
if (entry == 0)
continue;
const paddr_t paddr = entry & page_addr_mask;
uint64_t* next_table = reinterpret_cast<uint64_t*>(paddr + s_hhdm_offset);
free_kmalloc_paging_structure(next_table, depth - 1);
kfree(reinterpret_cast<void*>(kmalloc_vaddr_of(paddr).value()));
}
}
void PageTable::initialize_pre_heap()
static void detect_cpu_features()
{
if (CPUID::has_nxe())
s_has_nxe = true;
if (CPUID::has_pge())
s_has_pge = true;
if (CPUID::has_1gib_pages())
s_has_gib = true;
ASSERT(s_kernel == nullptr);
s_kernel = new PageTable();
ASSERT(s_kernel);
s_kernel->m_highest_paging_struct = allocate_zeroed_page_aligned_page();
s_kernel->prepare_fast_page();
s_kernel->initialize_kernel();
for (auto pml4e : s_global_pml4_entries)
ASSERT(pml4e == 0);
const uint64_t* pml4 = P2V(s_kernel->m_highest_paging_struct);
s_global_pml4_entries[511] = pml4[511];
}
void PageTable::initialize_post_heap()
{
ASSERT(s_kernel);
init_map_hhdm(s_kernel->m_highest_paging_struct);
const paddr_t old_pml4_paddr = s_kernel->m_highest_paging_struct;
const paddr_t new_pml4_paddr = copy_page_from_kmalloc_to_heap(old_pml4_paddr);
uint64_t* old_pml4 = reinterpret_cast<uint64_t*>(kmalloc_vaddr_of(old_pml4_paddr).value());
uint64_t* new_pml4 = reinterpret_cast<uint64_t*>(new_pml4_paddr + s_hhdm_offset);
const paddr_t old_pdpt_paddr = old_pml4[511] & s_page_addr_mask;
const paddr_t new_pdpt_paddr = Heap::get().take_free_page();
ASSERT(new_pdpt_paddr);
uint64_t* old_pdpt = reinterpret_cast<uint64_t*>(old_pdpt_paddr + s_hhdm_offset);
uint64_t* new_pdpt = reinterpret_cast<uint64_t*>(new_pdpt_paddr + s_hhdm_offset);
copy_paging_structure_to_heap(old_pdpt, new_pdpt, 2);
new_pml4[511] = new_pdpt_paddr | (old_pml4[511] & s_page_flag_mask);
s_global_pml4_entries[511] = new_pml4[511];
s_kernel->m_highest_paging_struct = new_pml4_paddr;
s_kernel->load();
free_kmalloc_paging_structure(old_pdpt, 2);
kfree(reinterpret_cast<void*>(kmalloc_vaddr_of(old_pdpt_paddr).value()));
kfree(reinterpret_cast<void*>(kmalloc_vaddr_of(old_pml4_paddr).value()));
allocate_zeroed_page_aligned_page = &FuncsHHDM::allocate_zeroed_page_aligned_page;
unallocate_page = &FuncsHHDM::unallocate_page;
V2P = &FuncsHHDM::V2P;
P2V = &FuncsHHDM::P2V;
s_is_post_heap_done = true;
// This is a hack to unmap fast page. fast page pt is copied
// while it is mapped, so we need to manually unmap it
SpinLockGuard _(s_fast_page_lock);
unmap_fast_page();
}
void PageTable::initial_load()
void PageTable::enable_cpu_features()
{
if (s_has_nxe)
{
@@ -423,8 +256,63 @@ namespace Kernel
"movq %%rax, %%cr0;"
::: "rax"
);
}
load();
void PageTable::initialize_and_load()
{
detect_cpu_features();
enable_cpu_features();
const paddr_t boot_pml4_paddr = ({
paddr_t paddr;
asm volatile("movq %%cr3, %0" : "=r"(paddr));
paddr;
});
initialize_hhdm(boot_pml4_paddr);
ASSERT(s_kernel == nullptr);
s_kernel = new PageTable();
ASSERT(s_kernel != nullptr);
s_kernel->m_highest_paging_struct = allocate_zeroed_page_aligned_page();
ASSERT(s_kernel->m_highest_paging_struct);
uint64_t* pml4 = P2V(s_kernel->m_highest_paging_struct);
memcpy(pml4, s_global_pml4_entries, sizeof(s_global_pml4_entries));
s_kernel->map_kernel_memory();
s_global_pml4_entries[511] = pml4[511];
// update fast page pt
{
constexpr vaddr_t uc_vaddr = uncanonicalize(fast_page());
constexpr uint16_t pml4e = (uc_vaddr >> 39) & 0x1FF;
constexpr uint16_t pdpte = (uc_vaddr >> 30) & 0x1FF;
constexpr uint16_t pde = (uc_vaddr >> 21) & 0x1FF;
const auto get_or_allocate_entry =
[](paddr_t table_paddr, uint16_t entry, uint64_t flags)
{
uint64_t* table = P2V(table_paddr);
if (!(table[entry] & Flags::Present))
{
table[entry] = allocate_zeroed_page_aligned_page();
ASSERT(table[entry]);
}
table[entry] |= flags;
return table[entry] & s_page_addr_mask;
};
const paddr_t pml4 = s_kernel->m_highest_paging_struct;
const paddr_t pdpt = get_or_allocate_entry(pml4, pml4e, Flags::ReadWrite | Flags::Present);
const paddr_t pd = get_or_allocate_entry(pdpt, pdpte, Flags::ReadWrite | Flags::Present);
s_fast_page_pt = P2V(get_or_allocate_entry(pd, pde, Flags::ReadWrite | Flags::Present));
}
s_kernel->load();
}
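
The repeated shift-and-mask expressions in this function (>> 39, >> 30, >> 21, >> 12, each masked with 0x1FF) decode a virtual address into the four paging-level indices. A small self-contained restatement of that decomposition, with the constants taken from the diff:

#include <stdint.h>

// x86_64 4-level paging: 9 index bits per level, 12 page-offset bits.
struct PageIndices
{
    uint16_t pml4e, pdpte, pde, pte;
};

constexpr PageIndices decompose(uint64_t vaddr)
{
    return PageIndices {
        .pml4e = static_cast<uint16_t>((vaddr >> 39) & 0x1FF), // bits 47..39
        .pdpte = static_cast<uint16_t>((vaddr >> 30) & 0x1FF), // bits 38..30
        .pde   = static_cast<uint16_t>((vaddr >> 21) & 0x1FF), // bits 29..21
        .pte   = static_cast<uint16_t>((vaddr >> 12) & 0x1FF), // bits 20..12
    };
}

// e.g. the canonical kernel base at -2 GiB lands in PML4 entry 511,
// matching the diff's use of s_global_pml4_entries[511].
static_assert(decompose(0xFFFFFFFF80000000).pml4e == 511);
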
PageTable& PageTable::kernel()
@@ -440,12 +328,12 @@ namespace Kernel
return true;
}
void PageTable::initialize_kernel()
void PageTable::map_kernel_memory()
{
// Map (phys_kernel_start -> phys_kernel_end) to (virt_kernel_start -> virt_kernel_end)
const vaddr_t kernel_start = reinterpret_cast<vaddr_t>(g_kernel_start);
map_range_at(
V2P(kernel_start),
kernel_start - KERNEL_OFFSET,
kernel_start,
g_kernel_end - g_kernel_start,
Flags::Present
@@ -454,7 +342,7 @@ namespace Kernel
// Map executable kernel memory as executable
const vaddr_t kernel_execute_start = reinterpret_cast<vaddr_t>(g_kernel_execute_start);
map_range_at(
V2P(kernel_execute_start),
kernel_execute_start - KERNEL_OFFSET,
kernel_execute_start,
g_kernel_execute_end - g_kernel_execute_start,
Flags::Execute | Flags::Present
@@ -463,7 +351,7 @@ namespace Kernel
// Map writable kernel memory as writable
const vaddr_t kernel_writable_start = reinterpret_cast<vaddr_t>(g_kernel_writable_start);
map_range_at(
V2P(kernel_writable_start),
kernel_writable_start - KERNEL_OFFSET,
kernel_writable_start,
g_kernel_writable_end - g_kernel_writable_start,
Flags::ReadWrite | Flags::Present
@@ -472,114 +360,58 @@ namespace Kernel
// Map userspace memory
const vaddr_t userspace_start = reinterpret_cast<vaddr_t>(g_userspace_start);
map_range_at(
V2P(userspace_start),
userspace_start - KERNEL_OFFSET,
userspace_start,
g_userspace_end - g_userspace_start,
Flags::Execute | Flags::UserSupervisor | Flags::Present
);
}
void PageTable::prepare_fast_page()
{
constexpr vaddr_t uc_vaddr = uncanonicalize(fast_page());
constexpr uint64_t pml4e = (uc_vaddr >> 39) & 0x1FF;
constexpr uint64_t pdpte = (uc_vaddr >> 30) & 0x1FF;
constexpr uint64_t pde = (uc_vaddr >> 21) & 0x1FF;
constexpr uint64_t pte = (uc_vaddr >> 12) & 0x1FF;
uint64_t* pml4 = P2V(m_highest_paging_struct);
ASSERT(!(pml4[pml4e] & Flags::Present));
pml4[pml4e] = allocate_zeroed_page_aligned_page() | Flags::ReadWrite | Flags::Present;
uint64_t* pdpt = P2V(pml4[pml4e] & s_page_addr_mask);
ASSERT(!(pdpt[pdpte] & Flags::Present));
pdpt[pdpte] = allocate_zeroed_page_aligned_page() | Flags::ReadWrite | Flags::Present;
uint64_t* pd = P2V(pdpt[pdpte] & s_page_addr_mask);
ASSERT(!(pd[pde] & Flags::Present));
pd[pde] = allocate_zeroed_page_aligned_page() | Flags::ReadWrite | Flags::Present;
uint64_t* pt = P2V(pd[pde] & s_page_addr_mask);
ASSERT(pt[pte] == 0);
pt[pte] = Flags::Reserved;
}
void PageTable::map_fast_page(paddr_t paddr)
{
ASSERT(s_kernel);
ASSERT(paddr);
ASSERT(paddr % PAGE_SIZE == 0);
ASSERT(paddr && paddr % PAGE_SIZE == 0);
ASSERT(s_fast_page_pt);
ASSERT(s_fast_page_lock.current_processor_has_lock());
constexpr vaddr_t uc_vaddr = uncanonicalize(fast_page());
constexpr uint64_t pml4e = (uc_vaddr >> 39) & 0x1FF;
constexpr uint64_t pdpte = (uc_vaddr >> 30) & 0x1FF;
constexpr uint64_t pde = (uc_vaddr >> 21) & 0x1FF;
constexpr uint64_t pte = (uc_vaddr >> 12) & 0x1FF;
ASSERT(!(*s_fast_page_pt & Flags::Present));
s_fast_page_pt[0] = paddr | Flags::ReadWrite | Flags::Present;
const uint64_t* pml4 = P2V(s_kernel->m_highest_paging_struct);
const uint64_t* pdpt = P2V(pml4[pml4e] & s_page_addr_mask);
const uint64_t* pd = P2V(pdpt[pdpte] & s_page_addr_mask);
uint64_t* pt = P2V(pd[pde] & s_page_addr_mask);
ASSERT(!(pt[pte] & Flags::Present));
pt[pte] = paddr | Flags::ReadWrite | Flags::Present;
asm volatile("invlpg (%0)" :: "r"(fast_page()) : "memory");
asm volatile("invlpg (%0)" :: "r"(fast_page()));
}
void PageTable::unmap_fast_page()
{
ASSERT(s_kernel);
ASSERT(s_fast_page_pt);
ASSERT(s_fast_page_lock.current_processor_has_lock());
constexpr vaddr_t uc_vaddr = uncanonicalize(fast_page());
constexpr uint64_t pml4e = (uc_vaddr >> 39) & 0x1FF;
constexpr uint64_t pdpte = (uc_vaddr >> 30) & 0x1FF;
constexpr uint64_t pde = (uc_vaddr >> 21) & 0x1FF;
constexpr uint64_t pte = (uc_vaddr >> 12) & 0x1FF;
ASSERT((*s_fast_page_pt & Flags::Present));
s_fast_page_pt[0] = 0;
const uint64_t* pml4 = P2V(s_kernel->m_highest_paging_struct);
const uint64_t* pdpt = P2V(pml4[pml4e] & s_page_addr_mask);
const uint64_t* pd = P2V(pdpt[pdpte] & s_page_addr_mask);
uint64_t* pt = P2V(pd[pde] & s_page_addr_mask);
ASSERT(pt[pte] & Flags::Present);
pt[pte] = Flags::Reserved;
asm volatile("invlpg (%0)" :: "r"(fast_page()) : "memory");
asm volatile("invlpg (%0)" :: "r"(fast_page()));
}
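
map_fast_page/unmap_fast_page are normally reached through the with_fast_page helper seen at call sites elsewhere in the diff (copy_page_from_kmalloc_to_heap, the old map_kernel_memory). A plausible shape for it, assuming it simply brackets a callback with the lock and the map/unmap pair; this is a sketch inferred from those call sites, not the kernel's actual implementation:

#include <stdint.h>

using paddr_t = uintptr_t;

// Stand-ins for the kernel primitives that appear in the diff.
struct SpinLock { void lock(); void unlock(); };
struct SpinLockGuard
{
    SpinLock& m_lock;
    SpinLockGuard(SpinLock& lock) : m_lock(lock) { m_lock.lock(); }
    ~SpinLockGuard() { m_lock.unlock(); }
};
extern SpinLock s_fast_page_lock;
void map_fast_page(paddr_t paddr); // installs paddr into the reserved PTE
void unmap_fast_page();            // clears the PTE and invlpg's the page

// Presumed shape: serialize on the single fast page, map, run, unmap.
template<typename F>
void with_fast_page(paddr_t paddr, F&& callback)
{
    SpinLockGuard guard(s_fast_page_lock);
    map_fast_page(paddr);
    callback(); // callback accesses the page via fast_page_as_ptr() etc.
    unmap_fast_page();
}

Call sites then read or write through fast_page_as_ptr() or fast_page_as_sized<T>() while the mapping is live, as the map_kernel_memory body further down shows.
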
BAN::ErrorOr<PageTable*> PageTable::create_userspace()
{
SpinLockGuard _(s_kernel->m_lock);
PageTable* page_table = new PageTable;
if (page_table == nullptr)
return BAN::Error::from_errno(ENOMEM);
page_table->map_kernel_memory();
page_table->m_highest_paging_struct = allocate_zeroed_page_aligned_page();
if (page_table->m_highest_paging_struct == 0)
{
delete page_table;
return BAN::Error::from_errno(ENOMEM);
}
uint64_t* pml4 = P2V(page_table->m_highest_paging_struct);
memcpy(pml4, s_global_pml4_entries, sizeof(s_global_pml4_entries));
return page_table;
}
void PageTable::map_kernel_memory()
{
ASSERT(s_kernel);
ASSERT(s_global_pml4_entries[511]);
ASSERT(m_highest_paging_struct == 0);
m_highest_paging_struct = allocate_zeroed_page_aligned_page();
PageTable::with_fast_page(m_highest_paging_struct, [] {
for (size_t i = 0; i < 512; i++)
{
if (s_global_pml4_entries[i] == 0)
continue;
ASSERT(i >= 256);
PageTable::fast_page_as_sized<uint64_t>(i) = s_global_pml4_entries[i];
}
});
}
PageTable::~PageTable()
{
if (m_highest_paging_struct == 0)
@@ -624,7 +456,7 @@ namespace Kernel
const bool is_userspace = (vaddr < KERNEL_OFFSET);
if (is_userspace && this != &PageTable::current())
;
else if (pages <= 32 || !s_is_post_heap_done)
else if (pages <= 32 || !s_is_initialized)
{
for (size_t i = 0; i < pages; i++, vaddr += PAGE_SIZE)
asm volatile("invlpg (%0)" :: "r"(vaddr));