Kernel: Rework syscall memory validation and locking

A process's memory regions are now protected by an rwlock instead of the
full process lock. Since writes to the region list are rare, most pointer
validations no longer have to block each other.
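
The idea is the classic many-readers/one-writer split: pointer validation
only reads the region list, while mmap/munmap-style changes take the
exclusive end. A minimal sketch of the pattern, using std::shared_mutex as
a stand-in for the kernel's RWLock and a simplified sorted vector of
regions (all names here are illustrative, not the kernel's actual classes):

    #include <algorithm>
    #include <cstdint>
    #include <shared_mutex>
    #include <vector>

    struct Region { uintptr_t start, end; }; // stand-in for MemoryRegion

    class RegionTable
    {
    public:
        // Validation only reads the list, so syscalls can validate concurrently.
        bool contains(uintptr_t addr) const
        {
            std::shared_lock lock(m_lock); // reader end
            auto it = std::upper_bound(m_regions.begin(), m_regions.end(), addr,
                [](uintptr_t a, const Region& r) { return a < r.end; });
            return it != m_regions.end() && it->start <= addr;
        }

        // Mapping changes are rare and take the exclusive (writer) end.
        void add(Region r)
        {
            std::unique_lock lock(m_lock); // writer end
            auto it = std::lower_bound(m_regions.begin(), m_regions.end(), r.start,
                [](const Region& lhs, uintptr_t s) { return lhs.start < s; });
            m_regions.insert(it, r); // keep the vector sorted by start address
        }

    private:
        mutable std::shared_mutex m_lock;
        std::vector<Region> m_regions; // sorted by start address
    };

Keeping the vector sorted is what makes the read side a single binary
search; the write side pays the insertion cost, which is fine because
writes are rare.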

A thread's userspace stack is now part of the process's memory regions.
This simplifies code that previously had to loop over every thread to
check whether an accessed address was inside some thread's stack.
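
Concretely this turns an O(number of threads) scan into one lookup in the
process's sorted region list. The shape of the code that goes away, in
illustrative form (Stack is a made-up type, not the kernel's):

    #include <cstdint>
    #include <vector>

    struct Stack { uintptr_t bottom, top; }; // illustrative only

    // Before: validation had to walk every thread's stack range.
    bool in_any_thread_stack(const std::vector<Stack>& thread_stacks, uintptr_t addr)
    {
        for (const auto& s : thread_stacks)
            if (s.bottom <= addr && addr < s.top)
                return true;
        return false;
    }

    // After: stacks are ordinary entries in the process's sorted region
    // list, so the single find_mapped_region()-style binary search covers
    // them, and no per-thread iteration (or locking) is needed.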

The only drawback is that MemoryRegions don't support guard pages, so a
userspace stack overflow won't be handled as cleanly as it was prior to
this change.

This patch also removes some unnecessary acquisitions of the process lock
and moves locking into the internal helper functions instead of asserting
that the lock is held. Loaded ELF regions are now also kept in sorted
order, as the lookup code already expected.
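
Both of those cleanups guard the same invariant: find_mapped_region()
binary-searches the region vector, which is only correct if the vector is
sorted and is never read mid-update. A hypothetical debug check for the
ELF-loading path (Region is again a made-up stand-in for MemoryRegion):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Region { uintptr_t start, end; }; // stand-in for MemoryRegion

    // Sanity check after mapping ELF program headers: a binary search over
    // the region vector silently misbehaves if sortedness is ever broken.
    bool regions_sorted(const std::vector<Region>& regions)
    {
        return std::is_sorted(regions.begin(), regions.end(),
            [](const Region& a, const Region& b) { return a.start < b.start; });
    }
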
Bananymous 2026-01-16 16:09:38 +02:00
parent 0299d4d44e
commit 1143dc3cae
5 changed files with 955 additions and 603 deletions

View File

@@ -9,6 +9,7 @@
 #include <kernel/ELF.h>
 #include <kernel/FS/Inode.h>
 #include <kernel/Lock/Mutex.h>
+#include <kernel/Lock/RWLock.h>
 #include <kernel/Memory/Heap.h>
 #include <kernel/Memory/MemoryRegion.h>
 #include <kernel/Memory/SharedMemoryObject.h>
@@ -272,17 +273,20 @@ namespace Kernel
 	};

 	// Adds a new region to the sorted array of mapped regions.
+	// You must hold the writer end of m_memory_region_lock when calling this.
 	BAN::ErrorOr<void> add_mapped_region(BAN::UniqPtr<MemoryRegion>&&);

 	// If address is contained by a region, returns the index of that.
 	// Otherwise returns the index of the first region after this address.
+	// You must hold the reader end of m_memory_region_lock when calling this.
 	size_t find_mapped_region(vaddr_t) const;

 	BAN::ErrorOr<VirtualFileSystem::File> find_file(int fd, const char* path, int flags) const;
 	BAN::ErrorOr<FileParent> find_parent_file(int fd, const char* path, int flags) const;
 	BAN::ErrorOr<VirtualFileSystem::File> find_relative_parent(int fd, const char* path) const;

-	BAN::ErrorOr<void> validate_string_access(const char*);
-	BAN::ErrorOr<void> validate_pointer_access(const void*, size_t, bool needs_write);
+	BAN::ErrorOr<void> read_from_user(const void* user_addr, void* out, size_t size);
+	BAN::ErrorOr<void> read_string_from_user(const char* user_addr, char* out, size_t max_size);
+	BAN::ErrorOr<void> write_to_user(void* user_addr, const void* in, size_t size);
 	BAN::ErrorOr<MemoryRegion*> validate_and_pin_pointer_access(const void*, size_t, bool needs_write);

 	uint64_t signal_pending_mask() const
@@ -327,6 +331,7 @@
 	OpenFileDescriptorSet m_open_file_descriptors;

+	mutable RWLock m_memory_region_lock;
 	BAN::Vector<BAN::UniqPtr<MemoryRegion>> m_mapped_regions;

 	pid_t m_sid;
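
The replaced declarations mark a shift from validate-then-dereference
(validate_string_access / validate_pointer_access) to copy-in/copy-out
helpers. A hypothetical call site, sketched under the assumption that the
new helpers validate and copy in one step and fail with an error on a bad
pointer (sys_example and its argument are made up; the helper signatures
are the ones declared above):

    // Hypothetical syscall body using the new copy-based helpers.
    BAN::ErrorOr<long> Process::sys_example(int* user_value)
    {
        int value;
        // Copy in: a bad pointer fails here with an error instead of the
        // kernel ever dereferencing an unvalidated userspace address.
        TRY(read_from_user(user_value, &value, sizeof(value)));

        value *= 2; // work on the kernel-side copy

        // Copy out: the helper can take the reader end of
        // m_memory_region_lock internally, so the caller holds no lock.
        TRY(write_to_user(user_value, &value, sizeof(value)));
        return 0;
    }

Besides keeping the process lock off the fast path, this shape also narrows
the window where another thread could unmap the buffer between validation
and use; validate_and_pin_pointer_access presumably remains for callers
that must touch the mapping directly.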

View File

@@ -16,6 +16,7 @@
 namespace Kernel
 {

+	class MemoryBackedRegion;
 	class Process;

 	class Thread
@@ -103,9 +104,7 @@
 	vaddr_t kernel_stack_top() const { return m_kernel_stack->vaddr() + m_kernel_stack->size(); }
 	VirtualRange& kernel_stack() { return *m_kernel_stack; }

-	vaddr_t userspace_stack_bottom() const { return is_userspace() ? m_userspace_stack->vaddr() : UINTPTR_MAX; }
-	vaddr_t userspace_stack_top() const { return is_userspace() ? m_userspace_stack->vaddr() + m_userspace_stack->size() : 0; }
-	VirtualRange& userspace_stack() { ASSERT(is_userspace()); return *m_userspace_stack; }
+	MemoryBackedRegion& userspace_stack() { ASSERT(is_userspace() && m_userspace_stack); return *m_userspace_stack; }

 	static Thread& current();
 	static pid_t current_tid();
@@ -129,7 +128,7 @@
 	void set_gsbase(vaddr_t base) { m_gsbase = base; }
 	vaddr_t get_gsbase() const { return m_gsbase; }

-	size_t virtual_page_count() const { return (m_kernel_stack ? (m_kernel_stack->size() / PAGE_SIZE) : 0) + (m_userspace_stack ? (m_userspace_stack->size() / PAGE_SIZE) : 0); }
+	size_t virtual_page_count() const { return m_kernel_stack ? (m_kernel_stack->size() / PAGE_SIZE) : 0; }
 	size_t physical_page_count() const { return virtual_page_count(); }

 	YieldRegisters& yield_registers() { return m_yield_registers; }
@@ -160,7 +159,7 @@
 	BAN::UniqPtr<PageTable> m_keep_alive_page_table;

 	BAN::UniqPtr<VirtualRange> m_kernel_stack;
-	BAN::UniqPtr<VirtualRange> m_userspace_stack;
+	MemoryBackedRegion* m_userspace_stack { nullptr };

 	const pid_t m_tid { 0 };
 	State m_state { State::NotStarted };
 	Process* m_process { nullptr };

View File

@@ -148,12 +148,12 @@
 	if (m_preallocated)
 		return false;

+	SpinLockGuard _(m_lock);
+
 	const size_t index = (vaddr - this->vaddr()) / PAGE_SIZE;
 	if (m_paddrs[index])
 		return false;

-	SpinLockGuard _(m_lock);
-
 	m_paddrs[index] = Heap::get().take_free_page();
 	if (m_paddrs[index] == 0)
 		return BAN::Error::from_errno(ENOMEM);
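
Note the direction of the move: taking m_lock only after checking
m_paddrs[index] was a check-then-act race, since two threads could both
observe an empty slot and both allocate a page for it. The general shape of
the bug, in a minimal illustrative form (allocate_page is a made-up
stand-in for Heap::get().take_free_page()):

    #include <cstdint>
    #include <mutex>

    uintptr_t allocate_page(); // hypothetical allocator

    std::mutex g_lock;
    uintptr_t g_slot = 0; // stands in for one m_paddrs[index] entry

    uintptr_t racy_get()
    {
        if (g_slot)                   // two threads can both see 0 here...
            return g_slot;
        std::lock_guard lock(g_lock);
        g_slot = allocate_page();     // ...and both allocate: one page leaks
        return g_slot;
    }

    uintptr_t safe_get()
    {
        std::lock_guard lock(g_lock); // lock first, as the hunk above now does
        if (!g_slot)                  // check and update are atomic together
            g_slot = allocate_page();
        return g_slot;
    }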

File diff suppressed because it is too large

View File

@@ -4,6 +4,7 @@
 #include <kernel/InterruptController.h>
 #include <kernel/InterruptStack.h>
 #include <kernel/Memory/kmalloc.h>
+#include <kernel/Memory/MemoryBackedRegion.h>
 #include <kernel/Process.h>
 #include <kernel/Scheduler.h>
 #include <kernel/Thread.h>
@@ -218,13 +219,16 @@
 		true, true
 	));

-	thread->m_userspace_stack = TRY(VirtualRange::create_to_vaddr_range(
+	auto userspace_stack = TRY(MemoryBackedRegion::create(
 		page_table,
-		stack_addr_start,
+		USERSPACE_END,
 		userspace_stack_size,
+		{ stack_addr_start, USERSPACE_END },
+		MemoryRegion::Type::PRIVATE,
 		PageTable::Flags::UserSupervisor | PageTable::Flags::ReadWrite | PageTable::Flags::Present,
-		false, true
+		O_RDWR
 	));
+	thread->m_userspace_stack = userspace_stack.ptr();
+	TRY(process->add_mapped_region(BAN::move(userspace_stack)));

 	thread_deleter.disable();
@@ -319,16 +323,18 @@
 {
 	auto* thread = TRY(create_userspace(m_process, m_process->page_table()));

+	if (Processor::get_current_sse_thread() == this)
 		save_sse();
 	memcpy(thread->m_sse_storage, m_sse_storage, sizeof(m_sse_storage));

-	TRY(thread->userspace_stack().allocate_page_for_demand_paging(thread->userspace_stack_top() - PAGE_SIZE));
-	PageTable::with_fast_page(thread->userspace_stack().paddr_of(thread->userspace_stack_top() - PAGE_SIZE), [=] {
-		PageTable::fast_page_as<void*>(PAGE_SIZE - sizeof(uintptr_t)) = arg;
-	});
+	TRY(thread->userspace_stack().copy_data_to_region(
+		thread->m_userspace_stack->size() - sizeof(void*),
+		reinterpret_cast<const uint8_t*>(&arg),
+		sizeof(void*)
+	));

 	const vaddr_t entry_addr = reinterpret_cast<vaddr_t>(entry);
-	thread->setup_exec(entry_addr, thread->userspace_stack_top() - sizeof(uintptr_t));
+	thread->setup_exec(entry_addr, thread->userspace_stack().vaddr() + thread->userspace_stack().size() - sizeof(void*));

 	return thread;
 }
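
The arithmetic in that hunk is worth spelling out: the argument is written
into the last pointer-sized slot of the stack region, and the new thread's
stack pointer is set to the address of that same slot. An illustrative
check that the two expressions name the same address (function and
parameter names are made up):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // The region offset used by copy_data_to_region() and the stack
    // pointer passed to setup_exec() must refer to the same slot.
    void check_arg_slot(uintptr_t stack_vaddr, size_t stack_size)
    {
        const uintptr_t stack_top = stack_vaddr + stack_size;
        const uintptr_t sp        = stack_top - sizeof(void*);  // setup_exec()
        const size_t    offset    = stack_size - sizeof(void*); // copy_data_to_region()
        assert(stack_vaddr + offset == sp); // arg sits exactly where sp points
    }
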
@@ -346,13 +352,17 @@
 	thread->m_is_userspace = true;

 	thread->m_kernel_stack = TRY(m_kernel_stack->clone(new_process->page_table()));
-	thread->m_userspace_stack = TRY(m_userspace_stack->clone(new_process->page_table()));
+
+	const auto stack_index = new_process->find_mapped_region(m_userspace_stack->vaddr());
+	thread->m_userspace_stack = static_cast<MemoryBackedRegion*>(new_process->m_mapped_regions[stack_index].ptr());
+	ASSERT(thread->m_userspace_stack->vaddr() == m_userspace_stack->vaddr());

 	thread->m_fsbase = m_fsbase;
 	thread->m_gsbase = m_gsbase;

 	thread->m_state = State::NotStarted;

+	if (Processor::get_current_sse_thread() == this)
 		save_sse();
 	memcpy(thread->m_sse_storage, m_sse_storage, sizeof(m_sse_storage));
@@ -397,29 +407,18 @@
 	if (needed_size > m_userspace_stack->size())
 		return BAN::Error::from_errno(ENOBUFS);

-	vaddr_t vaddr = userspace_stack_top() - needed_size;
+	vaddr_t vaddr = userspace_stack().vaddr() + userspace_stack().size() - needed_size;
 	const size_t page_count = BAN::Math::div_round_up<size_t>(needed_size, PAGE_SIZE);
 	for (size_t i = 0; i < page_count; i++)
-		TRY(m_userspace_stack->allocate_page_for_demand_paging(vaddr + i * PAGE_SIZE));
+		TRY(m_userspace_stack->allocate_page_containing(vaddr + i * PAGE_SIZE, true));

 	const auto stack_copy_buf =
 		[this](BAN::ConstByteSpan buffer, vaddr_t vaddr) -> void
 		{
-			ASSERT(vaddr + buffer.size() <= userspace_stack_top());
-
-			size_t bytes_copied = 0;
-			while (bytes_copied < buffer.size())
-			{
-				const size_t to_copy = BAN::Math::min<size_t>(buffer.size() - bytes_copied, PAGE_SIZE - (vaddr % PAGE_SIZE));
-				PageTable::with_fast_page(userspace_stack().paddr_of(vaddr & PAGE_ADDR_MASK), [=]() {
-					memcpy(PageTable::fast_page_as_ptr(vaddr % PAGE_SIZE), buffer.data() + bytes_copied, to_copy);
-				});
-				vaddr += to_copy;
-				bytes_copied += to_copy;
-			}
+			ASSERT(vaddr >= m_userspace_stack->vaddr());
+			ASSERT(vaddr + buffer.size() <= m_userspace_stack->vaddr() + m_userspace_stack->size());
+			MUST(m_userspace_stack->copy_data_to_region(vaddr - m_userspace_stack->vaddr(), buffer.data(), buffer.size()));
 		};

 	const auto stack_push_buf =
@@ -471,7 +470,7 @@
 		stack_push_str(envp[i]);
 	}

-	setup_exec(entry, userspace_stack_top() - needed_size);
+	setup_exec(entry, m_userspace_stack->vaddr() + m_userspace_stack->size() - needed_size);

 	return {};
 }