Kernel: Implement copy-on-write memory for file backed `mmap`s

This commit is contained in:
Bananymous 2024-09-11 19:32:42 +03:00
parent 4006a04817
commit c77ad5fb34
15 changed files with 217 additions and 208 deletions

View File

@ -24,7 +24,9 @@ You can find a live demo [here](https://bananymous.com/banan-os)
- [ ] Program launcher
- [ ] Some nice apps
- [x] ELF dynamic linking
- [ ] copy-on-write memory
- [x] copy-on-write memory
- [x] file mappings
- [ ] anonymous mappings
#### Drivers
- [x] NVMe disks

View File

@ -159,6 +159,7 @@ namespace Kernel
private:
BAN::WeakPtr<SharedFileData> m_shared_region;
friend class FileBackedRegion;
friend class SharedFileData;
};
}

View File

@ -12,6 +12,8 @@ namespace Kernel
void sync(size_t page_index);
Mutex mutex;
// FIXME: this should probably be ordered tree like map
// for fast lookup and less memory usage
BAN::Vector<paddr_t> pages;
@ -33,7 +35,7 @@ namespace Kernel
virtual BAN::ErrorOr<BAN::UniqPtr<MemoryRegion>> clone(PageTable& new_page_table) override;
protected:
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t vaddr) override;
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t vaddr, bool wants_write) override;
private:
FileBackedRegion(BAN::RefPtr<Inode>, PageTable&, off_t offset, ssize_t size, Type flags, PageTable::flags_t page_flags);
@ -42,7 +44,7 @@ namespace Kernel
BAN::RefPtr<Inode> m_inode;
const off_t m_offset;
// FIXME: is this even synchronized?
BAN::Vector<paddr_t> m_dirty_pages;
BAN::RefPtr<SharedFileData> m_shared_data;
};

View File

@ -23,7 +23,7 @@ namespace Kernel
BAN::ErrorOr<void> copy_data_to_region(size_t offset_into_region, const uint8_t* buffer, size_t buffer_size);
protected:
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t vaddr) override;
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t vaddr, bool wants_write) override;
private:
MemoryBackedRegion(PageTable&, size_t size, Type, PageTable::flags_t);

View File

@ -34,6 +34,8 @@ namespace Kernel
bool contains_fully(vaddr_t address, size_t size) const;
bool overlaps(vaddr_t address, size_t size) const;
bool writable() const { return m_flags & PageTable::Flags::ReadWrite; }
size_t size() const { return m_size; }
vaddr_t vaddr() const { return m_vaddr; }
@ -45,7 +47,7 @@ namespace Kernel
// Returns error if no memory was available
// Returns true if page was successfully allocated
// Returns false if page was already allocated
BAN::ErrorOr<bool> allocate_page_containing(vaddr_t address);
BAN::ErrorOr<bool> allocate_page_containing(vaddr_t address, bool wants_write);
virtual BAN::ErrorOr<BAN::UniqPtr<MemoryRegion>> clone(PageTable& new_page_table) = 0;
@ -53,7 +55,7 @@ namespace Kernel
MemoryRegion(PageTable&, size_t size, Type type, PageTable::flags_t flags);
BAN::ErrorOr<void> initialize(AddressRange);
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t address) = 0;
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t address, bool wants_write) = 0;
protected:
PageTable& m_page_table;

View File

@ -59,7 +59,7 @@ namespace Kernel
virtual BAN::ErrorOr<void> msync(vaddr_t, size_t, int) override { return {}; }
protected:
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t vaddr) override;
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t vaddr, bool wants_write) override;
private:
SharedMemoryObject(BAN::RefPtr<SharedMemoryObjectManager::Object> object, PageTable& page_table)

View File

@ -210,7 +210,7 @@ namespace Kernel
// Returns error if page could not be allocated
// Returns true if the page was allocated successfully
// Returns false if access was a page violation (segfault)
BAN::ErrorOr<bool> allocate_page_for_demand_paging(vaddr_t addr);
BAN::ErrorOr<bool> allocate_page_for_demand_paging(vaddr_t addr, bool wants_write);
BAN::ErrorOr<BAN::String> absolute_path_of(BAN::StringView) const;
@ -225,8 +225,8 @@ namespace Kernel
static BAN::ErrorOr<BAN::UniqPtr<LibELF::LoadableELF>> load_elf_for_exec(const Credentials&, BAN::StringView file_path, const BAN::String& cwd, Kernel::PageTable&);
BAN::ErrorOr<void> validate_string_access(const char*);
BAN::ErrorOr<void> validate_pointer_access_check(const void*, size_t);
BAN::ErrorOr<void> validate_pointer_access(const void*, size_t);
BAN::ErrorOr<void> validate_pointer_access_check(const void*, size_t, bool needs_write);
BAN::ErrorOr<void> validate_pointer_access(const void*, size_t, bool needs_write);
uint64_t signal_pending_mask() const
{

View File

@ -280,8 +280,10 @@ namespace Kernel
// Returns error if no memory was available
// Returns true if page was successfully allocated
// Returns false if page was already allocated
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t vaddr) override
virtual BAN::ErrorOr<bool> allocate_page_containing_impl(vaddr_t vaddr, bool wants_write) override
{
(void)wants_write;
vaddr &= PAGE_ADDR_MASK;
if (m_page_table.physical_address_of(vaddr))
return false;

View File

@ -200,13 +200,13 @@ namespace Kernel
goto done;
}
// Try demand paging on non present pages
PageFaultError page_fault_error;
page_fault_error.raw = error;
if (pid && !page_fault_error.present)
if (pid)
{
PageFaultError page_fault_error;
page_fault_error.raw = error;
Processor::set_interrupt_state(InterruptState::Enabled);
auto result = Process::current().allocate_page_for_demand_paging(regs->cr2);
auto result = Process::current().allocate_page_for_demand_paging(regs->cr2, page_fault_error.write);
Processor::set_interrupt_state(InterruptState::Disabled);
if (!result.is_error() && result.value())

View File

@ -13,20 +13,8 @@ namespace Kernel
if (offset < 0 || offset % PAGE_SIZE || size == 0)
return BAN::Error::from_errno(EINVAL);
switch (type)
{
case Type::PRIVATE:
if (offset >= inode->size())
return BAN::Error::from_errno(EOVERFLOW);
break;
case Type::SHARED:
if ((size > (size_t)inode->size() || (size_t)offset > (size_t)inode->size() - size))
return BAN::Error::from_errno(EOVERFLOW);
break;
default:
ASSERT_NOT_REACHED();
break;
}
if ((size > (size_t)inode->size() || (size_t)offset > (size_t)inode->size() - size))
return BAN::Error::from_errno(EOVERFLOW);
auto* region_ptr = new FileBackedRegion(inode, page_table, offset, size, type, flags);
if (region_ptr == nullptr)
@ -35,19 +23,19 @@ namespace Kernel
TRY(region->initialize(address_range));
if (type == Type::SHARED)
if (type == Type::PRIVATE && (flags & PageTable::Flags::ReadWrite))
TRY(region->m_dirty_pages.resize(BAN::Math::div_round_up<size_t>(size, PAGE_SIZE)));
LockGuard _(inode->m_mutex);
if (inode->m_shared_region.valid())
region->m_shared_data = inode->m_shared_region.lock();
else
{
LockGuard _(inode->m_mutex);
if (inode->m_shared_region.valid())
region->m_shared_data = inode->m_shared_region.lock();
else
{
auto shared_data = TRY(BAN::RefPtr<SharedFileData>::create());
TRY(shared_data->pages.resize(BAN::Math::div_round_up<size_t>(inode->size(), PAGE_SIZE)));
shared_data->inode = inode;
inode->m_shared_region = TRY(shared_data->get_weak_ptr());
region->m_shared_data = BAN::move(shared_data);
}
auto shared_data = TRY(BAN::RefPtr<SharedFileData>::create());
TRY(shared_data->pages.resize(BAN::Math::div_round_up<size_t>(inode->size(), PAGE_SIZE)));
shared_data->inode = inode;
inode->m_shared_region = TRY(shared_data->get_weak_ptr());
region->m_shared_data = BAN::move(shared_data);
}
return region;
@ -64,32 +52,27 @@ namespace Kernel
{
if (m_vaddr == 0)
return;
if (m_type == Type::SHARED)
return;
size_t needed_pages = BAN::Math::div_round_up<size_t>(m_size, PAGE_SIZE);
for (size_t i = 0; i < needed_pages; i++)
{
paddr_t paddr = m_page_table.physical_address_of(m_vaddr + i * PAGE_SIZE);
if (paddr != 0)
Heap::get().release_page(paddr);
}
for (paddr_t dirty_page : m_dirty_pages)
if (dirty_page)
Heap::get().release_page(dirty_page);
}
SharedFileData::~SharedFileData()
{
// no-one should be referencing this anymore
[[maybe_unused]] bool success = mutex.try_lock();
ASSERT(success);
for (size_t i = 0; i < pages.size(); i++)
{
if (pages[i] == 0)
continue;
sync(i);
}
if (pages[i])
sync(i);
mutex.unlock();
}
void SharedFileData::sync(size_t page_index)
{
// FIXME: should this be locked?
ASSERT(mutex.is_locked());
if (pages[page_index] == 0)
return;
@ -105,13 +88,14 @@ namespace Kernel
BAN::ErrorOr<void> FileBackedRegion::msync(vaddr_t address, size_t size, int flags)
{
if (flags != MS_SYNC)
return BAN::Error::from_errno(ENOTSUP);
dprintln("async file backed mmap msync");
if (m_type != Type::SHARED)
return {};
vaddr_t first_page = address & PAGE_ADDR_MASK;
vaddr_t last_page = BAN::Math::div_round_up<vaddr_t>(address + size, PAGE_SIZE) * PAGE_SIZE;
const vaddr_t first_page = address & PAGE_ADDR_MASK;
const vaddr_t last_page = BAN::Math::div_round_up<vaddr_t>(address + size, PAGE_SIZE) * PAGE_SIZE;
LockGuard _(m_shared_data->mutex);
for (vaddr_t page_addr = first_page; page_addr < last_page; page_addr += PAGE_SIZE)
if (contains(page_addr))
m_shared_data->sync((page_addr - m_vaddr) / PAGE_SIZE);
@ -119,89 +103,96 @@ namespace Kernel
return {};
}
BAN::ErrorOr<bool> FileBackedRegion::allocate_page_containing_impl(vaddr_t address)
BAN::ErrorOr<bool> FileBackedRegion::allocate_page_containing_impl(vaddr_t address, bool wants_write)
{
ASSERT(contains(address));
ASSERT(m_type == Type::SHARED || m_type == Type::PRIVATE);
ASSERT(!wants_write || writable());
// Check if address is already mapped
const vaddr_t vaddr = address & PAGE_ADDR_MASK;
if (m_page_table.physical_address_of(vaddr) != 0)
return false;
if (m_type == Type::PRIVATE)
const size_t local_page_index = (vaddr - m_vaddr) / PAGE_SIZE;
const size_t shared_page_index = local_page_index + m_offset / PAGE_SIZE;
if (m_page_table.physical_address_of(vaddr) == 0)
{
// Map new physical page to address
paddr_t paddr = Heap::get().take_free_page();
if (paddr == 0)
return BAN::Error::from_errno(ENOMEM);
ASSERT(m_shared_data);
LockGuard _(m_shared_data->mutex);
// Temporarily force mapping to be writable so kernel can write to it
m_page_table.map_page_at(paddr, vaddr, m_flags | PageTable::Flags::ReadWrite);
ASSERT(&PageTable::current() == &m_page_table);
memset(reinterpret_cast<void*>(vaddr), 0x00, PAGE_SIZE);
const size_t file_offset = m_offset + (vaddr - m_vaddr);
if (file_offset < static_cast<size_t>(m_inode->size()))
bool shared_data_has_correct_page = false;
if (m_shared_data->pages[shared_page_index] == 0)
{
const size_t bytes = BAN::Math::min<size_t>(BAN::Math::min<size_t>(m_offset + m_size, m_inode->size()) - file_offset, PAGE_SIZE);
auto read_ret = m_inode->read(file_offset, BAN::ByteSpan((uint8_t*)vaddr, bytes));
if (read_ret.is_error())
{
Heap::get().release_page(paddr);
m_page_table.unmap_page(vaddr);
return read_ret.release_error();
}
if (read_ret.value() < bytes)
{
dwarnln("Only {}/{} bytes read", read_ret.value(), bytes);
Heap::get().release_page(paddr);
m_page_table.unmap_page(vaddr);
return BAN::Error::from_errno(EIO);
}
}
// Disable writable if not wanted
if (!(m_flags & PageTable::Flags::ReadWrite))
m_page_table.map_page_at(paddr, vaddr, m_flags);
}
else if (m_type == Type::SHARED)
{
LockGuard _(m_inode->m_mutex);
ASSERT(m_inode->m_shared_region.valid());
ASSERT(m_shared_data->pages.size() == BAN::Math::div_round_up<size_t>(m_inode->size(), PAGE_SIZE));
auto& pages = m_shared_data->pages;
size_t page_index = (vaddr - m_vaddr) / PAGE_SIZE;
if (pages[page_index] == 0)
{
pages[page_index] = Heap::get().take_free_page();
if (pages[page_index] == 0)
m_shared_data->pages[shared_page_index] = Heap::get().take_free_page();
if (m_shared_data->pages[shared_page_index] == 0)
return BAN::Error::from_errno(ENOMEM);
size_t offset = vaddr - m_vaddr;
size_t bytes = BAN::Math::min<size_t>(m_size - offset, PAGE_SIZE);
const size_t offset = (vaddr - m_vaddr) + m_offset;
ASSERT(offset % 4096 == 0);
const size_t bytes = BAN::Math::min<size_t>(m_inode->size() - offset, PAGE_SIZE);
memset(m_shared_data->page_buffer, 0x00, PAGE_SIZE);
TRY(m_inode->read(offset, BAN::ByteSpan(m_shared_data->page_buffer, bytes)));
shared_data_has_correct_page = true;
PageTable::with_fast_page(pages[page_index], [&] {
memcpy(PageTable::fast_page_as_ptr(), m_shared_data->page_buffer, bytes);
memset(PageTable::fast_page_as_ptr(bytes), 0x00, PAGE_SIZE - bytes);
PageTable::with_fast_page(m_shared_data->pages[shared_page_index], [&] {
memcpy(PageTable::fast_page_as_ptr(), m_shared_data->page_buffer, PAGE_SIZE);
});
}
paddr_t paddr = pages[page_index];
ASSERT(paddr);
m_page_table.map_page_at(paddr, vaddr, m_flags);
if (m_type == Type::PRIVATE && wants_write)
{
const paddr_t paddr = Heap::get().take_free_page();
if (paddr == 0)
return BAN::Error::from_errno(ENOMEM);
if (!shared_data_has_correct_page)
{
PageTable::with_fast_page(m_shared_data->pages[shared_page_index], [&] {
memcpy(m_shared_data->page_buffer, PageTable::fast_page_as_ptr(), PAGE_SIZE);
});
}
PageTable::with_fast_page(paddr, [&] {
memcpy(PageTable::fast_page_as_ptr(), m_shared_data->page_buffer, PAGE_SIZE);
});
m_dirty_pages[local_page_index] = paddr;
m_page_table.map_page_at(paddr, vaddr, m_flags);
}
else
{
const paddr_t paddr = m_shared_data->pages[shared_page_index];
auto flags = m_flags;
if (m_type == Type::PRIVATE)
flags &= ~PageTable::Flags::ReadWrite;
m_page_table.map_page_at(paddr, vaddr, flags);
}
}
else
{
ASSERT_NOT_REACHED();
// page does not need remapping
if (m_type != Type::PRIVATE || !wants_write)
return false;
ASSERT(writable());
// page is already mapped as writable
if (m_page_table.get_page_flags(vaddr) & PageTable::Flags::ReadWrite)
return false;
const paddr_t paddr = Heap::get().take_free_page();
if (paddr == 0)
return BAN::Error::from_errno(ENOMEM);
ASSERT(m_shared_data);
LockGuard _(m_shared_data->mutex);
ASSERT(m_shared_data->pages[shared_page_index]);
PageTable::with_fast_page(m_shared_data->pages[shared_page_index], [&] {
memcpy(m_shared_data->page_buffer, PageTable::fast_page_as_ptr(), PAGE_SIZE);
});
PageTable::with_fast_page(paddr, [&] {
memcpy(PageTable::fast_page_as_ptr(), m_shared_data->page_buffer, PAGE_SIZE);
});
m_dirty_pages[local_page_index] = paddr;
m_page_table.map_page_at(paddr, vaddr, m_flags);
}
return true;
@ -212,31 +203,26 @@ namespace Kernel
const size_t aligned_size = (m_size + PAGE_SIZE - 1) & PAGE_ADDR_MASK;
auto result = TRY(FileBackedRegion::create(m_inode, page_table, m_offset, m_size, { .start = m_vaddr, .end = m_vaddr + aligned_size }, m_type, m_flags));
// shared regions can just go through demand paging
if (m_type == Type::SHARED)
return BAN::UniqPtr<MemoryRegion>(BAN::move(result));
// non-dirty pages can go through demand paging
ASSERT(m_type == Type::PRIVATE);
for (size_t offset = 0; offset < m_size; offset += PAGE_SIZE)
for (size_t i = 0; i < m_dirty_pages.size(); i++)
{
const vaddr_t vaddr = m_vaddr + offset;
if (m_page_table.physical_address_of(vaddr) == 0)
if (m_dirty_pages[i] == 0)
continue;
ASSERT(&PageTable::current() == &m_page_table);
const vaddr_t vaddr = m_vaddr + i * PAGE_SIZE;
const paddr_t paddr = Heap::get().take_free_page();
if (paddr == 0)
return BAN::Error::from_errno(ENOMEM);
page_table.map_page_at(paddr, vaddr, m_flags);
const size_t to_copy = BAN::Math::min<size_t>(PAGE_SIZE, m_size - offset);
ASSERT(&m_page_table == &PageTable::current() || &m_page_table == &PageTable::kernel());
PageTable::with_fast_page(paddr, [&] {
memcpy(PageTable::fast_page_as_ptr(), reinterpret_cast<void*>(vaddr), to_copy);
memset(PageTable::fast_page_as_ptr(to_copy), 0, PAGE_SIZE - to_copy);
memcpy(PageTable::fast_page_as_ptr(), reinterpret_cast<void*>(vaddr), PAGE_SIZE);
});
result->m_page_table.map_page_at(paddr, vaddr, m_flags);
result->m_dirty_pages[i] = paddr;
}
return BAN::UniqPtr<MemoryRegion>(BAN::move(result));

View File

@ -38,11 +38,12 @@ namespace Kernel
}
}
BAN::ErrorOr<bool> MemoryBackedRegion::allocate_page_containing_impl(vaddr_t address)
BAN::ErrorOr<bool> MemoryBackedRegion::allocate_page_containing_impl(vaddr_t address, bool wants_write)
{
ASSERT(m_type == Type::PRIVATE);
ASSERT(contains(address));
(void)wants_write;
// Check if address is already mapped
vaddr_t vaddr = address & PAGE_ADDR_MASK;
@ -93,7 +94,7 @@ namespace Kernel
vaddr_t page_offset = write_vaddr % PAGE_SIZE;
size_t bytes = BAN::Math::min<size_t>(buffer_size - written, PAGE_SIZE - page_offset);
TRY(allocate_page_containing(write_vaddr));
TRY(allocate_page_containing(write_vaddr, true));
PageTable::with_fast_page(m_page_table.physical_address_of(write_vaddr & PAGE_ADDR_MASK), [&] {
memcpy(PageTable::fast_page_as_ptr(page_offset), (void*)(buffer + written), bytes);

View File

@ -47,9 +47,12 @@ namespace Kernel
return true;
}
BAN::ErrorOr<bool> MemoryRegion::allocate_page_containing(vaddr_t address)
BAN::ErrorOr<bool> MemoryRegion::allocate_page_containing(vaddr_t address, bool wants_write)
{
auto ret = allocate_page_containing_impl(address);
ASSERT(contains(address));
if (wants_write && !writable())
return false;
auto ret = allocate_page_containing_impl(address, wants_write);
if (!ret.is_error() && ret.value())
m_physical_page_count++;
return ret;

View File

@ -87,9 +87,10 @@ namespace Kernel
return BAN::UniqPtr<MemoryRegion>(BAN::move(region));
}
BAN::ErrorOr<bool> SharedMemoryObject::allocate_page_containing_impl(vaddr_t address)
BAN::ErrorOr<bool> SharedMemoryObject::allocate_page_containing_impl(vaddr_t address, bool wants_write)
{
ASSERT(contains(address));
(void)wants_write;
// Check if address is already mapped
vaddr_t vaddr = address & PAGE_ADDR_MASK;

View File

@ -655,7 +655,11 @@ namespace Kernel::PCI
if (m_vaddr == 0)
return BAN::Error::from_errno(ENOMEM);
PageTable::kernel().map_range_at(m_paddr, m_vaddr, m_size, PageTable::Flags::ReadWrite | PageTable::Flags::Present);
PageTable::kernel().map_range_at(
m_paddr, m_vaddr, m_size,
PageTable::Flags::ReadWrite | PageTable::Flags::Present,
PageTable::MemoryType::Uncached
);
return {};
}

View File

@ -362,7 +362,7 @@ namespace Kernel
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(termios, sizeof(termios)));
TRY(validate_pointer_access(termios, sizeof(termios), true));
auto inode = TRY(m_open_file_descriptors.inode_of(fildes));
if (!inode->is_tty())
@ -381,7 +381,7 @@ namespace Kernel
LockGuard _(m_process_lock);
TRY(validate_pointer_access(termios, sizeof(termios)));
TRY(validate_pointer_access(termios, sizeof(termios), false));
auto inode = TRY(m_open_file_descriptors.inode_of(fildes));
if (!inode->is_tty())
@ -484,7 +484,7 @@ namespace Kernel
BAN::Vector<BAN::String> str_argv;
for (int i = 0; argv && argv[i]; i++)
{
TRY(validate_pointer_access(argv + i, sizeof(char*)));
TRY(validate_pointer_access(argv + i, sizeof(char*), false));
TRY(validate_string_access(argv[i]));
TRY(str_argv.emplace_back(argv[i]));
}
@ -492,7 +492,7 @@ namespace Kernel
BAN::Vector<BAN::String> str_envp;
for (int i = 0; envp && envp[i]; i++)
{
TRY(validate_pointer_access(envp + 1, sizeof(char*)));
TRY(validate_pointer_access(envp + 1, sizeof(char*), false));
TRY(validate_string_access(envp[i]));
TRY(str_envp.emplace_back(envp[i]));
}
@ -640,7 +640,7 @@ namespace Kernel
if (stat_loc)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(stat_loc, sizeof(stat_loc)));
TRY(validate_pointer_access(stat_loc, sizeof(stat_loc), true));
*stat_loc = exit_code;
}
remove_pending_signal(SIGCHLD);
@ -676,9 +676,9 @@ namespace Kernel
{
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(rqtp, sizeof(timespec)));
TRY(validate_pointer_access(rqtp, sizeof(timespec), false));
if (rmtp)
TRY(validate_pointer_access(rmtp, sizeof(timespec)));
TRY(validate_pointer_access(rmtp, sizeof(timespec), true));
}
const uint64_t sleep_ns = (rqtp->tv_sec * 1'000'000'000) + rqtp->tv_nsec;
@ -718,9 +718,9 @@ namespace Kernel
LockGuard _(m_process_lock);
if (value)
TRY(validate_pointer_access(value, sizeof(itimerval)));
TRY(validate_pointer_access(value, sizeof(itimerval), false));
if (ovalue)
TRY(validate_pointer_access(ovalue, sizeof(itimerval)));
TRY(validate_pointer_access(ovalue, sizeof(itimerval), true));
{
SpinLockGuard _(s_process_lock);
@ -846,7 +846,7 @@ namespace Kernel
return {};
}
BAN::ErrorOr<bool> Process::allocate_page_for_demand_paging(vaddr_t address)
BAN::ErrorOr<bool> Process::allocate_page_for_demand_paging(vaddr_t address, bool wants_write)
{
ASSERT(&Process::current() == this);
@ -862,7 +862,7 @@ namespace Kernel
{
if (!region->contains(address))
continue;
TRY(region->allocate_page_containing(address));
TRY(region->allocate_page_containing(address, wants_write));
return true;
}
@ -971,14 +971,14 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_read(int fd, void* buffer, size_t count)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(buffer, count));
TRY(validate_pointer_access(buffer, count, true));
return TRY(m_open_file_descriptors.read(fd, BAN::ByteSpan((uint8_t*)buffer, count)));
}
BAN::ErrorOr<long> Process::sys_write(int fd, const void* buffer, size_t count)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(buffer, count));
TRY(validate_pointer_access(buffer, count, false));
return TRY(m_open_file_descriptors.write(fd, BAN::ByteSpan((uint8_t*)buffer, count)));
}
@ -1064,7 +1064,7 @@ namespace Kernel
{
LockGuard _(m_process_lock);
TRY(validate_string_access(path));
TRY(validate_pointer_access(buffer, bufsize));
TRY(validate_pointer_access(buffer, bufsize, true));
auto absolute_path = TRY(absolute_path_of(path));
@ -1075,7 +1075,7 @@ namespace Kernel
{
LockGuard _(m_process_lock);
TRY(validate_string_access(path));
TRY(validate_pointer_access(buffer, bufsize));
TRY(validate_pointer_access(buffer, bufsize, true));
// FIXME: handle O_SEARCH in fd
auto parent_path = TRY(m_open_file_descriptors.path_of(fd));
@ -1091,7 +1091,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_pread(int fd, void* buffer, size_t count, off_t offset)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(buffer, count));
TRY(validate_pointer_access(buffer, count, true));
auto inode = TRY(m_open_file_descriptors.inode_of(fd));
return TRY(inode->read(offset, { (uint8_t*)buffer, count }));
}
@ -1144,8 +1144,8 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_getsockname(int socket, sockaddr* address, socklen_t* address_len)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(address_len, sizeof(address_len)));
TRY(validate_pointer_access(address, *address_len));
TRY(validate_pointer_access(address_len, sizeof(address_len), true));
TRY(validate_pointer_access(address, *address_len, true));
auto inode = TRY(m_open_file_descriptors.inode_of(socket));
if (!inode->mode().ifsock())
@ -1158,8 +1158,8 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_getsockopt(int socket, int level, int option_name, void* option_value, socklen_t* option_len)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(option_len, sizeof(option_len)));
TRY(validate_pointer_access(option_value, *option_len));
TRY(validate_pointer_access(option_len, sizeof(option_len), true));
TRY(validate_pointer_access(option_value, *option_len, true));
auto inode = TRY(m_open_file_descriptors.inode_of(socket));
if (!inode->mode().ifsock())
@ -1180,7 +1180,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_setsockopt(int socket, int level, int option_name, const void* option_value, socklen_t option_len)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(option_value, option_len));
TRY(validate_pointer_access(option_value, option_len, false));
auto inode = TRY(m_open_file_descriptors.inode_of(socket));
if (!inode->mode().ifsock())
@ -1202,8 +1202,8 @@ namespace Kernel
LockGuard _(m_process_lock);
if (address)
{
TRY(validate_pointer_access(address_len, sizeof(*address_len)));
TRY(validate_pointer_access(address, *address_len));
TRY(validate_pointer_access(address_len, sizeof(*address_len), true));
TRY(validate_pointer_access(address, *address_len, true));
}
auto inode = TRY(m_open_file_descriptors.inode_of(socket));
@ -1216,7 +1216,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_bind(int socket, const sockaddr* address, socklen_t address_len)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(address, address_len));
TRY(validate_pointer_access(address, address_len, false));
auto inode = TRY(m_open_file_descriptors.inode_of(socket));
if (!inode->mode().ifsock())
@ -1229,7 +1229,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_connect(int socket, const sockaddr* address, socklen_t address_len)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(address, address_len));
TRY(validate_pointer_access(address, address_len, false));
auto inode = TRY(m_open_file_descriptors.inode_of(socket));
if (!inode->mode().ifsock())
@ -1254,9 +1254,9 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_sendto(const sys_sendto_t* arguments)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(arguments, sizeof(sys_sendto_t)));
TRY(validate_pointer_access(arguments->message, arguments->length));
TRY(validate_pointer_access(arguments->dest_addr, arguments->dest_len));
TRY(validate_pointer_access(arguments, sizeof(sys_sendto_t), false));
TRY(validate_pointer_access(arguments->message, arguments->length, false));
TRY(validate_pointer_access(arguments->dest_addr, arguments->dest_len, false));
auto inode = TRY(m_open_file_descriptors.inode_of(arguments->socket));
if (!inode->mode().ifsock())
@ -1278,12 +1278,12 @@ namespace Kernel
return BAN::Error::from_errno(EINVAL);
LockGuard _(m_process_lock);
TRY(validate_pointer_access(arguments, sizeof(sys_recvfrom_t)));
TRY(validate_pointer_access(arguments->buffer, arguments->length));
TRY(validate_pointer_access(arguments, sizeof(sys_recvfrom_t), false));
TRY(validate_pointer_access(arguments->buffer, arguments->length, true));
if (arguments->address)
{
TRY(validate_pointer_access(arguments->address_len, sizeof(*arguments->address_len)));
TRY(validate_pointer_access(arguments->address, *arguments->address_len));
TRY(validate_pointer_access(arguments->address_len, sizeof(*arguments->address_len), true));
TRY(validate_pointer_access(arguments->address, *arguments->address_len, true));
}
auto inode = TRY(m_open_file_descriptors.inode_of(arguments->socket));
@ -1309,17 +1309,17 @@ namespace Kernel
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(arguments, sizeof(sys_pselect_t)));
TRY(validate_pointer_access(arguments, sizeof(sys_pselect_t), false));
if (arguments->readfds)
TRY(validate_pointer_access(arguments->readfds, sizeof(fd_set)));
TRY(validate_pointer_access(arguments->readfds, sizeof(fd_set), true));
if (arguments->writefds)
TRY(validate_pointer_access(arguments->writefds, sizeof(fd_set)));
TRY(validate_pointer_access(arguments->writefds, sizeof(fd_set), true));
if (arguments->errorfds)
TRY(validate_pointer_access(arguments->errorfds, sizeof(fd_set)));
TRY(validate_pointer_access(arguments->errorfds, sizeof(fd_set), true));
if (arguments->timeout)
TRY(validate_pointer_access(arguments->timeout, sizeof(timespec)));
TRY(validate_pointer_access(arguments->timeout, sizeof(timespec), false));
if (arguments->sigmask)
TRY(validate_pointer_access(arguments->sigmask, sizeof(sigset_t)));
TRY(validate_pointer_access(arguments->sigmask, sizeof(sigset_t), false));
if (arguments->sigmask)
return BAN::Error::from_errno(ENOTSUP);
@ -1399,7 +1399,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_pipe(int fildes[2])
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(fildes, sizeof(int) * 2));
TRY(validate_pointer_access(fildes, sizeof(int) * 2, true));
TRY(m_open_file_descriptors.pipe(fildes));
return 0;
}
@ -1456,7 +1456,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_fstat(int fd, struct stat* buf)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(buf, sizeof(struct stat)));
TRY(validate_pointer_access(buf, sizeof(struct stat), true));
TRY(m_open_file_descriptors.fstat(fd, buf));
return 0;
}
@ -1464,7 +1464,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_fstatat(int fd, const char* path, struct stat* buf, int flag)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(buf, sizeof(struct stat)));
TRY(validate_pointer_access(buf, sizeof(struct stat), true));
TRY(m_open_file_descriptors.fstatat(fd, path, buf, flag));
return 0;
}
@ -1472,7 +1472,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_stat(const char* path, struct stat* buf, int flag)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(buf, sizeof(struct stat)));
TRY(validate_pointer_access(buf, sizeof(struct stat), true));
TRY(m_open_file_descriptors.stat(TRY(absolute_path_of(path)), buf, flag));
return 0;
}
@ -1481,7 +1481,7 @@ namespace Kernel
{
LockGuard _(m_process_lock);
TRY(validate_string_access(path));
TRY(validate_pointer_access(buffer, PATH_MAX));
TRY(validate_pointer_access(buffer, PATH_MAX, true));
auto absolute_path = TRY(absolute_path_of(path));
@ -1534,7 +1534,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_readdir(int fd, struct dirent* list, size_t list_len)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(list, sizeof(dirent) * list_len));
TRY(validate_pointer_access(list, sizeof(dirent) * list_len, true));
return TRY(m_open_file_descriptors.read_dir_entries(fd, list, list_len));
}
@ -1562,7 +1562,7 @@ namespace Kernel
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(buffer, size));
TRY(validate_pointer_access(buffer, size, true));
if (size < m_working_directory.size() + 1)
return BAN::Error::from_errno(ERANGE);
@ -1577,7 +1577,7 @@ namespace Kernel
{
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(args, sizeof(sys_mmap_t)));
TRY(validate_pointer_access(args, sizeof(sys_mmap_t), true));
}
if (args->prot != PROT_NONE && (args->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)))
@ -1745,7 +1745,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_ttyname(int fildes, char* storage)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(storage, TTY_NAME_MAX));
TRY(validate_pointer_access(storage, TTY_NAME_MAX, true));
auto inode = TRY(m_open_file_descriptors.inode_of(fildes));
if (!inode->is_tty())
return BAN::Error::from_errno(ENOTTY);
@ -1794,7 +1794,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_ptsname(int fildes, char* buffer, size_t buffer_len)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(buffer, buffer_len));
TRY(validate_pointer_access(buffer, buffer_len, true));
auto inode = TRY(m_open_file_descriptors.inode_of(fildes));
if (TRY(m_open_file_descriptors.path_of(fildes)) != "<ptmx>"_sv)
@ -1846,7 +1846,7 @@ namespace Kernel
{
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(tp, sizeof(timespec)));
TRY(validate_pointer_access(tp, sizeof(timespec), true));
}
switch (clock_id)
@ -1925,9 +1925,9 @@ namespace Kernel
LockGuard _(m_process_lock);
if (act)
TRY(validate_pointer_access(act, sizeof(struct sigaction)));
TRY(validate_pointer_access(act, sizeof(struct sigaction), false));
if (oact)
TRY(validate_pointer_access(oact, sizeof(struct sigaction)));
TRY(validate_pointer_access(oact, sizeof(struct sigaction), true));
SpinLockGuard signal_lock_guard(m_signal_lock);
@ -1947,7 +1947,7 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_sigpending(sigset_t* set)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(set, sizeof(sigset_t)));
TRY(validate_pointer_access(set, sizeof(sigset_t), true));
*set = (signal_pending_mask() | Thread::current().m_signal_pending_mask) & Thread::current().m_signal_block_mask;
return 0;
}
@ -1956,9 +1956,9 @@ namespace Kernel
{
LockGuard _(m_process_lock);
if (set)
TRY(validate_pointer_access(set, sizeof(sigset_t)));
TRY(validate_pointer_access(set, sizeof(sigset_t), false));
if (oset)
TRY(validate_pointer_access(oset, sizeof(sigset_t)));
TRY(validate_pointer_access(oset, sizeof(sigset_t), true));
if (oset)
*oset = Thread::current().m_signal_block_mask;
@ -2342,10 +2342,10 @@ namespace Kernel
{
// NOTE: we will page fault here, if str is not actually mapped
// outcome is still the same; SIGSEGV
return validate_pointer_access(str, strlen(str) + 1);
return validate_pointer_access(str, strlen(str) + 1, false);
}
BAN::ErrorOr<void> Process::validate_pointer_access_check(const void* ptr, size_t size)
BAN::ErrorOr<void> Process::validate_pointer_access_check(const void* ptr, size_t size, bool needs_write)
{
ASSERT(&Process::current() == this);
auto& thread = Thread::current();
@ -2368,10 +2368,15 @@ namespace Kernel
// FIXME: should we allow cross mapping access?
for (auto& mapped_region : m_mapped_regions)
if (mapped_region->contains_fully(vaddr, size))
return {};
{
if (!mapped_region->contains_fully(vaddr, size))
continue;
if (needs_write && !mapped_region->writable())
goto unauthorized_access;
return {};
}
// FIXME: elf should contain full range [vaddr, vaddr + size)
// FIXME: elf should use MemoryRegions instead of mapping executables itself
if (m_loadable_elf->contains(vaddr))
return {};
@ -2382,11 +2387,11 @@ unauthorized_access:
return BAN::Error::from_errno(EINTR);
}
BAN::ErrorOr<void> Process::validate_pointer_access(const void* ptr, size_t size)
BAN::ErrorOr<void> Process::validate_pointer_access(const void* ptr, size_t size, bool needs_write)
{
// TODO: This seems very slow as we loop over the range twice
TRY(validate_pointer_access_check(ptr, size));
TRY(validate_pointer_access_check(ptr, size, needs_write));
const vaddr_t vaddr = reinterpret_cast<vaddr_t>(ptr);
@ -2399,7 +2404,7 @@ unauthorized_access:
const vaddr_t current = page_start + i * PAGE_SIZE;
if (page_table().get_page_flags(current) & PageTable::Flags::Present)
continue;
TRY(Process::allocate_page_for_demand_paging(current));
TRY(Process::allocate_page_for_demand_paging(current, needs_write));
}
return {};