diff --git a/kernel/include/kernel/Process.h b/kernel/include/kernel/Process.h index 93e876af..2db360ec 100644 --- a/kernel/include/kernel/Process.h +++ b/kernel/include/kernel/Process.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -272,17 +273,20 @@ namespace Kernel }; // Adds new region to the sorted array of mapped regions + // You must hold writer end of m_mapped_region_lock when calling this. BAN::ErrorOr add_mapped_region(BAN::UniqPtr&&); // If address is contained by a region, returns the index of that. // Otherwise returns the address of the first region after this address. + // You must hold reader end of m_mapped_region_lock when calling this. size_t find_mapped_region(vaddr_t) const; BAN::ErrorOr find_file(int fd, const char* path, int flags) const; BAN::ErrorOr find_parent_file(int fd, const char* path, int flags) const; BAN::ErrorOr find_relative_parent(int fd, const char* path) const; - BAN::ErrorOr validate_string_access(const char*); - BAN::ErrorOr validate_pointer_access(const void*, size_t, bool needs_write); + BAN::ErrorOr read_from_user(const void* user_addr, void* out, size_t size); + BAN::ErrorOr read_string_from_user(const char* user_addr, char* out, size_t max_size); + BAN::ErrorOr write_to_user(void* user_addr, const void* in, size_t size); BAN::ErrorOr validate_and_pin_pointer_access(const void*, size_t, bool needs_write); uint64_t signal_pending_mask() const @@ -327,6 +331,7 @@ namespace Kernel OpenFileDescriptorSet m_open_file_descriptors; + mutable RWLock m_memory_region_lock; BAN::Vector> m_mapped_regions; pid_t m_sid; diff --git a/kernel/include/kernel/Thread.h b/kernel/include/kernel/Thread.h index 2f163dbe..8dfde905 100644 --- a/kernel/include/kernel/Thread.h +++ b/kernel/include/kernel/Thread.h @@ -16,6 +16,7 @@ namespace Kernel { + class MemoryBackedRegion; class Process; class Thread @@ -103,9 +104,7 @@ namespace Kernel vaddr_t kernel_stack_top() const { return m_kernel_stack->vaddr() + m_kernel_stack->size(); } VirtualRange& kernel_stack() { return *m_kernel_stack; } - vaddr_t userspace_stack_bottom() const { return is_userspace() ? m_userspace_stack->vaddr() : UINTPTR_MAX; } - vaddr_t userspace_stack_top() const { return is_userspace() ? m_userspace_stack->vaddr() + m_userspace_stack->size() : 0; } - VirtualRange& userspace_stack() { ASSERT(is_userspace()); return *m_userspace_stack; } + MemoryBackedRegion& userspace_stack() { ASSERT(is_userspace() && m_userspace_stack); return *m_userspace_stack; } static Thread& current(); static pid_t current_tid(); @@ -129,7 +128,7 @@ namespace Kernel void set_gsbase(vaddr_t base) { m_gsbase = base; } vaddr_t get_gsbase() const { return m_gsbase; } - size_t virtual_page_count() const { return (m_kernel_stack ? (m_kernel_stack->size() / PAGE_SIZE) : 0) + (m_userspace_stack ? (m_userspace_stack->size() / PAGE_SIZE) : 0); } + size_t virtual_page_count() const { return m_kernel_stack ? (m_kernel_stack->size() / PAGE_SIZE) : 0; } size_t physical_page_count() const { return virtual_page_count(); } YieldRegisters& yield_registers() { return m_yield_registers; } @@ -160,7 +159,7 @@ namespace Kernel BAN::UniqPtr m_keep_alive_page_table; BAN::UniqPtr m_kernel_stack; - BAN::UniqPtr m_userspace_stack; + MemoryBackedRegion* m_userspace_stack { nullptr }; const pid_t m_tid { 0 }; State m_state { State::NotStarted }; Process* m_process { nullptr }; diff --git a/kernel/kernel/Memory/VirtualRange.cpp b/kernel/kernel/Memory/VirtualRange.cpp index 45e1e452..32ec4ec7 100644 --- a/kernel/kernel/Memory/VirtualRange.cpp +++ b/kernel/kernel/Memory/VirtualRange.cpp @@ -148,12 +148,12 @@ namespace Kernel if (m_preallocated) return false; + SpinLockGuard _(m_lock); + const size_t index = (vaddr - this->vaddr()) / PAGE_SIZE; if (m_paddrs[index]) return false; - SpinLockGuard _(m_lock); - m_paddrs[index] = Heap::get().take_free_page(); if (m_paddrs[index] == 0) return BAN::Error::from_errno(ENOMEM); diff --git a/kernel/kernel/Process.cpp b/kernel/kernel/Process.cpp index 4702ef3e..eaf87d5e 100644 --- a/kernel/kernel/Process.cpp +++ b/kernel/kernel/Process.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -125,7 +126,9 @@ namespace Kernel auto executable_inode = executable_file.inode; auto executable = TRY(ELF::load_from_inode(process->m_root_file.inode, executable_inode, process->m_credentials, process->page_table())); - process->m_mapped_regions = BAN::move(executable.regions); + for (auto& region : executable.regions) + TRY(process->add_mapped_region(BAN::move(region))); + executable.regions.clear(); TRY(process->m_executable.append(executable_file.canonical_path)); @@ -250,6 +253,7 @@ namespace Kernel ProcFileSystem::get().on_process_delete(*this); m_process_lock.lock(); + m_memory_region_lock.wr_lock(); m_open_file_descriptors.close_all(); @@ -261,6 +265,17 @@ namespace Kernel bool Process::on_thread_exit(Thread& thread) { + { + RWLockWRGuard _(m_memory_region_lock); + + const size_t index = find_mapped_region(thread.userspace_stack().vaddr()); + ASSERT(m_mapped_regions[index].ptr() == thread.m_userspace_stack); + + m_mapped_regions.remove(index); + + thread.m_userspace_stack = nullptr; + } + LockGuard _(m_process_lock); ASSERT(m_threads.size() > 0); @@ -468,6 +483,9 @@ namespace Kernel meminfo.virt_pages += thread->virtual_page_count(); meminfo.phys_pages += thread->physical_page_count(); } + } + { + RWLockRDGuard _(m_memory_region_lock); for (auto& region : m_mapped_regions) { meminfo.virt_pages += region->virtual_page_count(); @@ -534,7 +552,7 @@ namespace Kernel BAN::ErrorOr Process::find_file(int fd, const char* path, int flags) const { - ASSERT(m_process_lock.is_locked()); + LockGuard _(m_process_lock); auto parent_file = TRY(find_relative_parent(fd, path)); auto file = path @@ -546,7 +564,7 @@ namespace Kernel BAN::ErrorOr Process::find_parent_file(int fd, const char* path, int flags) const { - ASSERT(m_process_lock.is_locked()); + LockGuard _(m_process_lock); if (path && path[0] == '\0') return BAN::Error::from_errno(ENOENT); @@ -607,36 +625,34 @@ namespace Kernel ASSERT_NOT_REACHED(); } - BAN::ErrorOr Process::sys_tcgetattr(int fildes, termios* termios) + BAN::ErrorOr Process::sys_tcgetattr(int fildes, termios* user_termios) { - LockGuard _(m_process_lock); - - TRY(validate_pointer_access(termios, sizeof(struct termios), true)); - auto inode = TRY(m_open_file_descriptors.inode_of(fildes)); if (!inode->is_tty()) return BAN::Error::from_errno(ENOTTY); - static_cast(inode.ptr())->get_termios(termios); + struct termios termios; + static_cast(inode.ptr())->get_termios(&termios); + + TRY(write_to_user(user_termios, &termios, sizeof(struct termios))); return 0; } - BAN::ErrorOr Process::sys_tcsetattr(int fildes, int optional_actions, const termios* termios) + BAN::ErrorOr Process::sys_tcsetattr(int fildes, int optional_actions, const termios* user_termios) { //if (optional_actions != TCSANOW) // return BAN::Error::from_errno(EINVAL); (void)optional_actions; - LockGuard _(m_process_lock); - - TRY(validate_pointer_access(termios, sizeof(struct termios), false)); - auto inode = TRY(m_open_file_descriptors.inode_of(fildes)); if (!inode->is_tty()) return BAN::Error::from_errno(ENOTTY); - TRY(static_cast(inode.ptr())->set_termios(termios)); + termios termios; + TRY(read_from_user(user_termios, &termios, sizeof(struct termios))); + + TRY(static_cast(inode.ptr())->set_termios(&termios)); // FIXME: SIGTTOU @@ -687,9 +703,12 @@ namespace Kernel TRY(open_file_descriptors->clone_from(m_open_file_descriptors)); BAN::Vector> mapped_regions; - TRY(mapped_regions.reserve(m_mapped_regions.size())); - for (auto& mapped_region : m_mapped_regions) - MUST(mapped_regions.push_back(TRY(mapped_region->clone(*page_table)))); + { + RWLockRDGuard _(m_memory_region_lock); + TRY(mapped_regions.reserve(m_mapped_regions.size())); + for (auto& mapped_region : m_mapped_regions) + MUST(mapped_regions.push_back(TRY(mapped_region->clone(*page_table)))); + } const vaddr_t shared_page_vaddr = m_shared_page_vaddr; page_table->map_page_at( @@ -725,7 +744,7 @@ namespace Kernel return forked->pid(); } - BAN::ErrorOr Process::sys_exec(const char* path, const char* const* argv, const char* const* envp) + BAN::ErrorOr Process::sys_exec(const char* user_path, const char* const* user_argv, const char* const* user_envp) { // NOTE: We scope everything for automatic deletion { @@ -733,24 +752,38 @@ namespace Kernel auto new_page_table = BAN::UniqPtr::adopt(TRY(PageTable::create_userspace())); - TRY(validate_string_access(path)); + char path[PATH_MAX]; + TRY(read_string_from_user(user_path, path, PATH_MAX)); + + BAN::Vector arg_buffer; + TRY(arg_buffer.resize(ARG_MAX)); BAN::Vector str_argv; - for (int i = 0; argv && argv[i]; i++) + for (int i = 0; user_argv; i++) { - TRY(validate_pointer_access(argv + i, sizeof(char*), false)); - TRY(validate_string_access(argv[i])); - TRY(str_argv.emplace_back()); - TRY(str_argv.back().append(argv[i])); + char* argv_i; + TRY(read_from_user(user_argv + i, &argv_i, sizeof(char*))); + if (argv_i == nullptr) + break; + + BAN::String arg; + TRY(read_string_from_user(argv_i, arg_buffer.data(), arg_buffer.size())); + TRY(arg.append(BAN::StringView { arg_buffer.data() })); + TRY(str_argv.emplace_back(BAN::move(arg))); } BAN::Vector str_envp; - for (int i = 0; envp && envp[i]; i++) + for (int i = 0; user_envp; i++) { - TRY(validate_pointer_access(envp + i, sizeof(char*), false)); - TRY(validate_string_access(envp[i])); - TRY(str_envp.emplace_back()); - TRY(str_envp.back().append(envp[i])); + char* envp_i; + TRY(read_from_user(user_envp + i, &envp_i, sizeof(char*))); + if (envp_i == nullptr) + break; + + BAN::String env; + TRY(read_string_from_user(envp_i, arg_buffer.data(), arg_buffer.size())); + TRY(env.append(BAN::StringView { arg_buffer.data() })); + TRY(str_envp.emplace_back(BAN::move(env))); } auto executable_file = TRY(find_file(AT_FDCWD, path, O_EXEC)); @@ -801,7 +834,12 @@ namespace Kernel .a_un = { .a_val = 0 }, })); - auto* new_thread = TRY(Thread::create_userspace(this, *new_page_table)); + // This is ugly but thread insterts userspace stack to process' memory region + BAN::swap(m_mapped_regions, new_mapped_regions); + auto new_thread_or_error = Thread::create_userspace(this, *new_page_table); + BAN::swap(m_mapped_regions, new_mapped_regions); + + auto* new_thread = TRY(new_thread_or_error); TRY(new_thread->initialize_userspace( executable.entry_point, str_argv.span(), @@ -822,6 +860,12 @@ namespace Kernel #endif } + BAN::sort::sort(new_mapped_regions.begin(), new_mapped_regions.end(), [](auto& a, auto& b) { + return a->vaddr() < b->vaddr(); + }); + + RWLockWRGuard wr_guard(m_memory_region_lock); + // NOTE: this is done before disabling interrupts and moving the threads as // shared filebacked mmap can write to disk on on clearing, this will lock // filesystem mutex which can yield @@ -882,7 +926,7 @@ namespace Kernel ASSERT_NOT_REACHED(); } - BAN::ErrorOr Process::sys_wait(pid_t pid, int* stat_loc, int options) + BAN::ErrorOr Process::sys_wait(pid_t pid, int* user_stat_loc, int options) { if (options & ~(WCONTINUED | WNOHANG | WUNTRACED)) return BAN::Error::from_errno(EINVAL); @@ -952,13 +996,8 @@ namespace Kernel TRY(Thread::current().block_or_eintr_indefinite(m_child_wait_blocker, &smutex)); } - LockGuard _(m_process_lock); - - if (stat_loc) - { - TRY(validate_pointer_access(stat_loc, sizeof(stat_loc), true)); - *stat_loc = child_status; - } + if (user_stat_loc != nullptr) + TRY(write_to_user(user_stat_loc, &child_status, sizeof(int))); remove_pending_signal(SIGCHLD); return child_pid; @@ -985,16 +1024,15 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_nanosleep(const timespec* rqtp, timespec* rmtp) + BAN::ErrorOr Process::sys_nanosleep(const timespec* user_rqtp, timespec* user_rmtp) { - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(rqtp, sizeof(timespec), false)); - if (rmtp) - TRY(validate_pointer_access(rmtp, sizeof(timespec), true)); - } + timespec rqtp; + TRY(read_from_user(user_rqtp, &rqtp, sizeof(timespec))); - const uint64_t sleep_ns = (rqtp->tv_sec * 1'000'000'000) + rqtp->tv_nsec; + if (rqtp.tv_nsec < 0 || rqtp.tv_nsec >= 1'000'000'000) + return BAN::Error::from_errno(EINVAL); + + const uint64_t sleep_ns = (rqtp.tv_sec * 1'000'000'000) + rqtp.tv_nsec; if (sleep_ns == 0) return 0; @@ -1004,12 +1042,13 @@ namespace Kernel const uint64_t current_ns = SystemTimer::get().ns_since_boot(); if (current_ns < wake_time_ns) { - if (rmtp) - { - const uint64_t remaining_ns = wake_time_ns - current_ns; - rmtp->tv_sec = remaining_ns / 1'000'000'000; - rmtp->tv_nsec = remaining_ns % 1'000'000'000; - } + const uint64_t remaining_ns = wake_time_ns - current_ns; + const timespec remaining_ts = { + .tv_sec = static_cast(remaining_ns / 1'000'000'000), + .tv_nsec = static_cast(remaining_ns % 1'000'000'000), + }; + if (user_rmtp != nullptr) + TRY(write_to_user(user_rmtp, &remaining_ts, sizeof(timespec))); return BAN::Error::from_errno(EINTR); } @@ -1028,12 +1067,18 @@ namespace Kernel return BAN::Error::from_errno(EINVAL); } - LockGuard _(m_process_lock); + MemoryRegion* value_region = nullptr; + MemoryRegion* ovalue_region = nullptr; + BAN::ScopeGuard _([&] { + if (value_region) + value_region->unpin(); + if (ovalue_region) + ovalue_region->unpin(); + }); - if (value) - TRY(validate_pointer_access(value, sizeof(itimerval), false)); - if (ovalue) - TRY(validate_pointer_access(ovalue, sizeof(itimerval), true)); + value_region = TRY(validate_and_pin_pointer_access(value, sizeof(itimerval), false)); + if (ovalue != nullptr) + ovalue_region = TRY(validate_and_pin_pointer_access(ovalue, sizeof(itimerval), true)); { SpinLockGuard _(s_process_lock); @@ -1158,38 +1203,6 @@ namespace Kernel return {}; } - BAN::ErrorOr Process::allocate_page_for_demand_paging(vaddr_t address, bool wants_write, bool wants_exec) - { - ASSERT(&Process::current() == this); - - LockGuard _(m_process_lock); - - { - // NOTE: page can be already allocated by another thread! - - PageTable::flags_t wanted_flags = PageTable::Flags::UserSupervisor | PageTable::Flags::Present; - if (wants_write) - wanted_flags |= PageTable::Flags::ReadWrite; - if (wants_exec) - wanted_flags |= PageTable::Flags::Execute; - if ((m_page_table->get_page_flags(address & PAGE_ADDR_MASK) & wanted_flags) == wanted_flags) - return true; - } - - if (Thread::current().userspace_stack().contains(address)) - return Thread::current().userspace_stack().allocate_page_for_demand_paging(address); - - const size_t index = find_mapped_region(address); - if (index >= m_mapped_regions.size()) - return false; - - auto& region = m_mapped_regions[index]; - if (!region->contains(address)) - return false; - - return region->allocate_page_containing(address, wants_write); - } - BAN::ErrorOr Process::open_inode(VirtualFileSystem::File&& file, int flags) { ASSERT(file.inode); @@ -1197,14 +1210,15 @@ namespace Kernel return TRY(m_open_file_descriptors.open(BAN::move(file), flags)); } - BAN::ErrorOr Process::sys_openat(int fd, const char* path, int flags, mode_t mode) + BAN::ErrorOr Process::sys_openat(int fd, const char* user_path, int flags, mode_t mode) { if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) return BAN::Error::from_errno(EINVAL); - LockGuard _(m_process_lock); + char path[PATH_MAX]; + TRY(read_string_from_user(user_path, path, PATH_MAX)); - TRY(validate_string_access(path)); + LockGuard _(m_process_lock); auto [parent, file_name] = TRY(find_parent_file(fd, path, O_RDONLY)); auto file_or_error = VirtualFileSystem::get().file_from_relative_path(m_root_file.inode, parent, m_credentials, file_name, flags); @@ -1259,7 +1273,7 @@ namespace Kernel // FIXME: buffer_region can be null as stack is not MemoryRegion auto* buffer_region = TRY(validate_and_pin_pointer_access(buffer, count, true)); - BAN::ScopeGuard _([&] { if (buffer_region) buffer_region->unpin(); }); + BAN::ScopeGuard _([buffer_region] { if (buffer_region) buffer_region->unpin(); }); return TRY(m_open_file_descriptors.read(fd, BAN::ByteSpan(static_cast(buffer), count))); } @@ -1273,11 +1287,11 @@ namespace Kernel // FIXME: buffer_region can be null as stack is not MemoryRegion auto* buffer_region = TRY(validate_and_pin_pointer_access(buffer, count, false)); - BAN::ScopeGuard _([&] { if (buffer_region) buffer_region->unpin(); }); + BAN::ScopeGuard _([buffer_region] { if (buffer_region) buffer_region->unpin(); }); return TRY(m_open_file_descriptors.write(fd, BAN::ConstByteSpan(static_cast(buffer), count))); } - BAN::ErrorOr Process::sys_access(const char* path, int amode) + BAN::ErrorOr Process::sys_access(const char* user_path, int amode) { int flags = 0; if (amode & F_OK) @@ -1290,8 +1304,10 @@ namespace Kernel flags |= O_EXEC; static_assert((O_RDONLY | O_WRONLY) == O_RDWR); + char path[PATH_MAX]; + TRY(read_string_from_user(user_path, path, PATH_MAX)); + LockGuard _(m_process_lock); - TRY(validate_string_access(path)); auto credentials = m_credentials; credentials.set_euid(credentials.ruid()); @@ -1303,30 +1319,41 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_create_dir(const char* path, mode_t mode) + BAN::ErrorOr Process::sys_create_dir(const char* user_path, mode_t mode) { - LockGuard _(m_process_lock); - TRY(validate_string_access(path)); + char path[PATH_MAX]; + TRY(read_string_from_user(user_path, path, PATH_MAX)); + + uid_t uid; + gid_t gid; + + { + LockGuard _(m_process_lock); + uid = m_credentials.euid(); + gid = m_credentials.egid(); + } auto [parent, file_name] = TRY(find_parent_file(AT_FDCWD, path, O_WRONLY)); - TRY(parent.inode->create_directory(file_name, (mode & 0777) | Inode::Mode::IFDIR, m_credentials.euid(), m_credentials.egid())); + TRY(parent.inode->create_directory(file_name, (mode & 0777) | Inode::Mode::IFDIR, uid, gid)); return 0; } - BAN::ErrorOr Process::sys_hardlinkat(int fd1, const char* path1, int fd2, const char* path2, int flag) + BAN::ErrorOr Process::sys_hardlinkat(int fd1, const char* user_path1, int fd2, const char* user_path2, int flag) { - LockGuard _(m_process_lock); - if (path1 != nullptr) - TRY(validate_string_access(path1)); - if (path2 != nullptr) - TRY(validate_string_access(path2)); + char path1[PATH_MAX]; + if (user_path1 != nullptr) + TRY(read_string_from_user(user_path1, path1, PATH_MAX)); - auto inode = TRY(find_file(fd1, path1, flag)).inode; + char path2[PATH_MAX]; + if (user_path2 != nullptr) + TRY(read_string_from_user(user_path2, path2, PATH_MAX)); + + auto inode = TRY(find_file(fd1, user_path1 ? path1 : nullptr, flag)).inode; if (inode->mode().ifdir()) return BAN::Error::from_errno(EISDIR); - auto [parent, file_name] = TRY(find_parent_file(fd2, path2, O_WRONLY)); + auto [parent, file_name] = TRY(find_parent_file(fd2, user_path2 ? path2 : nullptr, O_WRONLY)); if (!parent.inode->mode().ifdir()) return BAN::Error::from_errno(ENOTDIR); @@ -1335,19 +1362,21 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_renameat(int oldfd, const char* old, int newfd, const char* _new) + BAN::ErrorOr Process::sys_renameat(int oldfd, const char* user_path_old, int newfd, const char* user_path_new) { - LockGuard _(m_process_lock); - if (old != nullptr) - TRY(validate_string_access(old)); - if (_new != nullptr) - TRY(validate_string_access(_new)); + char path_old[PATH_MAX]; + if (user_path_old != nullptr) + TRY(read_string_from_user(user_path_old, path_old, PATH_MAX)); - auto [old_parent, old_name] = TRY(find_parent_file(oldfd, old, O_WRONLY)); + char path_new[PATH_MAX]; + if (user_path_new != nullptr) + TRY(read_string_from_user(user_path_new, path_new, PATH_MAX)); + + auto [old_parent, old_name] = TRY(find_parent_file(oldfd, user_path_old ? path_old : nullptr, O_WRONLY)); if (!old_parent.inode->mode().ifdir()) return BAN::Error::from_errno(ENOTDIR); - auto [new_parent, new_name] = TRY(find_parent_file(newfd, _new, O_WRONLY)); + auto [new_parent, new_name] = TRY(find_parent_file(newfd, user_path_new ? path_new : nullptr, O_WRONLY)); if (!new_parent.inode->mode().ifdir()) return BAN::Error::from_errno(ENOTDIR); @@ -1356,16 +1385,17 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_unlinkat(int fd, const char* path, int flag) + BAN::ErrorOr Process::sys_unlinkat(int fd, const char* user_path, int flag) { if (flag && flag != AT_REMOVEDIR) return BAN::Error::from_errno(EINVAL); - LockGuard _(m_process_lock); - if (path != nullptr) - TRY(validate_string_access(path)); + char path[PATH_MAX]; + if (user_path != nullptr) + TRY(read_string_from_user(user_path, path, PATH_MAX)); - auto [parent, file_name] = TRY(find_parent_file(fd, path, O_WRONLY)); + + auto [parent, file_name] = TRY(find_parent_file(fd, user_path ? path : nullptr, O_WRONLY)); const auto inode = TRY(parent.inode->find_inode(file_name)); @@ -1373,44 +1403,48 @@ namespace Kernel return BAN::Error::from_errno(flag ? EPERM : ENOTDIR); if (parent.inode->mode().mode & Inode::Mode::ISVTX) + { + LockGuard _(m_process_lock); if (m_credentials.ruid() != parent.inode->uid() && m_credentials.ruid() != inode->uid()) return BAN::Error::from_errno(EPERM); + } TRY(parent.inode->unlink(file_name)); return 0; } - BAN::ErrorOr Process::sys_readlinkat(int fd, const char* path, char* buffer, size_t bufsize) + BAN::ErrorOr Process::sys_readlinkat(int fd, const char* user_path, char* buffer, size_t bufsize) { - LockGuard _(m_process_lock); - if (path != nullptr) - TRY(validate_string_access(path)); - TRY(validate_pointer_access(buffer, bufsize, true)); + char path[PATH_MAX]; + if (user_path != nullptr) + TRY(read_string_from_user(user_path, path, PATH_MAX)); - auto inode = TRY(find_file(fd, path, O_NOFOLLOW | O_RDONLY)).inode; + auto inode = TRY(find_file(fd, user_path ? path : nullptr, O_NOFOLLOW | O_RDONLY)).inode; - // FIXME: no allocation needed auto link_target = TRY(inode->link_target()); const size_t byte_count = BAN::Math::min(link_target.size(), bufsize); - memcpy(buffer, link_target.data(), byte_count); + TRY(write_to_user(buffer, link_target.data(), byte_count)); + return byte_count; } - BAN::ErrorOr Process::sys_symlinkat(const char* path1, int fd, const char* path2) + BAN::ErrorOr Process::sys_symlinkat(const char* user_path1, int fd, const char* user_path2) { - LockGuard _(m_process_lock); - TRY(validate_string_access(path1)); - if (path2 != nullptr) - TRY(validate_string_access(path2)); + char path1[PATH_MAX]; + TRY(read_string_from_user(user_path1, path1, PATH_MAX)); - if (!find_file(fd, path2, O_NOFOLLOW).is_error()) + char path2[PATH_MAX]; + if (user_path2 != nullptr) + TRY(read_string_from_user(user_path2, path2, PATH_MAX)); + + if (!find_file(fd, user_path2 ? path2 : nullptr, O_NOFOLLOW).is_error()) return BAN::Error::from_errno(EEXIST); - TRY(create_file_or_dir(fd, path2, 0777 | Inode::Mode::IFLNK)); + TRY(create_file_or_dir(fd, user_path2 ? path2 : nullptr, 0777 | Inode::Mode::IFLNK)); - auto symlink = TRY(find_file(fd, path2, O_NOFOLLOW)); + auto symlink = TRY(find_file(fd, user_path2 ? path2 : nullptr, O_NOFOLLOW)); TRY(symlink.inode->set_link_target(path1)); return 0; @@ -1431,7 +1465,7 @@ namespace Kernel auto inode = TRY(m_open_file_descriptors.inode_of(fd)); auto* buffer_region = TRY(validate_and_pin_pointer_access(buffer, count, true)); - BAN::ScopeGuard _([buffer_region]{ if (buffer_region) buffer_region->unpin(); }); + BAN::ScopeGuard _([buffer_region] { if (buffer_region) buffer_region->unpin(); }); return TRY(inode->read(offset, { reinterpret_cast(buffer), count })); } @@ -1441,27 +1475,30 @@ namespace Kernel auto inode = TRY(m_open_file_descriptors.inode_of(fd)); auto* buffer_region = TRY(validate_and_pin_pointer_access(buffer, count, false)); - BAN::ScopeGuard _([buffer_region]{ if (buffer_region) buffer_region->unpin(); }); + BAN::ScopeGuard _([buffer_region] { if (buffer_region) buffer_region->unpin(); }); return TRY(inode->write(offset, { reinterpret_cast(buffer), count })); } - BAN::ErrorOr Process::sys_fchmodat(int fd, const char* path, mode_t mode, int flag) + BAN::ErrorOr Process::sys_fchmodat(int fd, const char* user_path, mode_t mode, int flag) { if (flag & ~AT_SYMLINK_NOFOLLOW) return BAN::Error::from_errno(EINVAL); if (flag == AT_SYMLINK_NOFOLLOW) flag = O_NOFOLLOW; - LockGuard _(m_process_lock); - if (path != nullptr) - TRY(validate_string_access(path)); + char path[PATH_MAX]; + if (user_path != nullptr) + TRY(read_string_from_user(user_path, path, PATH_MAX)); - auto inode = TRY(find_file(fd, path, flag)).inode; + auto inode = TRY(find_file(fd, user_path ? path : nullptr, flag)).inode; - if (!m_credentials.is_superuser() && inode->uid() != m_credentials.euid()) { - dwarnln("cannot chmod uid {} vs {}", inode->uid(), m_credentials.euid()); - return BAN::Error::from_errno(EPERM); + LockGuard _(m_process_lock); + if (!m_credentials.is_superuser() && inode->uid() != m_credentials.euid()) + { + dwarnln("cannot chmod uid {} vs {}", inode->uid(), m_credentials.euid()); + return BAN::Error::from_errno(EPERM); + } } TRY(inode->chmod(mode & ~S_IFMASK)); @@ -1469,23 +1506,26 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_fchownat(int fd, const char* path, uid_t uid, gid_t gid, int flag) + BAN::ErrorOr Process::sys_fchownat(int fd, const char* user_path, uid_t uid, gid_t gid, int flag) { if (flag & ~AT_SYMLINK_NOFOLLOW) return BAN::Error::from_errno(EINVAL); if (flag == AT_SYMLINK_NOFOLLOW) flag = O_NOFOLLOW; - LockGuard _(m_process_lock); - if (path != nullptr) - TRY(validate_string_access(path)); + char path[PATH_MAX]; + if (user_path != nullptr) + TRY(read_string_from_user(user_path, path, PATH_MAX)); - auto inode = TRY(find_file(fd, path, flag)).inode; + auto inode = TRY(find_file(fd, user_path ? path : nullptr, flag)).inode; - if (uid != -1 && !m_credentials.is_superuser()) - return BAN::Error::from_errno(EPERM); - if (gid != -1 && !m_credentials.is_superuser() && (m_credentials.euid() != uid || !m_credentials.has_egid(gid))) - return BAN::Error::from_errno(EPERM); + { + LockGuard _(m_process_lock); + if (uid != -1 && !m_credentials.is_superuser()) + return BAN::Error::from_errno(EPERM); + if (gid != -1 && !m_credentials.is_superuser() && (m_credentials.euid() != uid || !m_credentials.has_egid(gid))) + return BAN::Error::from_errno(EPERM); + } if (uid == -1) uid = inode->uid(); @@ -1496,49 +1536,57 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_utimensat(int fd, const char* path, const struct timespec _times[2], int flag) + BAN::ErrorOr Process::sys_utimensat(int fd, const char* user_path, const struct timespec user_times[2], int flag) { if (flag & ~AT_SYMLINK_NOFOLLOW) return BAN::Error::from_errno(EINVAL); if (flag == AT_SYMLINK_NOFOLLOW) flag = O_NOFOLLOW; - timespec times[2]; const uint64_t current_ns = SystemTimer::get().ns_since_boot(); - times[0] = times[1] = timespec { + const timespec current_ts = { .tv_sec = static_cast(current_ns / 1'000'000), .tv_nsec = static_cast(current_ns % 1'000'000), }; - if (_times) + timespec times[2]; + if (user_times == nullptr) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(_times, sizeof(timespec) * 2, false)); - if (_times[0].tv_nsec != UTIME_NOW) + times[0] = current_ts; + times[1] = current_ts; + } + else + { + TRY(read_from_user(user_times, times, 2 * sizeof(timespec))); + + for (size_t i = 0; i < 2; i++) { - times[0] = _times[0]; - if (auto ns = times[0].tv_nsec; ns != UTIME_OMIT && (ns < 0 || ns >= 1'000'000'000)) - return BAN::Error::from_errno(EINVAL); - } - if (_times[1].tv_nsec != UTIME_NOW) - { - times[1] = _times[1]; - if (auto ns = times[1].tv_nsec; ns != UTIME_OMIT && (ns < 0 || ns >= 1'000'000'000)) + if (times[i].tv_nsec == UTIME_OMIT) + ; + else if (times[i].tv_nsec == UTIME_NOW) + times[i] = current_ts; + else if (times[i].tv_nsec < 0 || times[i].tv_nsec >= 1'000'000'000) return BAN::Error::from_errno(EINVAL); } + } if (times[0].tv_nsec == UTIME_OMIT && times[1].tv_nsec == UTIME_OMIT) return 0; - LockGuard _(m_process_lock); + char path[PATH_MAX]; + if (user_path != nullptr) + TRY(read_string_from_user(user_path, path, PATH_MAX)); - auto inode = TRY(find_file(fd, path, flag)).inode; + auto inode = TRY(find_file(fd, user_path ? path : nullptr, flag)).inode; - if (!m_credentials.is_superuser() && inode->uid() != m_credentials.euid()) { - dwarnln("cannot chmod uid {} vs {}", inode->uid(), m_credentials.euid()); - return BAN::Error::from_errno(EPERM); + LockGuard _(m_process_lock); + if (!m_credentials.is_superuser() && inode->uid() != m_credentials.euid()) + { + dwarnln("cannot chmod uid {} vs {}", inode->uid(), m_credentials.euid()); + return BAN::Error::from_errno(EPERM); + } } TRY(inode->utimens(times)); @@ -1548,80 +1596,120 @@ namespace Kernel BAN::ErrorOr Process::sys_socket(int domain, int type, int protocol) { - LockGuard _(m_process_lock); return TRY(m_open_file_descriptors.socket(domain, type, protocol)); } - BAN::ErrorOr Process::sys_socketpair(int domain, int type, int protocol, int socket_vector[2]) + BAN::ErrorOr Process::sys_socketpair(int domain, int type, int protocol, int user_socket_vector[2]) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(socket_vector, sizeof(int) * 2, true)); + int socket_vector[2]; TRY(m_open_file_descriptors.socketpair(domain, type, protocol, socket_vector)); - return 0; - } - BAN::ErrorOr Process::sys_getsockname(int socket, sockaddr* address, socklen_t* address_len) - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(address_len, sizeof(address_len), true)); - TRY(validate_pointer_access(address, *address_len, true)); - - auto inode = TRY(m_open_file_descriptors.inode_of(socket)); - if (!inode->mode().ifsock()) - return BAN::Error::from_errno(ENOTSOCK); - - TRY(inode->getsockname(address, address_len)); - return 0; - } - - BAN::ErrorOr Process::sys_getpeername(int socket, sockaddr* address, socklen_t* address_len) - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(address_len, sizeof(address_len), true)); - TRY(validate_pointer_access(address, *address_len, true)); - - auto inode = TRY(m_open_file_descriptors.inode_of(socket)); - if (!inode->mode().ifsock()) - return BAN::Error::from_errno(ENOTSOCK); - - TRY(inode->getpeername(address, address_len)); - return 0; - } - - BAN::ErrorOr Process::sys_getsockopt(int socket, int level, int option_name, void* option_value, socklen_t* option_len) - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(option_len, sizeof(option_len), true)); - TRY(validate_pointer_access(option_value, *option_len, true)); - - auto inode = TRY(m_open_file_descriptors.inode_of(socket)); - if (!inode->mode().ifsock()) - return BAN::Error::from_errno(ENOTSOCK); - - // Because all networking is synchronous, there can not be errors to report - if (level == SOL_SOCKET && option_name == SO_ERROR) + if (auto ret = write_to_user(user_socket_vector, socket_vector, 2 * sizeof(int)); ret.is_error()) { - if (*option_len) - *reinterpret_cast(option_value) = 0; - *option_len = BAN::Math::min(*option_len, sizeof(int)); - return 0; + MUST(m_open_file_descriptors.close(socket_vector[0])); + MUST(m_open_file_descriptors.close(socket_vector[1])); + return ret.release_error(); } + return 0; + } + + BAN::ErrorOr Process::sys_getsockname(int socket, sockaddr* user_address, socklen_t* user_address_len) + { + socklen_t address_len; + TRY(read_from_user(user_address_len, &address_len, sizeof(socklen_t))); + if (address_len > static_cast(sizeof(sockaddr_storage))) + address_len = sizeof(sockaddr_storage); + + sockaddr_storage address; + TRY(read_from_user(user_address, &address, address_len)); + + auto inode = TRY(m_open_file_descriptors.inode_of(socket)); + if (!inode->mode().ifsock()) + return BAN::Error::from_errno(ENOTSOCK); + + TRY(inode->getsockname(reinterpret_cast(&address), &address_len)); + + TRY(write_to_user(user_address_len, &address_len, sizeof(socklen_t))); + TRY(write_to_user(user_address, &address, address_len)); + + return 0; + } + + BAN::ErrorOr Process::sys_getpeername(int socket, sockaddr* user_address, socklen_t* user_address_len) + { + socklen_t address_len; + TRY(read_from_user(user_address_len, &address_len, sizeof(socklen_t))); + if (address_len > static_cast(sizeof(sockaddr_storage))) + address_len = sizeof(sockaddr_storage); + + sockaddr_storage address; + TRY(read_from_user(user_address, &address, address_len)); + + auto inode = TRY(m_open_file_descriptors.inode_of(socket)); + if (!inode->mode().ifsock()) + return BAN::Error::from_errno(ENOTSOCK); + + TRY(inode->getpeername(reinterpret_cast(&address), &address_len)); + + TRY(write_to_user(user_address_len, &address_len, sizeof(socklen_t))); + TRY(write_to_user(user_address, &address, address_len)); + + return 0; + } + + BAN::ErrorOr Process::sys_getsockopt(int socket, int level, int option_name, void* user_option_value, socklen_t* user_option_len) + { + if (level != SOL_SOCKET) + { + dwarnln("{}: getsockopt level {}", name(), level); + return BAN::Error::from_errno(EINVAL); + } + + socklen_t option_len; + TRY(read_from_user(user_option_len, &option_len, sizeof(socklen_t))); + + if (option_len < 0) + return BAN::Error::from_errno(EINVAL); + + auto inode = TRY(m_open_file_descriptors.inode_of(socket)); + if (!inode->mode().ifsock()) + return BAN::Error::from_errno(ENOTSOCK); + + switch (option_name) + { + case SO_ERROR: + { + option_len = BAN::Math::min(option_len, sizeof(int)); + const int zero { 0 }; + TRY(write_to_user(user_option_value, &zero, option_len)); + TRY(write_to_user(user_option_len, &option_len, sizeof(socklen_t))); + return 0; + } + } + + dwarnln("getsockopt(SOL_SOCKET, {})", option_name); return BAN::Error::from_errno(ENOTSUP); } - BAN::ErrorOr Process::sys_setsockopt(int socket, int level, int option_name, const void* option_value, socklen_t option_len) + BAN::ErrorOr Process::sys_setsockopt(int socket, int level, int option_name, const void* user_option_value, socklen_t option_len) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(option_value, option_len, false)); + if (level != SOL_SOCKET) + { + dwarnln("{}: setsockopt level {}", name(), level); + return BAN::Error::from_errno(EINVAL); + } + + if (option_len < 0) + return BAN::Error::from_errno(EINVAL); auto inode = TRY(m_open_file_descriptors.inode_of(socket)); if (!inode->mode().ifsock()) return BAN::Error::from_errno(ENOTSOCK); - (void)level; - (void)option_name; + (void)user_option_value; + dwarnln("setsockopt(SOL_SOCKET, {})", option_name); return BAN::Error::from_errno(ENOTSUP); } @@ -1661,52 +1749,55 @@ namespace Kernel return TRY(inode->accept(address, address_len, open_flags)); } - BAN::ErrorOr Process::sys_bind(int socket, const sockaddr* address, socklen_t address_len) + BAN::ErrorOr Process::sys_bind(int socket, const sockaddr* user_address, socklen_t address_len) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(address, address_len, false)); + if (address_len > static_cast(sizeof(sockaddr_storage))) + address_len = sizeof(sockaddr_storage); + + sockaddr_storage address; + TRY(read_from_user(user_address, &address, address_len)); auto inode = TRY(m_open_file_descriptors.inode_of(socket)); if (!inode->mode().ifsock()) return BAN::Error::from_errno(ENOTSOCK); - TRY(inode->bind(address, address_len)); + TRY(inode->bind(reinterpret_cast(&address), address_len)); + return 0; } - BAN::ErrorOr Process::sys_connect(int socket, const sockaddr* address, socklen_t address_len) + BAN::ErrorOr Process::sys_connect(int socket, const sockaddr* user_address, socklen_t address_len) { + if (address_len > static_cast(sizeof(sockaddr_storage))) + address_len = sizeof(sockaddr_storage); + + sockaddr_storage address; + TRY(read_from_user(user_address, &address, address_len)); + auto inode = TRY(m_open_file_descriptors.inode_of(socket)); if (!inode->mode().ifsock()) return BAN::Error::from_errno(ENOTSOCK); - auto* address_region = TRY(validate_and_pin_pointer_access(address, address_len, true)); - BAN::ScopeGuard _([&] { if (address_region) address_region->unpin(); }); + TRY(inode->connect(reinterpret_cast(&address), address_len)); - TRY(inode->connect(address, address_len)); return 0; } BAN::ErrorOr Process::sys_listen(int socket, int backlog) { - LockGuard _(m_process_lock); - auto inode = TRY(m_open_file_descriptors.inode_of(socket)); if (!inode->mode().ifsock()) return BAN::Error::from_errno(ENOTSOCK); TRY(inode->listen(backlog)); + return 0; } - BAN::ErrorOr Process::sys_recvmsg(int socket, msghdr* _message, int flags) + BAN::ErrorOr Process::sys_recvmsg(int socket, msghdr* user_message, int flags) { msghdr message; - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(_message, sizeof(msghdr), true)); - message = *_message; - } + TRY(read_from_user(user_message, &message, sizeof(msghdr))); BAN::Vector regions; BAN::ScopeGuard _([®ions] { @@ -1726,25 +1817,17 @@ namespace Kernel TRY(regions.push_back(TRY(validate_and_pin_pointer_access(message.msg_iov[i].iov_base, message.msg_iov[i].iov_len, true)))); } - auto ret = TRY(m_open_file_descriptors.recvmsg(socket, message, flags)); + const auto ret = TRY(m_open_file_descriptors.recvmsg(socket, message, flags)); - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(_message, sizeof(msghdr), true)); - *_message = message; - } + TRY(write_to_user(user_message, &message, sizeof(msghdr))); return ret; } - BAN::ErrorOr Process::sys_sendmsg(int socket, const msghdr* _message, int flags) + BAN::ErrorOr Process::sys_sendmsg(int socket, const msghdr* user_message, int flags) { msghdr message; - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(_message, sizeof(msghdr), false)); - message = *_message; - } + TRY(read_from_user(user_message, &message, sizeof(msghdr))); BAN::Vector regions; BAN::ScopeGuard _([®ions] { @@ -1769,7 +1852,6 @@ namespace Kernel BAN::ErrorOr Process::sys_ioctl(int fildes, int request, void* arg) { - LockGuard _(m_process_lock); auto inode = TRY(m_open_file_descriptors.inode_of(fildes)); return TRY(inode->ioctl(request, arg)); } @@ -1777,12 +1859,7 @@ namespace Kernel BAN::ErrorOr Process::sys_pselect(sys_pselect_t* user_arguments) { sys_pselect_t arguments; - - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(user_arguments, sizeof(sys_pselect_t), false)); - arguments = *user_arguments; - } + TRY(read_from_user(user_arguments, &arguments, sizeof(sys_pselect_t))); MemoryRegion* readfd_region = nullptr; MemoryRegion* writefd_region = nullptr; @@ -1807,21 +1884,21 @@ namespace Kernel const auto old_sigmask = Thread::current().m_signal_block_mask; if (arguments.sigmask) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(arguments.sigmask, sizeof(sigset_t), false)); - Thread::current().m_signal_block_mask = *arguments.sigmask; + sigset_t sigmask; + TRY(read_from_user(user_arguments->sigmask, &sigmask, sizeof(sigset_t))); + Thread::current().m_signal_block_mask = sigmask; } BAN::ScopeGuard sigmask_restore([old_sigmask] { Thread::current().m_signal_block_mask = old_sigmask; }); uint64_t waketime_ns = BAN::numeric_limits::max(); if (arguments.timeout) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(arguments.timeout, sizeof(timespec), false)); + timespec timeout; + TRY(read_from_user(arguments.timeout, &timeout, sizeof(timespec))); waketime_ns = SystemTimer::get().ns_since_boot() + - (arguments.timeout->tv_sec * 1'000'000'000) + - arguments.timeout->tv_nsec; + timeout.tv_sec * 1'000'000'000 + + timeout.tv_nsec; } { @@ -1905,29 +1982,29 @@ namespace Kernel return return_value; } - BAN::ErrorOr Process::sys_ppoll(pollfd* fds, nfds_t nfds, const timespec* timeout, const sigset_t* sigmask) + BAN::ErrorOr Process::sys_ppoll(pollfd* fds, nfds_t nfds, const timespec* user_timeout, const sigset_t* user_sigmask) { auto* fds_region = TRY(validate_and_pin_pointer_access(fds, nfds * sizeof(pollfd), true)); BAN::ScopeGuard _([fds_region] { if (fds_region) fds_region->unpin(); }); const auto old_sigmask = Thread::current().m_signal_block_mask; - if (sigmask) + if (user_sigmask != nullptr) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(sigmask, sizeof(sigset_t), false)); - Thread::current().m_signal_block_mask = *sigmask; + sigset_t sigmask; + TRY(read_from_user(user_sigmask, &sigmask, sizeof(sigmask))); + Thread::current().m_signal_block_mask = sigmask; } BAN::ScopeGuard sigmask_restore([old_sigmask] { Thread::current().m_signal_block_mask = old_sigmask; }); uint64_t waketime_ns = BAN::numeric_limits::max(); - if (timeout) + if (user_timeout != nullptr) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(timeout, sizeof(timespec), false)); + timespec timeout; + TRY(read_from_user(user_timeout, &timeout, sizeof(timespec))); waketime_ns = SystemTimer::get().ns_since_boot() + - (timeout->tv_sec * 1'000'000'000) + - timeout->tv_nsec; + timeout.tv_sec * 1'000'000'000 + + timeout.tv_nsec; } size_t return_value = 0; @@ -2063,21 +2140,15 @@ namespace Kernel epoll_event event {}; if (user_event) - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(user_event, sizeof(epoll_event), false)); - event = *user_event; - } + TRY(read_from_user(user_event, &event, sizeof(epoll_event))); TRY(static_cast(epoll_inode.ptr())->ctl(op, fd, inode, event)); return 0; } - BAN::ErrorOr Process::sys_epoll_pwait2(int epfd, epoll_event* events, int maxevents, const timespec* timeout, const sigset_t* sigmask) + BAN::ErrorOr Process::sys_epoll_pwait2(int epfd, epoll_event* events, int maxevents, const timespec* user_timeout, const sigset_t* user_sigmask) { - (void)sigmask; - if (maxevents <= 0) return BAN::Error::from_errno(EINVAL); @@ -2086,14 +2157,14 @@ namespace Kernel return BAN::Error::from_errno(EINVAL); uint64_t waketime_ns = BAN::numeric_limits::max(); - if (timeout) + if (user_timeout != nullptr) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(timeout, sizeof(timespec), false)); + timespec timeout; + TRY(read_from_user(user_timeout, &timeout, sizeof(timespec))); waketime_ns = SystemTimer::get().ns_since_boot() + - (timeout->tv_sec * 1'000'000'000) + - timeout->tv_nsec; + timeout.tv_sec * 1'000'000'000 + + timeout.tv_nsec; } auto* events_region = TRY(validate_and_pin_pointer_access(events, maxevents * sizeof(epoll_event), true)); @@ -2103,22 +2174,29 @@ namespace Kernel }); const auto old_sigmask = Thread::current().m_signal_block_mask; - if (sigmask) + if (user_sigmask) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(sigmask, sizeof(sigset_t), false)); - Thread::current().m_signal_block_mask = *sigmask; + sigset_t sigmask; + TRY(read_from_user(user_sigmask, &sigmask, sizeof(sigset_t))); + Thread::current().m_signal_block_mask = sigmask; } BAN::ScopeGuard sigmask_restore([old_sigmask] { Thread::current().m_signal_block_mask = old_sigmask; }); return TRY(static_cast(epoll_inode.ptr())->wait(BAN::Span(events, maxevents), waketime_ns)); } - BAN::ErrorOr Process::sys_pipe(int fildes[2]) + BAN::ErrorOr Process::sys_pipe(int user_fildes[2]) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(fildes, sizeof(int) * 2, true)); + int fildes[2]; TRY(m_open_file_descriptors.pipe(fildes)); + + if (auto ret = write_to_user(user_fildes, fildes, 2 * sizeof(int)); ret.is_error()) + { + MUST(m_open_file_descriptors.close(fildes[0])); + MUST(m_open_file_descriptors.close(fildes[1])); + return ret.release_error(); + } + return 0; } @@ -2155,45 +2233,49 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_fstatat(int fd, const char* path, struct stat* buf, int flag) + BAN::ErrorOr Process::sys_fstatat(int fd, const char* user_path, struct stat* user_buf, int flag) { if (flag & ~AT_SYMLINK_NOFOLLOW) return BAN::Error::from_errno(EINVAL); if (flag == AT_SYMLINK_NOFOLLOW) flag = O_NOFOLLOW; - LockGuard _(m_process_lock); - if (path != nullptr) - TRY(validate_string_access(path)); - TRY(validate_pointer_access(buf, sizeof(struct stat), true)); + char path[PATH_MAX]; + if (user_path != nullptr) + TRY(read_string_from_user(user_path, path, PATH_MAX)); - auto inode = TRY(find_file(fd, path, flag)).inode; - buf->st_dev = inode->dev(); - buf->st_ino = inode->ino(); - buf->st_mode = inode->mode().mode; - buf->st_nlink = inode->nlink(); - buf->st_uid = inode->uid(); - buf->st_gid = inode->gid(); - buf->st_rdev = inode->rdev(); - buf->st_size = inode->size(); - buf->st_atim = inode->atime(); - buf->st_mtim = inode->mtime(); - buf->st_ctim = inode->ctime(); - buf->st_blksize = inode->blksize(); - buf->st_blocks = inode->blocks(); + auto inode = TRY(find_file(fd, user_path ? path : nullptr, flag)).inode; + + const struct stat buf { + .st_dev = inode->dev(), + .st_ino = inode->ino(), + .st_mode = inode->mode().mode, + .st_nlink = inode->nlink(), + .st_uid = inode->uid(), + .st_gid = inode->gid(), + .st_rdev = inode->rdev(), + .st_size = inode->size(), + .st_atim = inode->atime(), + .st_mtim = inode->mtime(), + .st_ctim = inode->ctime(), + .st_blksize = inode->blksize(), + .st_blocks = inode->blocks(), + }; + + TRY(write_to_user(user_buf, &buf, sizeof(struct stat))); return 0; } - BAN::ErrorOr Process::sys_fstatvfsat(int fd, const char* path, struct statvfs* buf) + BAN::ErrorOr Process::sys_fstatvfsat(int fd, const char* user_path, struct statvfs* user_buf) { - LockGuard _(m_process_lock); - if (path != nullptr) - TRY(validate_string_access(path)); - TRY(validate_pointer_access(buf, sizeof(struct statvfs), true)); + char path[PATH_MAX]; + if (user_path != nullptr) + TRY(read_string_from_user(user_path, path, PATH_MAX)); - auto inode = TRY(find_file(fd, path, 0)).inode; - auto* fs = inode->filesystem(); + auto inode = TRY(find_file(fd, user_path ? path : nullptr, 0)).inode; + + const auto* fs = inode->filesystem(); if (fs == nullptr) { ASSERT(inode->mode().ifsock() || inode->mode().ififo()); @@ -2201,32 +2283,36 @@ namespace Kernel return BAN::Error::from_errno(EINVAL); } - buf->f_bsize = fs->bsize(); - buf->f_frsize = fs->frsize(); - buf->f_blocks = fs->blocks(); - buf->f_bfree = fs->bfree(); - buf->f_bavail = fs->bavail(); - buf->f_files = fs->files(); - buf->f_ffree = fs->ffree(); - buf->f_favail = fs->favail(); - buf->f_fsid = fs->fsid(); - buf->f_flag = fs->flag(); - buf->f_namemax = fs->namemax(); + const struct statvfs buf { + .f_bsize = fs->bsize(), + .f_frsize = fs->frsize(), + .f_blocks = fs->blocks(), + .f_bfree = fs->bfree(), + .f_bavail = fs->bavail(), + .f_files = fs->files(), + .f_ffree = fs->ffree(), + .f_favail = fs->favail(), + .f_fsid = fs->fsid(), + .f_flag = fs->flag(), + .f_namemax = fs->namemax(), + }; + + TRY(write_to_user(user_buf, &buf, sizeof(struct statvfs))); return 0; } - BAN::ErrorOr Process::sys_realpath(const char* path, char* buffer) + BAN::ErrorOr Process::sys_realpath(const char* user_path, char* user_buffer) { - LockGuard _(m_process_lock); - TRY(validate_string_access(path)); - TRY(validate_pointer_access(buffer, PATH_MAX, true)); + char path[PATH_MAX]; + TRY(read_string_from_user(user_path, path, PATH_MAX)); auto file = TRY(find_file(AT_FDCWD, path, O_RDONLY)); if (file.canonical_path.size() >= PATH_MAX) return BAN::Error::from_errno(ENAMETOOLONG); - strcpy(buffer, file.canonical_path.data()); + TRY(write_to_user(user_buffer, file.canonical_path.data(), file.canonical_path.size() + 1)); + return file.canonical_path.size(); } @@ -2274,31 +2360,31 @@ namespace Kernel BAN::ErrorOr Process::sys_readdir(int fd, struct dirent* list, size_t list_len) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(list, sizeof(dirent) * list_len, true)); + if (BAN::Math::will_multiplication_overflow(list_len, sizeof(struct dirent))) + return BAN::Error::from_errno(EOVERFLOW); + + auto* list_region = TRY(validate_and_pin_pointer_access(list, list_len * sizeof(struct dirent), true)); + BAN::ScopeGuard _([list_region] { if (list_region) list_region->unpin(); }); return TRY(m_open_file_descriptors.read_dir_entries(fd, list, list_len)); } - BAN::ErrorOr Process::sys_getcwd(char* buffer, size_t size) + BAN::ErrorOr Process::sys_getcwd(char* user_buffer, size_t size) { - LockGuard _(m_process_lock); - - TRY(validate_pointer_access(buffer, size, true)); - if (size < m_working_directory.canonical_path.size() + 1) return BAN::Error::from_errno(ERANGE); - - memcpy(buffer, m_working_directory.canonical_path.data(), m_working_directory.canonical_path.size()); - buffer[m_working_directory.canonical_path.size()] = '\0'; - - return (long)buffer; + TRY(write_to_user(user_buffer, m_working_directory.canonical_path.data(), m_working_directory.canonical_path.size() + 1)); + return reinterpret_cast(user_buffer); } - BAN::ErrorOr Process::sys_chdir(const char* path) + BAN::ErrorOr Process::sys_chdir(const char* user_path) { + char path[PATH_MAX]; + TRY(read_string_from_user(user_path, path, PATH_MAX)); + + auto new_cwd = TRY(find_file(AT_FDCWD, path, O_SEARCH)); + LockGuard _(m_process_lock); - TRY(validate_string_access(path)); - m_working_directory = TRY(find_file(AT_FDCWD, path, O_SEARCH)); + m_working_directory = BAN::move(new_cwd); return 0; } @@ -2309,13 +2395,17 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_chroot(const char* path) + BAN::ErrorOr Process::sys_chroot(const char* user_path) { + char path[PATH_MAX]; + TRY(read_string_from_user(user_path, path, PATH_MAX)); + + auto new_root = TRY(find_file(AT_FDCWD, path, O_SEARCH)); + LockGuard _(m_process_lock); - TRY(validate_string_access(path)); if (!m_credentials.is_superuser()) return BAN::Error::from_errno(EACCES); - m_root_file = TRY(find_file(AT_FDCWD, path, O_SEARCH)); + m_root_file = BAN::move(new_root); return 0; } @@ -2354,11 +2444,7 @@ namespace Kernel BAN::ErrorOr Process::sys_mmap(const sys_mmap_t* user_args) { sys_mmap_t args; - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(user_args, sizeof(sys_mmap_t), false)); - args = *user_args; - } + TRY(read_from_user(user_args, &args, sizeof(sys_mmap_t))); if (args.prot != PROT_NONE && (args.prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))) return BAN::Error::from_errno(EINVAL); @@ -2383,7 +2469,7 @@ namespace Kernel else page_flags |= PageTable::Flags::UserSupervisor; - LockGuard _(m_process_lock); + RWLockWRGuard _(m_memory_region_lock); AddressRange address_range { .start = 0x400000, .end = USERSPACE_END }; if (args.flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) @@ -2512,7 +2598,7 @@ namespace Kernel if (auto rem = len % PAGE_SIZE) len += PAGE_SIZE - rem; - LockGuard _(m_process_lock); + RWLockWRGuard _(m_memory_region_lock); const size_t first_index = find_mapped_region(vaddr); for (size_t i = first_index; i < m_mapped_regions.size(); i++) @@ -2561,7 +2647,7 @@ namespace Kernel else flags |= PageTable::Flags::UserSupervisor; - LockGuard _(m_process_lock); + RWLockWRGuard _(m_memory_region_lock); const size_t first_index = find_mapped_region(vaddr); for (size_t i = first_index; i < m_mapped_regions.size(); i++) @@ -2602,7 +2688,7 @@ namespace Kernel if (flags != MS_SYNC && flags != MS_ASYNC && flags != MS_INVALIDATE) return BAN::Error::from_errno(EINVAL); - LockGuard _(m_process_lock); + RWLockRDGuard _(m_memory_region_lock); const vaddr_t vaddr = reinterpret_cast(addr); @@ -2661,17 +2747,18 @@ namespace Kernel return region_vaddr; } - BAN::ErrorOr Process::sys_ttyname(int fildes, char* name, size_t namesize) + BAN::ErrorOr Process::sys_ttyname(int fildes, char* user_buffer, size_t buffer_size) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(name, namesize, true)); auto inode = TRY(m_open_file_descriptors.inode_of(fildes)); if (!inode->is_tty()) return BAN::Error::from_errno(ENOTTY); + auto path = TRY(m_open_file_descriptors.path_of(fildes)); - if (namesize < path.size() + 1) + if (buffer_size < path.size() + 1) return BAN::Error::from_errno(ERANGE); - strcpy(name, path.data()); + + TRY(write_to_user(user_buffer, path.data(), path.size() + 1)); + return 0; } @@ -2710,20 +2797,17 @@ namespace Kernel return pts_master_fd; } - BAN::ErrorOr Process::sys_ptsname(int fildes, char* buffer, size_t buffer_len) + BAN::ErrorOr Process::sys_ptsname(int fildes, char* user_buffer, size_t buffer_len) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(buffer, buffer_len, true)); - - if (TRY(m_open_file_descriptors.path_of(fildes)) != ""_sv) + auto file = TRY(m_open_file_descriptors.file_of(fildes)); + if (file.canonical_path != ""_sv) return BAN::Error::from_errno(ENOTTY); - auto inode = TRY(m_open_file_descriptors.inode_of(fildes)); - auto ptsname = TRY(static_cast(inode.ptr())->ptsname()); + auto ptsname = TRY(static_cast(file.inode.ptr())->ptsname()); + if (buffer_len < ptsname.size() + 1) + return BAN::Error::from_errno(ERANGE); - const size_t to_copy = BAN::Math::min(ptsname.size(), buffer_len - 1); - memcpy(buffer, ptsname.data(), to_copy); - buffer[to_copy] = '\0'; + TRY(write_to_user(user_buffer, ptsname.data(), ptsname.size() + 1)); return 0; } @@ -2741,26 +2825,25 @@ namespace Kernel return 0; } + BAN::ErrorOr Process::sys_clock_gettime(clockid_t clock_id, timespec* user_tp) { - BAN::ErrorOr Process::sys_clock_gettime(clockid_t clock_id, timespec* tp) - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(tp, sizeof(timespec), true)); + timespec tp; switch (clock_id) { case CLOCK_MONOTONIC: - *tp = SystemTimer::get().time_since_boot(); + tp = SystemTimer::get().time_since_boot(); break; case CLOCK_REALTIME: - *tp = SystemTimer::get().real_time(); + tp = SystemTimer::get().real_time(); break; case CLOCK_PROCESS_CPUTIME_ID: { + LockGuard _(m_process_lock); uint64_t cpu_time_ns { 0 }; for (auto* thread : m_threads) cpu_time_ns += thread->cpu_time_ns(); - *tp = { + tp = { .tv_sec = static_cast(cpu_time_ns / 1'000'000'000), .tv_nsec = static_cast(cpu_time_ns % 1'000'000'000), }; @@ -2769,7 +2852,7 @@ namespace Kernel case CLOCK_THREAD_CPUTIME_ID: { const auto cpu_time_ns = Thread::current().cpu_time_ns(); - *tp = { + tp = { .tv_sec = static_cast(cpu_time_ns / 1'000'000'000), .tv_nsec = static_cast(cpu_time_ns % 1'000'000'000), }; @@ -2780,20 +2863,25 @@ namespace Kernel return BAN::Error::from_errno(ENOTSUP); } + TRY(write_to_user(user_tp, &tp, sizeof(timespec))); + return 0; } - BAN::ErrorOr Process::sys_load_keymap(const char* path) + BAN::ErrorOr Process::sys_load_keymap(const char* user_path) { + char path[PATH_MAX]; + TRY(read_string_from_user(user_path, path, PATH_MAX)); + LockGuard _(m_process_lock); - TRY(validate_string_access(path)); if (!m_credentials.is_superuser()) return BAN::Error::from_errno(EPERM); auto absolute_path = TRY(absolute_path_of(path)); TRY(LibInput::KeyboardLayout::get().load_from_file(absolute_path)); + return 0; } @@ -2949,50 +3037,51 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_sigaction(int signal, const struct sigaction* act, struct sigaction* oact) + BAN::ErrorOr Process::sys_sigaction(int signal, const struct sigaction* user_act, struct sigaction* user_oact) { if (signal < _SIGMIN || signal > _SIGMAX) return BAN::Error::from_errno(EINVAL); - LockGuard _(m_process_lock); - if (act) - TRY(validate_pointer_access(act, sizeof(struct sigaction), false)); - if (oact) - TRY(validate_pointer_access(oact, sizeof(struct sigaction), true)); + struct sigaction new_act, old_act; + if (user_act != nullptr) + TRY(read_from_user(user_act, &new_act, sizeof(struct sigaction))); - SpinLockGuard signal_lock_guard(m_signal_lock); - - if (oact) - *oact = m_signal_handlers[signal]; - - if (act) - m_signal_handlers[signal] = *act; - - return 0; - } - - BAN::ErrorOr Process::sys_sigpending(sigset_t* set) - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(set, sizeof(sigset_t), true)); - *set = (signal_pending_mask() | Thread::current().m_signal_pending_mask) & Thread::current().m_signal_block_mask; - return 0; - } - - BAN::ErrorOr Process::sys_sigprocmask(int how, const sigset_t* set, sigset_t* oset) - { - LockGuard _(m_process_lock); - if (set) - TRY(validate_pointer_access(set, sizeof(sigset_t), false)); - if (oset) - TRY(validate_pointer_access(oset, sizeof(sigset_t), true)); - - if (oset) - *oset = Thread::current().m_signal_block_mask; - - if (set) { - const sigset_t mask = *set & ~(SIGKILL | SIGSTOP); + SpinLockGuard signal_lock_guard(m_signal_lock); + old_act = m_signal_handlers[signal]; + if (user_act != nullptr) + m_signal_handlers[signal] = new_act; + } + + if (user_oact != nullptr) + TRY(write_to_user(user_oact, &old_act, sizeof(struct sigaction))); + + return 0; + } + + BAN::ErrorOr Process::sys_sigpending(sigset_t* user_sigset) + { + const sigset_t sigset = (signal_pending_mask() | Thread::current().m_signal_pending_mask) & Thread::current().m_signal_block_mask; + TRY(write_to_user(user_sigset, &sigset, sizeof(sigset_t))); + return 0; + } + + BAN::ErrorOr Process::sys_sigprocmask(int how, const sigset_t* user_set, sigset_t* user_oset) + { + LockGuard _(m_process_lock); + + if (user_oset != nullptr) + { + const sigset_t current = Thread::current().m_signal_block_mask; + TRY(write_to_user(user_oset, ¤t, sizeof(sigset_t))); + } + + if (user_set != nullptr) + { + sigset_t set; + TRY(read_from_user(user_set, &set, sizeof(sigset_t))); + + const sigset_t mask = set & ~(SIGKILL | SIGSTOP); switch (how) { case SIG_BLOCK: @@ -3012,36 +3101,41 @@ namespace Kernel return 0; } - BAN::ErrorOr Process::sys_sigsuspend(const sigset_t* set) + BAN::ErrorOr Process::sys_sigsuspend(const sigset_t* user_set) { + sigset_t set; + TRY(read_from_user(user_set, &set, sizeof(sigset_t))); + LockGuard _(m_process_lock); - TRY(validate_pointer_access(set, sizeof(sigset_t), false)); auto& thread = Thread::current(); - thread.set_suspend_signal_mask(*set & ~(SIGKILL | SIGSTOP)); + thread.set_suspend_signal_mask(set & ~(SIGKILL | SIGSTOP)); + // FIXME: i *think* here is a race condition as kill doesnt hold process lock while (!thread.is_interrupted_by_signal()) Processor::scheduler().block_current_thread(nullptr, -1, &m_process_lock); return BAN::Error::from_errno(EINTR); } - BAN::ErrorOr Process::sys_sigwait(const sigset_t* set, int* sig) + BAN::ErrorOr Process::sys_sigwait(const sigset_t* user_set, int* user_signal) { + sigset_t set; + TRY(read_from_user(user_set, &set, sizeof(sigset_t))); + LockGuard _(m_process_lock); auto& thread = Thread::current(); for (;;) { - TRY(validate_pointer_access(set, sizeof(sigset_t), false)); - TRY(validate_pointer_access(sig, sizeof(int), true)); + BAN::Optional signal; { SpinLockGuard _1(thread.m_signal_lock); SpinLockGuard _2(m_signal_lock); const uint64_t pending = thread.m_signal_pending_mask | this->m_signal_pending_mask; - if (const auto wait_mask = pending & *set) + if (const auto wait_mask = pending & set) { for (size_t i = _SIGMIN; i <= _SIGMAX; i++) { @@ -3050,32 +3144,40 @@ namespace Kernel continue; thread.m_signal_pending_mask &= ~mask; this->m_signal_pending_mask &= ~mask; - *sig = i; - return 0; + signal = i; + break; } - ASSERT_NOT_REACHED(); + ASSERT(signal.has_value()); } } + if (signal.has_value()) + { + TRY(write_to_user(user_signal, &signal.value(), sizeof(int))); + return 0; + } + + // FIXME: i *think* here is a race condition as kill doesnt hold process lock Processor::scheduler().block_current_thread(nullptr, -1, &m_process_lock); } } - BAN::ErrorOr Process::sys_sigaltstack(const stack_t* ss, stack_t* oss) + BAN::ErrorOr Process::sys_sigaltstack(const stack_t* user_ss, stack_t* user_oss) { - LockGuard _(m_process_lock); - if (ss != nullptr) - TRY(validate_pointer_access(ss, sizeof(stack_t), false)); - if (oss != nullptr) - TRY(validate_pointer_access(oss, sizeof(stack_t), true)); + stack_t ss, oss; + if (user_ss != nullptr) + TRY(read_from_user(user_ss, &ss, sizeof(stack_t))); - TRY(Thread::current().sigaltstack(ss, oss)); + TRY(Thread::current().sigaltstack(user_ss ? &ss : nullptr, user_oss ? &oss : nullptr)); + + if (user_oss != nullptr) + TRY(write_to_user(user_oss, &oss, sizeof(stack_t))); return 0; } - BAN::ErrorOr Process::sys_futex(int op, const uint32_t* addr, uint32_t val, const timespec* abstime) + BAN::ErrorOr Process::sys_futex(int op, const uint32_t* addr, uint32_t val, const timespec* user_abstime) { const vaddr_t vaddr = reinterpret_cast(addr); if (vaddr % 4) @@ -3096,32 +3198,32 @@ namespace Kernel case FUTEX_WAIT: break; case FUTEX_WAKE: - abstime = nullptr; + user_abstime = nullptr; break; default: return BAN::Error::from_errno(ENOSYS); } - const uint64_t wake_time_ns = - TRY([abstime, is_realtime, this]() -> BAN::ErrorOr + uint64_t wake_time_ns = BAN::numeric_limits::max(); + + if (user_abstime != nullptr) + { + timespec abstime; + TRY(read_from_user(user_abstime, &abstime, sizeof(timespec))); + + const uint64_t abstime_ns = abstime.tv_sec * 1'000'000'000 + abstime.tv_nsec; + + if (!is_realtime) + wake_time_ns = abstime_ns; + else { - if (abstime == nullptr) - return BAN::numeric_limits::max(); - const uint64_t abs_ns = - TRY([abstime, this]() -> BAN::ErrorOr - { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(abstime, sizeof(*abstime), false)); - return abstime->tv_sec * 1'000'000'000 + abstime->tv_nsec; - }()); - if (!is_realtime) - return abs_ns; const auto realtime = SystemTimer::get().real_time(); - const uint64_t real_ns = realtime.tv_sec * 1'000'000'000 + realtime.tv_nsec; - if (abs_ns <= real_ns) + const uint64_t realtime_ns = realtime.tv_sec * 1'000'000'000 + realtime.tv_nsec; + if (abstime_ns <= realtime_ns) return BAN::Error::from_errno(ETIMEDOUT); - return SystemTimer::get().ns_since_boot() + (abs_ns - real_ns); - }()); + wake_time_ns = SystemTimer::get().ns_since_boot() + (abstime_ns - realtime_ns); + } + } if (op == FUTEX_WAIT && BAN::atomic_load(*addr) != val) return BAN::Error::from_errno(EAGAIN); @@ -3222,13 +3324,10 @@ namespace Kernel return Thread::current().get_gsbase(); } - BAN::ErrorOr Process::sys_pthread_create(const pthread_attr_t* attr, void (*entry)(void*), void* arg) + BAN::ErrorOr Process::sys_pthread_create(const pthread_attr_t* user_attr, void (*entry)(void*), void* arg) { - if (attr) - { - TRY(validate_pointer_access(attr, sizeof(*attr), false)); + if (user_attr != nullptr) dwarnln("TODO: ignoring thread attr"); - } LockGuard _(m_process_lock); @@ -3256,51 +3355,49 @@ namespace Kernel ASSERT_NOT_REACHED(); } - BAN::ErrorOr Process::sys_pthread_join(pthread_t thread, void** value) + BAN::ErrorOr Process::sys_pthread_join(pthread_t thread, void** user_value) { LockGuard _(m_process_lock); - if (value) - TRY(validate_pointer_access(value, sizeof(void*), true)); - if (thread == Thread::current().tid()) return BAN::Error::from_errno(EINVAL); - const auto wait_thread = - [&]() -> bool + const auto check_thread = + [&]() -> BAN::Optional { for (size_t i = 0; i < m_exited_pthreads.size(); i++) { if (m_exited_pthreads[i].thread != thread) continue; - if (value) - *value = m_exited_pthreads[i].value; + void* ret = m_exited_pthreads[i].value; + m_exited_pthreads.remove(i); - return true; + return ret; } - return false; + return {}; }; - if (wait_thread()) - return 0; - - { - bool found = false; - for (auto* _thread : m_threads) - if (_thread->tid() == thread) - found = true; - if (!found) - return BAN::Error::from_errno(EINVAL); - } - for (;;) { - TRY(Thread::current().block_or_eintr_indefinite(m_pthread_exit_blocker, &m_process_lock)); - if (wait_thread()) + if (auto ret = check_thread(); ret.has_value()) + { + TRY(write_to_user(user_value, &ret.value(), sizeof(void*))); return 0; + } + + { + bool found = false; + for (auto* _thread : m_threads) + if (_thread->tid() == thread) + found = true; + if (!found) + return BAN::Error::from_errno(EINVAL); + } + + TRY(Thread::current().block_or_eintr_indefinite(m_pthread_exit_blocker, &m_process_lock)); } } @@ -3692,27 +3789,40 @@ namespace Kernel return BAN::Error::from_errno(error); } - BAN::ErrorOr Process::sys_getgroups(gid_t groups[], size_t count) + BAN::ErrorOr Process::sys_getgroups(gid_t user_groups[], size_t count) { + if (BAN::Math::will_multiplication_overflow(count, sizeof(gid_t))) + return BAN::Error::from_errno(EOVERFLOW); + LockGuard _(m_process_lock); + const auto current = m_credentials.groups(); if (count == 0) return current.size(); - TRY(validate_pointer_access(groups, count * sizeof(gid_t), true)); + if (current.size() > count) return BAN::Error::from_errno(EINVAL); - for (size_t i = 0; i < current.size(); i++) - groups[i] = current[i]; + + TRY(write_to_user(user_groups, current.data(), current.size() * sizeof(gid_t))); + return current.size(); } BAN::ErrorOr Process::sys_setgroups(const gid_t groups[], size_t count) { + if (BAN::Math::will_multiplication_overflow(count, sizeof(gid_t))) + return BAN::Error::from_errno(EOVERFLOW); + LockGuard _(m_process_lock); - TRY(validate_pointer_access(groups, count * sizeof(gid_t), true)); + if (!m_credentials.is_superuser()) return BAN::Error::from_errno(EPERM); + + auto* region = TRY(validate_and_pin_pointer_access(groups, count * sizeof(gid_t), false)); + BAN::ScopeGuard pin_guard([region] { if (region) region->unpin(); }); + TRY(m_credentials.set_groups({ groups, count })); + return 0; } @@ -3735,99 +3845,338 @@ namespace Kernel return absolute_path; } - BAN::ErrorOr Process::validate_string_access(const char* str) + BAN::ErrorOr Process::allocate_page_for_demand_paging(vaddr_t address, bool wants_write, bool wants_exec) { - // NOTE: we will page fault here, if str is not actually mapped - // outcome is still the same; SIGSEGV - return validate_pointer_access(str, strlen(str) + 1, false); + ASSERT(&Process::current() == this); + + const auto is_allocated = + [&]() -> bool + { + auto wanted_flags = PageTable::Flags::UserSupervisor | PageTable::Flags::Present; + if (wants_write) + wanted_flags |= PageTable::Flags::ReadWrite; + if (wants_exec) + wanted_flags |= PageTable::Flags::Execute; + if ((m_page_table->get_page_flags(address & PAGE_ADDR_MASK) & wanted_flags) == wanted_flags) + return true; + return false; + }; + + { + RWLockRDGuard _(m_memory_region_lock); + if (is_allocated()) + return true; + } + + RWLockWRGuard _(m_memory_region_lock); + + if (is_allocated()) + return true; + + const size_t index = find_mapped_region(address); + if (index >= m_mapped_regions.size()) + return false; + + auto& region = m_mapped_regions[index]; + if (!region->contains(address)) + return false; + + return region->allocate_page_containing(address, wants_write); } - BAN::ErrorOr Process::validate_pointer_access(const void* ptr, size_t size, bool needs_write) + // TODO: The following 3 functions could be simplified into one generic helper function + + BAN::ErrorOr Process::read_from_user(const void* user_addr, void* out, size_t size) { - if (size == 0) - return {}; + const vaddr_t user_vaddr = reinterpret_cast(user_addr); - const auto page_flags = (needs_write ? PageTable::Flags::ReadWrite : 0) | PageTable::Flags::Present; + auto* out_u8 = static_cast(out); + size_t ncopied = 0; - // Make sure all of the pages are mapped here, so demand paging does not happen - // while processing syscall. - - const vaddr_t vaddr = reinterpret_cast(ptr); - const vaddr_t page_start = vaddr & PAGE_ADDR_MASK; - const size_t page_count = range_page_count(vaddr, size); - for (size_t i = 0; i < page_count; i++) { - const vaddr_t current = page_start + i * PAGE_SIZE; - if ((page_table().get_page_flags(current) & page_flags) == page_flags) - continue; - if (!TRY(Process::allocate_page_for_demand_paging(current, needs_write, false))) - { - const MemoryRegion* region_ptr = nullptr; - for (const auto& region : m_mapped_regions) - { - if (!region->contains(vaddr + i * PAGE_SIZE)) - continue; - region_ptr = region.ptr(); - break; - } + RWLockRDGuard _(m_memory_region_lock); - dwarnln("{} pid {}, tid {} made an invalid {} access to 0x{h}", - name(), - pid(), - Thread::current().tid(), - needs_write ? "write" : "read", - vaddr + i * PAGE_SIZE + const size_t first_index = find_mapped_region(user_vaddr); + for (size_t i = first_index; ncopied < size && i < m_mapped_regions.size(); i++) + { + auto& region = m_mapped_regions[i]; + if (!region->contains(user_vaddr + ncopied)) + return BAN::Error::from_errno(EFAULT); + + const size_t ncopy = BAN::Math::min( + (region->vaddr() + region->size()) - (user_vaddr + ncopied), + size - ncopied ); - dwarnln(" 0x{h}->0x{h}", vaddr, vaddr + size); - - if (region_ptr == nullptr) - dwarnln(" no mapping covers this page"); - else + const size_t page_count = range_page_count(user_vaddr + ncopied, ncopy); + const vaddr_t page_base = (user_vaddr + ncopied) & PAGE_ADDR_MASK; + for (size_t p = 0; p < page_count; p++) { - dwarnln(" inside region 0x{h}->0x{h} {}{}{}", - region_ptr->vaddr(), - region_ptr->vaddr() + region_ptr->size(), - (region_ptr->flags() & PageTable::Flags::Present) ? 'r' : '-', - (region_ptr->flags() & PageTable::Flags::ReadWrite) ? 'w' : '-', - (region_ptr->flags() & PageTable::Flags::Execute) ? 'x' : '-' - ); + const auto flags = PageTable::UserSupervisor | PageTable::Present; + if ((m_page_table->get_page_flags(page_base + p * PAGE_SIZE) & flags) != flags) + goto read_from_user_with_allocation; } - page_table().debug_dump(); + memcpy(out_u8 + ncopied, reinterpret_cast(user_vaddr + ncopied), ncopy); + ncopied += ncopy; + } - Debug::dump_stack_trace(); - MUST(sys_kill(pid(), SIGSEGV)); - return BAN::Error::from_errno(EINTR); + if (ncopied >= size) + return {}; + if (ncopied > 0) + return BAN::Error::from_errno(EFAULT); + } + + read_from_user_with_allocation: + RWLockWRGuard _(m_memory_region_lock); + + const size_t first_index = find_mapped_region(user_vaddr + ncopied); + for (size_t i = first_index; ncopied < size && i < m_mapped_regions.size(); i++) + { + auto& region = m_mapped_regions[i]; + if (!region->contains(user_vaddr + ncopied)) + return BAN::Error::from_errno(EFAULT); + + const size_t ncopy = BAN::Math::min( + (region->vaddr() + region->size()) - (user_vaddr + ncopied), + size - ncopied + ); + + const size_t page_count = range_page_count(user_vaddr + ncopied, ncopy); + const vaddr_t page_base = (user_vaddr + ncopied) & PAGE_ADDR_MASK; + for (size_t p = 0; p < page_count; p++) + { + const auto flags = PageTable::UserSupervisor | PageTable::Present; + if ((m_page_table->get_page_flags(page_base + p * PAGE_SIZE) & flags) == flags) + continue; + if (!TRY(region->allocate_page_containing(page_base + p * PAGE_SIZE, false))) + return BAN::Error::from_errno(EFAULT); + } + + memcpy(out_u8 + ncopied, reinterpret_cast(user_vaddr + ncopied), ncopy); + ncopied += ncopy; + } + + if (ncopied >= size) + return {}; + return BAN::Error::from_errno(EFAULT); + } + + BAN::ErrorOr Process::read_string_from_user(const char* user_addr, char* out, size_t max_size) + { + const vaddr_t user_vaddr = reinterpret_cast(user_addr); + + size_t ncopied = 0; + + { + RWLockRDGuard _(m_memory_region_lock); + + const size_t first_index = find_mapped_region(user_vaddr); + for (size_t i = first_index; ncopied < max_size && i < m_mapped_regions.size(); i++) + { + auto& region = m_mapped_regions[i]; + if (!region->contains(user_vaddr + ncopied)) + return BAN::Error::from_errno(EFAULT); + + vaddr_t last_page = 0; + + for (; ncopied < max_size; ncopied++) + { + const vaddr_t curr_page = (user_vaddr + ncopied) & PAGE_ADDR_MASK; + if (curr_page != last_page) + { + const auto flags = PageTable::UserSupervisor | PageTable::Present; + if ((m_page_table->get_page_flags(curr_page) & flags) != flags) + goto read_string_from_user_with_allocation; + } + + out[ncopied] = user_addr[ncopied]; + if (out[ncopied] == '\0') + return {}; + + last_page = curr_page; + } + } + + if (ncopied >= max_size) + return BAN::Error::from_errno(ENAMETOOLONG); + if (ncopied > 0) + return BAN::Error::from_errno(EFAULT); + } + + read_string_from_user_with_allocation: + RWLockWRGuard _(m_memory_region_lock); + + const size_t first_index = find_mapped_region(user_vaddr + ncopied); + for (size_t i = first_index; ncopied < max_size && i < m_mapped_regions.size(); i++) + { + auto& region = m_mapped_regions[i]; + if (!region->contains(user_vaddr + ncopied)) + return BAN::Error::from_errno(EFAULT); + + vaddr_t last_page = 0; + + for (; ncopied < max_size; ncopied++) + { + const vaddr_t curr_page = (user_vaddr + ncopied) & PAGE_ADDR_MASK; + if (curr_page != last_page) + { + const auto flags = PageTable::UserSupervisor | PageTable::Present; + if ((m_page_table->get_page_flags(curr_page) & flags) == flags) + ; + else if (!TRY(region->allocate_page_containing(curr_page, false))) + return BAN::Error::from_errno(EFAULT); + } + + out[ncopied] = user_addr[ncopied]; + if (out[ncopied] == '\0') + return {}; + + last_page = curr_page; } } - return {}; + if (ncopied >= max_size) + return BAN::Error::from_errno(ENAMETOOLONG); + return BAN::Error::from_errno(EFAULT); + } + + BAN::ErrorOr Process::write_to_user(void* user_addr, const void* in, size_t size) + { + const vaddr_t user_vaddr = reinterpret_cast(user_addr); + + const auto* in_u8 = static_cast(in); + size_t ncopied = 0; + + { + RWLockRDGuard _(m_memory_region_lock); + + const size_t first_index = find_mapped_region(user_vaddr); + for (size_t i = first_index; ncopied < size && i < m_mapped_regions.size(); i++) + { + auto& region = m_mapped_regions[i]; + if (!region->contains(user_vaddr + ncopied)) + return BAN::Error::from_errno(EFAULT); + + const size_t ncopy = BAN::Math::min( + (region->vaddr() + region->size()) - (user_vaddr + ncopied), + size - ncopied + ); + + const size_t page_count = range_page_count(user_vaddr + ncopied, ncopy); + const vaddr_t page_base = (user_vaddr + ncopied) & PAGE_ADDR_MASK; + for (size_t i = 0; i < page_count; i++) + { + const auto flags = PageTable::UserSupervisor | PageTable::ReadWrite | PageTable::Present; + if ((m_page_table->get_page_flags(page_base + i * PAGE_SIZE) & flags) != flags) + goto write_to_user_with_allocation; + } + + memcpy(reinterpret_cast(user_vaddr + ncopied), in_u8 + ncopied, ncopy); + ncopied += ncopy; + } + + if (ncopied >= size) + return {}; + if (ncopied > 0) + return BAN::Error::from_errno(EFAULT); + } + + write_to_user_with_allocation: + RWLockWRGuard _(m_memory_region_lock); + + const size_t first_index = find_mapped_region(user_vaddr + ncopied); + for (size_t i = first_index; ncopied < size && i < m_mapped_regions.size(); i++) + { + auto& region = m_mapped_regions[i]; + if (!region->contains(user_vaddr + ncopied)) + return BAN::Error::from_errno(EFAULT); + + const size_t ncopy = BAN::Math::min( + (region->vaddr() + region->size()) - (user_vaddr + ncopied), + size - ncopied + ); + + const size_t page_count = range_page_count(user_vaddr + ncopied, ncopy); + const vaddr_t page_base = (user_vaddr + ncopied) & PAGE_ADDR_MASK; + for (size_t p = 0; p < page_count; p++) + { + const auto flags = PageTable::UserSupervisor | PageTable::ReadWrite | PageTable::Present; + if ((m_page_table->get_page_flags(page_base + p * PAGE_SIZE) & flags) == flags) + continue; + if (!TRY(region->allocate_page_containing(page_base + p * PAGE_SIZE, true))) + return BAN::Error::from_errno(EFAULT); + } + + memcpy(reinterpret_cast(user_vaddr + ncopied), in_u8 + ncopied, ncopy); + ncopied += ncopy; + } + + if (ncopied >= size) + return {}; + return BAN::Error::from_errno(EFAULT); } BAN::ErrorOr Process::validate_and_pin_pointer_access(const void* ptr, size_t size, bool needs_write) { - LockGuard _(m_process_lock); - TRY(validate_pointer_access(ptr, size, needs_write)); + // TODO: allow pinning multiple regions? - // FIXME: make stack MemoryRegion? - // FIXME: allow pinning multiple regions + const vaddr_t user_vaddr = reinterpret_cast(ptr); - const vaddr_t vaddr = reinterpret_cast(ptr); + { + RWLockRDGuard _(m_memory_region_lock); - const size_t first_index = find_mapped_region(vaddr); + const size_t first_index = find_mapped_region(user_vaddr); + for (size_t i = first_index; i < m_mapped_regions.size(); i++) + { + auto& region = m_mapped_regions[i]; + if (user_vaddr >= region->vaddr() + region->size()) + break; + if (!region->contains_fully(user_vaddr, size)) + continue; + + const size_t page_count = range_page_count(user_vaddr, size); + const vaddr_t page_base = user_vaddr & PAGE_ADDR_MASK; + for (size_t p = 0; p < page_count; p++) + { + const auto flags = PageTable::UserSupervisor | (needs_write ? PageTable::ReadWrite : 0) | PageTable::Present; + if ((m_page_table->get_page_flags(page_base + p * PAGE_SIZE) & flags) != flags) + goto validate_and_pin_pointer_access_with_allocation; + } + + region->pin(); + return region.ptr(); + } + } + + validate_and_pin_pointer_access_with_allocation: + RWLockWRGuard _(m_memory_region_lock); + + const size_t first_index = find_mapped_region(user_vaddr); for (size_t i = first_index; i < m_mapped_regions.size(); i++) { auto& region = m_mapped_regions[i]; - if (vaddr >= region->vaddr() + region->size()) + if (user_vaddr >= region->vaddr() + region->size()) break; - if (!region->contains_fully(vaddr, size)) + if (!region->contains_fully(user_vaddr, size)) continue; + + const size_t page_count = range_page_count(user_vaddr, size); + const vaddr_t page_base = user_vaddr & PAGE_ADDR_MASK; + for (size_t p = 0; p < page_count; p++) + { + const auto flags = PageTable::UserSupervisor | (needs_write ? PageTable::ReadWrite : 0) | PageTable::Present; + if ((m_page_table->get_page_flags(page_base + p * PAGE_SIZE) & flags) == flags) + continue; + if (!TRY(region->allocate_page_containing(page_base + p * PAGE_SIZE, needs_write))) + return BAN::Error::from_errno(EFAULT); + } + region->pin(); return region.ptr(); } - return nullptr; + return BAN::Error::from_errno(EFAULT); } } diff --git a/kernel/kernel/Thread.cpp b/kernel/kernel/Thread.cpp index 287562f7..51dd51a6 100644 --- a/kernel/kernel/Thread.cpp +++ b/kernel/kernel/Thread.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -218,13 +219,16 @@ namespace Kernel true, true )); - thread->m_userspace_stack = TRY(VirtualRange::create_to_vaddr_range( + auto userspace_stack = TRY(MemoryBackedRegion::create( page_table, - stack_addr_start, USERSPACE_END, userspace_stack_size, + { stack_addr_start, USERSPACE_END }, + MemoryRegion::Type::PRIVATE, PageTable::Flags::UserSupervisor | PageTable::Flags::ReadWrite | PageTable::Flags::Present, - false, true + O_RDWR )); + thread->m_userspace_stack = userspace_stack.ptr(); + TRY(process->add_mapped_region(BAN::move(userspace_stack))); thread_deleter.disable(); @@ -319,16 +323,18 @@ namespace Kernel { auto* thread = TRY(create_userspace(m_process, m_process->page_table())); - save_sse(); + if (Processor::get_current_sse_thread() == this) + save_sse(); memcpy(thread->m_sse_storage, m_sse_storage, sizeof(m_sse_storage)); - TRY(thread->userspace_stack().allocate_page_for_demand_paging(thread->userspace_stack_top() - PAGE_SIZE)); - PageTable::with_fast_page(thread->userspace_stack().paddr_of(thread->userspace_stack_top() - PAGE_SIZE), [=] { - PageTable::fast_page_as(PAGE_SIZE - sizeof(uintptr_t)) = arg; - }); + TRY(thread->userspace_stack().copy_data_to_region( + thread->m_userspace_stack->size() - sizeof(void*), + reinterpret_cast(&arg), + sizeof(void*) + )); const vaddr_t entry_addr = reinterpret_cast(entry); - thread->setup_exec(entry_addr, thread->userspace_stack_top() - sizeof(uintptr_t)); + thread->setup_exec(entry_addr, thread->userspace_stack().vaddr() + thread->userspace_stack().size() - sizeof(void*)); return thread; } @@ -346,14 +352,18 @@ namespace Kernel thread->m_is_userspace = true; thread->m_kernel_stack = TRY(m_kernel_stack->clone(new_process->page_table())); - thread->m_userspace_stack = TRY(m_userspace_stack->clone(new_process->page_table())); + + const auto stack_index = new_process->find_mapped_region(m_userspace_stack->vaddr()); + thread->m_userspace_stack = static_cast(new_process->m_mapped_regions[stack_index].ptr()); + ASSERT(thread->m_userspace_stack->vaddr() == m_userspace_stack->vaddr()); thread->m_fsbase = m_fsbase; thread->m_gsbase = m_gsbase; thread->m_state = State::NotStarted; - save_sse(); + if (Processor::get_current_sse_thread() == this) + save_sse(); memcpy(thread->m_sse_storage, m_sse_storage, sizeof(m_sse_storage)); thread->m_yield_registers = {}; @@ -397,29 +407,18 @@ namespace Kernel if (needed_size > m_userspace_stack->size()) return BAN::Error::from_errno(ENOBUFS); - vaddr_t vaddr = userspace_stack_top() - needed_size; + vaddr_t vaddr = userspace_stack().vaddr() + userspace_stack().size() - needed_size; const size_t page_count = BAN::Math::div_round_up(needed_size, PAGE_SIZE); for (size_t i = 0; i < page_count; i++) - TRY(m_userspace_stack->allocate_page_for_demand_paging(vaddr + i * PAGE_SIZE)); + TRY(m_userspace_stack->allocate_page_containing(vaddr + i * PAGE_SIZE, true)); const auto stack_copy_buf = [this](BAN::ConstByteSpan buffer, vaddr_t vaddr) -> void { - ASSERT(vaddr + buffer.size() <= userspace_stack_top()); - - size_t bytes_copied = 0; - while (bytes_copied < buffer.size()) - { - const size_t to_copy = BAN::Math::min(buffer.size() - bytes_copied, PAGE_SIZE - (vaddr % PAGE_SIZE)); - - PageTable::with_fast_page(userspace_stack().paddr_of(vaddr & PAGE_ADDR_MASK), [=]() { - memcpy(PageTable::fast_page_as_ptr(vaddr % PAGE_SIZE), buffer.data() + bytes_copied, to_copy); - }); - - vaddr += to_copy; - bytes_copied += to_copy; - } + ASSERT(vaddr >= m_userspace_stack->vaddr()); + ASSERT(vaddr + buffer.size() <= m_userspace_stack->vaddr() + m_userspace_stack->size()); + MUST(m_userspace_stack->copy_data_to_region(vaddr - m_userspace_stack->vaddr(), buffer.data(), buffer.size())); }; const auto stack_push_buf = @@ -471,7 +470,7 @@ namespace Kernel stack_push_str(envp[i]); } - setup_exec(entry, userspace_stack_top() - needed_size); + setup_exec(entry, m_userspace_stack->vaddr() + m_userspace_stack->size() - needed_size); return {}; }