Kernel: Fix deadlock caused by multithreading

This allows multiple threads to concurrently call the most common
blocking syscalls:
- read
- write
- accept
- connect
- sendto
- recv
- pselect

This prevents a dead lock when for example process is waiting on a pipe,
but unable to write to it since process is locked.

This is the beginning of starting to remove processes own lock from
syscall and locking only necessary parts.
This commit is contained in:
Bananymous 2025-04-22 04:32:47 +03:00
parent c790bad667
commit f8e3ae0525
2 changed files with 166 additions and 84 deletions

View File

@ -241,6 +241,7 @@ namespace Kernel
BAN::ErrorOr<void> validate_string_access(const char*); BAN::ErrorOr<void> validate_string_access(const char*);
BAN::ErrorOr<void> validate_pointer_access_check(const void*, size_t, bool needs_write); BAN::ErrorOr<void> validate_pointer_access_check(const void*, size_t, bool needs_write);
BAN::ErrorOr<void> validate_pointer_access(const void*, size_t, bool needs_write); BAN::ErrorOr<void> validate_pointer_access(const void*, size_t, bool needs_write);
BAN::ErrorOr<MemoryRegion*> validate_and_pin_pointer_access(const void*, size_t, bool needs_write);
uint64_t signal_pending_mask() const uint64_t signal_pending_mask() const
{ {

View File

@ -1051,28 +1051,36 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_close(int fd) BAN::ErrorOr<long> Process::sys_close(int fd)
{ {
LockGuard _(m_process_lock);
TRY(m_open_file_descriptors.close(fd)); TRY(m_open_file_descriptors.close(fd));
return 0; return 0;
} }
BAN::ErrorOr<long> Process::sys_read(int fd, void* buffer, size_t count) BAN::ErrorOr<long> Process::sys_read(int fd, void* buffer, size_t count)
{ {
LockGuard _(m_process_lock);
if (count == 0) if (count == 0)
{ {
TRY(m_open_file_descriptors.inode_of(fd)); TRY(m_open_file_descriptors.inode_of(fd));
return 0; return 0;
} }
TRY(validate_pointer_access(buffer, count, true));
return TRY(m_open_file_descriptors.read(fd, BAN::ByteSpan((uint8_t*)buffer, count))); // FIXME: buffer_region can be null as stack is not MemoryRegion
auto* buffer_region = TRY(validate_and_pin_pointer_access(buffer, count, true));
BAN::ScopeGuard _([&] { if (buffer_region) buffer_region->unpin(); });
return TRY(m_open_file_descriptors.read(fd, BAN::ByteSpan(static_cast<uint8_t*>(buffer), count)));
} }
BAN::ErrorOr<long> Process::sys_write(int fd, const void* buffer, size_t count) BAN::ErrorOr<long> Process::sys_write(int fd, const void* buffer, size_t count)
{ {
LockGuard _(m_process_lock); if (count == 0)
TRY(validate_pointer_access(buffer, count, false)); {
return TRY(m_open_file_descriptors.write(fd, BAN::ByteSpan((uint8_t*)buffer, count))); TRY(m_open_file_descriptors.inode_of(fd));
return 0;
}
// FIXME: buffer_region can be null as stack is not MemoryRegion
auto* buffer_region = TRY(validate_and_pin_pointer_access(buffer, count, false));
BAN::ScopeGuard _([&] { if (buffer_region) buffer_region->unpin(); });
return TRY(m_open_file_descriptors.write(fd, BAN::ConstByteSpan(static_cast<const uint8_t*>(buffer), count)));
} }
BAN::ErrorOr<long> Process::sys_access(const char* path, int amode) BAN::ErrorOr<long> Process::sys_access(const char* path, int amode)
@ -1296,19 +1304,24 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_accept(int socket, sockaddr* address, socklen_t* address_len, int flags) BAN::ErrorOr<long> Process::sys_accept(int socket, sockaddr* address, socklen_t* address_len, int flags)
{ {
if (address && !address_len) if (!address != !address_len)
return BAN::Error::from_errno(EINVAL);
if (!address && address_len)
return BAN::Error::from_errno(EINVAL); return BAN::Error::from_errno(EINVAL);
if (flags & ~(SOCK_NONBLOCK | SOCK_CLOEXEC)) if (flags & ~(SOCK_NONBLOCK | SOCK_CLOEXEC))
return BAN::Error::from_errno(EINVAL); return BAN::Error::from_errno(EINVAL);
LockGuard _(m_process_lock); MemoryRegion* address_region1 = nullptr;
if (address) MemoryRegion* address_region2 = nullptr;
{
TRY(validate_pointer_access(address_len, sizeof(*address_len), true)); BAN::ScopeGuard _([&] {
TRY(validate_pointer_access(address, *address_len, true)); if (address_region1)
} address_region1->unpin();
if (address_region2)
address_region2->unpin();
});
address_region1 = TRY(validate_and_pin_pointer_access(address_len, sizeof(address_len), true));
const socklen_t address_len_safe = address_len ? *address_len : 0;
address_region2 = TRY(validate_and_pin_pointer_access(address, address_len_safe, true));
auto inode = TRY(m_open_file_descriptors.inode_of(socket)); auto inode = TRY(m_open_file_descriptors.inode_of(socket));
if (!inode->mode().ifsock()) if (!inode->mode().ifsock())
@ -1338,13 +1351,13 @@ namespace Kernel
BAN::ErrorOr<long> Process::sys_connect(int socket, const sockaddr* address, socklen_t address_len) BAN::ErrorOr<long> Process::sys_connect(int socket, const sockaddr* address, socklen_t address_len)
{ {
LockGuard _(m_process_lock);
TRY(validate_pointer_access(address, address_len, false));
auto inode = TRY(m_open_file_descriptors.inode_of(socket)); auto inode = TRY(m_open_file_descriptors.inode_of(socket));
if (!inode->mode().ifsock()) if (!inode->mode().ifsock())
return BAN::Error::from_errno(ENOTSOCK); return BAN::Error::from_errno(ENOTSOCK);
auto* address_region = TRY(validate_and_pin_pointer_access(address, address_len, true));
BAN::ScopeGuard _([&] { if (address_region) address_region->unpin(); });
TRY(inode->connect(address, address_len)); TRY(inode->connect(address, address_len));
return 0; return 0;
} }
@ -1361,35 +1374,69 @@ namespace Kernel
return 0; return 0;
} }
BAN::ErrorOr<long> Process::sys_sendto(const sys_sendto_t* arguments) BAN::ErrorOr<long> Process::sys_sendto(const sys_sendto_t* _arguments)
{
sys_sendto_t arguments;
{ {
LockGuard _(m_process_lock); LockGuard _(m_process_lock);
TRY(validate_pointer_access(arguments, sizeof(sys_sendto_t), false)); TRY(validate_pointer_access(_arguments, sizeof(sys_sendto_t), false));
TRY(validate_pointer_access(arguments->message, arguments->length, false)); arguments = *_arguments;
TRY(validate_pointer_access(arguments->dest_addr, arguments->dest_len, false));
auto message = BAN::ConstByteSpan(static_cast<const uint8_t*>(arguments->message), arguments->length);
return TRY(m_open_file_descriptors.sendto(arguments->socket, message, arguments->dest_addr, arguments->dest_len));
} }
BAN::ErrorOr<long> Process::sys_recvfrom(sys_recvfrom_t* arguments) if (arguments.length == 0)
{
if (arguments->address && !arguments->address_len)
return BAN::Error::from_errno(EINVAL);
if (!arguments->address && arguments->address_len)
return BAN::Error::from_errno(EINVAL); return BAN::Error::from_errno(EINVAL);
MemoryRegion* message_region = nullptr;
MemoryRegion* address_region = nullptr;
BAN::ScopeGuard _([&] {
if (message_region)
message_region->unpin();
if (address_region)
address_region->unpin();
});
message_region = TRY(validate_and_pin_pointer_access(arguments.message, arguments.length, false));
address_region = TRY(validate_and_pin_pointer_access(arguments.dest_addr, arguments.dest_len, false));
auto message = BAN::ConstByteSpan(static_cast<const uint8_t*>(arguments.message), arguments.length);
return TRY(m_open_file_descriptors.sendto(arguments.socket, message, arguments.dest_addr, arguments.dest_len));
}
BAN::ErrorOr<long> Process::sys_recvfrom(sys_recvfrom_t* _arguments)
{
sys_recvfrom_t arguments;
{
LockGuard _(m_process_lock); LockGuard _(m_process_lock);
TRY(validate_pointer_access(arguments, sizeof(sys_recvfrom_t), false)); TRY(validate_pointer_access(_arguments, sizeof(sys_sendto_t), false));
TRY(validate_pointer_access(arguments->buffer, arguments->length, true)); arguments = *_arguments;
if (arguments->address)
{
TRY(validate_pointer_access(arguments->address_len, sizeof(*arguments->address_len), true));
TRY(validate_pointer_access(arguments->address, *arguments->address_len, true));
} }
auto message = BAN::ByteSpan(static_cast<uint8_t*>(arguments->buffer), arguments->length); if (!arguments.address != !arguments.address_len)
return TRY(m_open_file_descriptors.recvfrom(arguments->socket, message, arguments->address, arguments->address_len)); return BAN::Error::from_errno(EINVAL);
if (arguments.length == 0)
return BAN::Error::from_errno(EINVAL);
MemoryRegion* buffer_region = nullptr;
MemoryRegion* address_region1 = nullptr;
MemoryRegion* address_region2 = nullptr;
BAN::ScopeGuard _([&] {
if (buffer_region)
buffer_region->unpin();
if (address_region1)
address_region1->unpin();
if (address_region2)
address_region2->unpin();
});
buffer_region = TRY(validate_and_pin_pointer_access(arguments.buffer, arguments.length, true));
address_region1 = TRY(validate_and_pin_pointer_access(arguments.address_len, sizeof(*arguments.address_len), true));
const socklen_t address_len_safe = arguments.address_len ? *arguments.address_len : 0;
address_region2 = TRY(validate_and_pin_pointer_access(arguments.address, address_len_safe, true));
auto message = BAN::ByteSpan(static_cast<uint8_t*>(arguments.buffer), arguments.length);
return TRY(m_open_file_descriptors.recvfrom(arguments.socket, message, arguments.address, arguments.address_len));
} }
BAN::ErrorOr<long> Process::sys_ioctl(int fildes, int request, void* arg) BAN::ErrorOr<long> Process::sys_ioctl(int fildes, int request, void* arg)
@ -1399,32 +1446,51 @@ namespace Kernel
return TRY(inode->ioctl(request, arg)); return TRY(inode->ioctl(request, arg));
} }
BAN::ErrorOr<long> Process::sys_pselect(sys_pselect_t* arguments) BAN::ErrorOr<long> Process::sys_pselect(sys_pselect_t* _arguments)
{
sys_pselect_t arguments;
{ {
LockGuard _(m_process_lock); LockGuard _(m_process_lock);
TRY(validate_pointer_access(_arguments, sizeof(sys_pselect_t), false));
arguments = *_arguments;
}
TRY(validate_pointer_access(arguments, sizeof(sys_pselect_t), false)); MemoryRegion* readfd_region = nullptr;
if (arguments->readfds) MemoryRegion* writefd_region = nullptr;
TRY(validate_pointer_access(arguments->readfds, sizeof(fd_set), true)); MemoryRegion* errorfd_region = nullptr;
if (arguments->writefds) MemoryRegion* timeout_region = nullptr;
TRY(validate_pointer_access(arguments->writefds, sizeof(fd_set), true)); MemoryRegion* sigmask_region = nullptr;
if (arguments->errorfds)
TRY(validate_pointer_access(arguments->errorfds, sizeof(fd_set), true)); BAN::ScopeGuard _([&] {
if (arguments->timeout) if (readfd_region)
TRY(validate_pointer_access(arguments->timeout, sizeof(timespec), false)); readfd_region->unpin();
if (arguments->sigmask) if (writefd_region)
TRY(validate_pointer_access(arguments->sigmask, sizeof(sigset_t), false)); writefd_region->unpin();
if (errorfd_region)
errorfd_region->unpin();
if (timeout_region)
timeout_region->unpin();
if (sigmask_region)
sigmask_region->unpin();
});
readfd_region = TRY(validate_and_pin_pointer_access(arguments.readfds, sizeof(fd_set), true));
writefd_region = TRY(validate_and_pin_pointer_access(arguments.writefds, sizeof(fd_set), true));
errorfd_region = TRY(validate_and_pin_pointer_access(arguments.errorfds, sizeof(fd_set), true));
timeout_region = TRY(validate_and_pin_pointer_access(arguments.timeout, sizeof(timespec), false));
sigmask_region = TRY(validate_and_pin_pointer_access(arguments.sigmask, sizeof(sigset_t), false));
const auto old_sigmask = Thread::current().m_signal_block_mask; const auto old_sigmask = Thread::current().m_signal_block_mask;
if (arguments->sigmask) if (arguments.sigmask)
Thread::current().m_signal_block_mask = *arguments->sigmask; Thread::current().m_signal_block_mask = *arguments.sigmask;
BAN::ScopeGuard sigmask_restore([old_sigmask] { Thread::current().m_signal_block_mask = old_sigmask; }); BAN::ScopeGuard sigmask_restore([old_sigmask] { Thread::current().m_signal_block_mask = old_sigmask; });
uint64_t timedout_ns = SystemTimer::get().ns_since_boot(); uint64_t timedout_ns = SystemTimer::get().ns_since_boot();
if (arguments->timeout) if (arguments.timeout)
{ {
timedout_ns += arguments->timeout->tv_sec * 1'000'000'000; timedout_ns += arguments.timeout->tv_sec * 1'000'000'000;
timedout_ns += arguments->timeout->tv_nsec; timedout_ns += arguments.timeout->tv_nsec;
} }
fd_set readfds; FD_ZERO(&readfds); fd_set readfds; FD_ZERO(&readfds);
@ -1455,38 +1521,38 @@ namespace Kernel
} }
}; };
for (int i = 0; i < arguments->nfds; i++) for (int i = 0; i < arguments.nfds; i++)
{ {
update_fds(i, arguments->readfds, &readfds, &Inode::can_read); update_fds(i, arguments.readfds, &readfds, &Inode::can_read);
update_fds(i, arguments->writefds, &writefds, &Inode::can_write); update_fds(i, arguments.writefds, &writefds, &Inode::can_write);
update_fds(i, arguments->errorfds, &errorfds, &Inode::has_error); update_fds(i, arguments.errorfds, &errorfds, &Inode::has_error);
} }
if (set_bits > 0) if (set_bits > 0)
break; break;
if (arguments->timeout && SystemTimer::get().ns_since_boot() >= timedout_ns) if (arguments.timeout && SystemTimer::get().ns_since_boot() >= timedout_ns)
break; break;
LockFreeGuard free(m_process_lock); // FIXME: implement some multi thread blocker system?
TRY(Thread::current().sleep_or_eintr_ms(1)); TRY(Thread::current().sleep_or_eintr_ms(1));
} }
if (arguments->readfds) if (arguments.readfds)
FD_ZERO(arguments->readfds); FD_ZERO(arguments.readfds);
if (arguments->writefds) if (arguments.writefds)
FD_ZERO(arguments->writefds); FD_ZERO(arguments.writefds);
if (arguments->errorfds) if (arguments.errorfds)
FD_ZERO(arguments->errorfds); FD_ZERO(arguments.errorfds);
for (int i = 0; i < arguments->nfds; i++) for (int i = 0; i < arguments.nfds; i++)
{ {
if (arguments->readfds && FD_ISSET(i, &readfds)) if (arguments.readfds && FD_ISSET(i, &readfds))
FD_SET(i, arguments->readfds); FD_SET(i, arguments.readfds);
if (arguments->writefds && FD_ISSET(i, &writefds)) if (arguments.writefds && FD_ISSET(i, &writefds))
FD_SET(i, arguments->writefds); FD_SET(i, arguments.writefds);
if (arguments->errorfds && FD_ISSET(i, &errorfds)) if (arguments.errorfds && FD_ISSET(i, &errorfds))
FD_SET(i, arguments->errorfds); FD_SET(i, arguments.errorfds);
} }
return set_bits; return set_bits;
@ -2684,4 +2750,19 @@ unauthorized_access:
return {}; return {};
} }
BAN::ErrorOr<MemoryRegion*> Process::validate_and_pin_pointer_access(const void* ptr, size_t size, bool needs_write)
{
LockGuard _(m_process_lock);
TRY(validate_pointer_access(ptr, size, needs_write));
for (auto& region : m_mapped_regions)
{
if (!region->contains_fully(reinterpret_cast<vaddr_t>(ptr), size))
continue;
region->pin();
return region.ptr();
}
// FIXME: Make stack MemoryRegion?
return nullptr;
}
} }