From 9b875fb930b8f480e125ad87c65d3faebce41f7f Mon Sep 17 00:00:00 2001 From: Bananymous Date: Fri, 30 May 2025 22:06:12 +0300 Subject: [PATCH] Kernel: Make epoll work with different fds pointing to same inode --- kernel/include/kernel/Epoll.h | 43 ++++++++++++++-- kernel/kernel/Epoll.cpp | 97 ++++++++++++++++++++++++----------- kernel/kernel/Process.cpp | 8 +-- 3 files changed, 111 insertions(+), 37 deletions(-) diff --git a/kernel/include/kernel/Epoll.h b/kernel/include/kernel/Epoll.h index 1af406ee..551953da 100644 --- a/kernel/include/kernel/Epoll.h +++ b/kernel/include/kernel/Epoll.h @@ -1,10 +1,12 @@ #pragma once +#include #include #include #include #include +#include #include namespace Kernel @@ -16,7 +18,7 @@ namespace Kernel static BAN::ErrorOr> create(); ~Epoll(); - BAN::ErrorOr ctl(int op, BAN::RefPtr inode, epoll_event event); + BAN::ErrorOr ctl(int op, int fd, BAN::RefPtr inode, epoll_event event); BAN::ErrorOr wait(BAN::Span events, uint64_t waketime_ns); void notify(BAN::RefPtr inode, uint32_t event); @@ -59,10 +61,45 @@ namespace Kernel } }; + struct ListenEventList + { + BAN::Array events; + uint32_t bitmap[(OPEN_MAX + 31) / 32] {}; + + bool has_fd(int fd) const + { + if (fd < 0 || static_cast(fd) >= events.size()) + return false; + return bitmap[fd / 32] & (1u << (fd % 32)); + } + + bool empty() const + { + for (auto val : bitmap) + if (val != 0) + return false; + return true; + } + + void add_fd(int fd, epoll_event event) + { + ASSERT(!has_fd(fd)); + bitmap[fd / 32] |= (1u << (fd % 32)); + events[fd] = event; + } + + void remove_fd(int fd) + { + ASSERT(has_fd(fd)); + bitmap[fd / 32] &= ~(1u << (fd % 32)); + events[fd] = {}; + } + }; + private: ThreadBlocker m_thread_blocker; - BAN::HashMap, uint32_t, InodeRefPtrHash> m_ready_events; - BAN::HashMap, epoll_event, InodeRefPtrHash> m_listening_events; + BAN::HashMap, uint32_t, InodeRefPtrHash> m_ready_events; + BAN::HashMap, ListenEventList, InodeRefPtrHash> m_listening_events; }; } diff --git a/kernel/kernel/Epoll.cpp b/kernel/kernel/Epoll.cpp index 49a9eeb1..39a656eb 100644 --- a/kernel/kernel/Epoll.cpp +++ b/kernel/kernel/Epoll.cpp @@ -19,7 +19,7 @@ namespace Kernel inode->del_epoll(this); } - BAN::ErrorOr Epoll::ctl(int op, BAN::RefPtr inode, epoll_event event) + BAN::ErrorOr Epoll::ctl(int op, int fd, BAN::RefPtr inode, epoll_event event) { LockGuard _(m_mutex); @@ -28,27 +28,51 @@ namespace Kernel switch (op) { case EPOLL_CTL_ADD: - if (it != m_listening_events.end()) + { + if (it == m_listening_events.end()) + it = TRY(m_listening_events.emplace(inode)); + if (it->value.has_fd(fd)) return BAN::Error::from_errno(EEXIST); - TRY(m_listening_events.reserve(m_listening_events.size() + 1)); - TRY(m_ready_events.reserve(m_listening_events.size() + 1)); + TRY(m_ready_events.reserve(m_listening_events.size())); TRY(inode->add_epoll(this)); - MUST(m_listening_events.insert(inode, event)); - MUST(m_ready_events.insert(inode, event.events)); + it->value.add_fd(fd, event); + + auto ready_it = m_ready_events.find(inode); + if (ready_it == m_ready_events.end()) + ready_it = MUST(m_ready_events.insert(inode, 0)); + ready_it->value |= event.events; + return {}; + } case EPOLL_CTL_MOD: + { if (it == m_listening_events.end()) return BAN::Error::from_errno(ENOENT); - MUST(m_ready_events.emplace_or_assign(inode, event.events)); - it->value = event; + if (!it->value.has_fd(fd)) + return BAN::Error::from_errno(ENOENT); + it->value.events[fd] = event; + + auto ready_it = m_ready_events.find(inode); + if (ready_it == m_ready_events.end()) + ready_it = MUST(m_ready_events.insert(inode, 0)); + ready_it->value |= event.events; + return {}; + } case EPOLL_CTL_DEL: + { if (it == m_listening_events.end()) return BAN::Error::from_errno(ENOENT); - m_listening_events.remove(it); - m_ready_events.remove(inode); - inode->del_epoll(this); + if (!it->value.has_fd(fd)) + return BAN::Error::from_errno(ENOENT); + it->value.remove_fd(fd); + if (it->value.empty()) + { + m_listening_events.remove(it); + m_ready_events.remove(inode); + } return {}; + } } return BAN::Error::from_errno(EINVAL); @@ -56,6 +80,9 @@ namespace Kernel BAN::ErrorOr Epoll::wait(BAN::Span event_span, uint64_t waketime_ns) { + if (event_span.empty()) + return BAN::Error::from_errno(EINVAL); + size_t count = 0; for (;;) @@ -64,13 +91,17 @@ namespace Kernel { LockGuard _(m_mutex); + for (auto it = m_ready_events.begin(); it != m_ready_events.end() && count < event_span.size();) { auto& [inode, events] = *it; auto& listen = m_listening_events[inode]; - const uint32_t listen_mask = (listen.events & (EPOLLIN | EPOLLOUT)) | EPOLLERR | EPOLLHUP; + uint32_t listen_mask = EPOLLERR | EPOLLHUP; + for (int fd = 0; fd < OPEN_MAX; fd++) + if (listen.has_fd(fd)) + listen_mask |= listen.events[fd].events; events &= listen_mask; // This prevents a possible deadlock @@ -98,16 +129,27 @@ namespace Kernel continue; } - event_span[count++] = { - .events = events, - .data = listen.data, - }; + for (int fd = 0; fd < OPEN_MAX && count < event_span.size(); fd++) + { + if (!listen.has_fd(fd)) + continue; + auto& listen_event = listen.events[fd]; - if (listen.events & EPOLLONESHOT) - listen.events = 0; + const auto new_events = listen_event.events & events; + if (new_events == 0) + continue; - if (listen.events & EPOLLET) - events &= ~listen_mask; + event_span[count++] = { + .events = new_events, + .data = listen_event.data, + }; + + if (listen_event.events & EPOLLONESHOT) + listen_event.events = 0; + // this doesn't work with multiple of the same inode + if (listen_event.events & EPOLLET) + events &= ~new_events; + } it++; } @@ -132,18 +174,13 @@ namespace Kernel { LockGuard _(m_mutex); - auto listen_it = m_listening_events.find(inode); - if (listen_it == m_listening_events.end()) + if (!m_listening_events.contains(inode)) return; - event &= (listen_it->value.events & (EPOLLIN | EPOLLOUT)) | EPOLLERR | EPOLLHUP; - if (event == 0) - return; - - if (auto ready_it = m_ready_events.find(inode); ready_it != m_ready_events.end()) - ready_it->value |= event; - else - MUST(m_ready_events.insert(inode, event)); + auto ready_it = m_ready_events.find(inode); + if (ready_it == m_ready_events.end()) + ready_it = MUST(m_ready_events.insert(inode, 0)); + ready_it->value |= event; m_thread_blocker.unblock(); } diff --git a/kernel/kernel/Process.cpp b/kernel/kernel/Process.cpp index 7c2f0f9f..e4d37016 100644 --- a/kernel/kernel/Process.cpp +++ b/kernel/kernel/Process.cpp @@ -1583,7 +1583,7 @@ namespace Kernel } auto epoll = TRY(Epoll::create()); - for (int fd = 0; fd < user_arguments->nfds; fd++) + for (int fd = 0; fd < arguments.nfds; fd++) { uint32_t events = 0; if (arguments.readfds && FD_ISSET(fd, arguments.readfds)) @@ -1599,11 +1599,11 @@ namespace Kernel if (inode_or_error.is_error()) continue; - TRY(epoll->ctl(EPOLL_CTL_ADD, inode_or_error.release_value(), { .events = events, .data = { .fd = fd }})); + TRY(epoll->ctl(EPOLL_CTL_ADD, fd, inode_or_error.release_value(), { .events = events, .data = { .fd = fd }})); } BAN::Vector event_buffer; - TRY(event_buffer.resize(user_arguments->nfds)); + TRY(event_buffer.resize(arguments.nfds)); const size_t waited_events = TRY(epoll->wait(event_buffer.span(), waketime_ns)); @@ -1663,7 +1663,7 @@ namespace Kernel event = *user_event; } - TRY(static_cast(epoll_inode.ptr())->ctl(op, inode, event)); + TRY(static_cast(epoll_inode.ptr())->ctl(op, fd, inode, event)); return 0; }