Kernel: Rewrite epoll notifying system

This removes the need to lock epoll's mutex when notifying epoll. This
prevents a ton of deadlocks when epoll is notified from an interrupt
handler or otherwise with interrupts disabled.
Bananymous 2025-06-02 10:50:48 +03:00
parent e9f8471a28
commit 9883fb7bf6
2 changed files with 86 additions and 52 deletions
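Why the old scheme could deadlock: notify() used to take the same mutex that ctl() and wait() hold, so an interrupt handler (or any code running with interrupts disabled) notifying an epoll whose mutex was already held could never make progress. After this change notify() only takes the new m_ready_lock spin lock, records the pending bits in m_ready_events, and unblocks the waiter; all allocation, inode locking and event filtering stays on the wait()/ctl() side. Below is a self-contained sketch of that pattern using standard C++ primitives in place of the kernel's SpinLock, ThreadBlocker and BAN::HashMap; the MiniEpoll name and everything in it are illustrative only, and the epoll mutex and per-inode checks are omitted.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <semaphore>
#include <thread>
#include <unordered_map>

// Toy model of the rewritten scheme. notify() never sleeps: it only spins on
// ready_lock and signals the waiter, so it is safe to call from a context that
// must not block (the kernel additionally pre-reserves its map so the notify
// path never allocates either).
struct MiniEpoll
{
	std::atomic_flag ready_lock;                         // stand-in for m_ready_lock
	std::unordered_map<int, uint32_t> ready_events;      // "inode" id -> pending event bits
	std::unordered_map<int, uint32_t> processing_events;
	std::counting_semaphore<256> blocker{ 0 };           // stand-in for m_thread_blocker

	void notify(int inode, uint32_t events)
	{
		while (ready_lock.test_and_set(std::memory_order_acquire)) {}
		ready_events[inode] |= events;
		ready_lock.clear(std::memory_order_release);
		blocker.release();                               // wake the waiter, never blocks
	}

	// Mirrors the shape of Epoll::wait(): drain ready -> processing under the
	// spin lock, do the slow per-inode work outside of it, sleep only if
	// nothing was found.
	void wait_once()
	{
		for (;;)
		{
			while (ready_lock.test_and_set(std::memory_order_acquire)) {}
			for (auto& [inode, events] : ready_events)
				processing_events[inode] |= events;
			ready_events.clear();
			ready_lock.clear(std::memory_order_release);

			if (!processing_events.empty())
			{
				for (auto& [inode, events] : processing_events)
					std::printf("inode %d ready, events 0x%x\n", inode, (unsigned)events);
				processing_events.clear();
				return;
			}
			blocker.acquire();                           // a notify between the drain and here is not lost
		}
	}
};

int main()
{
	MiniEpoll epoll;
	std::thread waiter([&] { epoll.wait_once(); });
	epoll.notify(1, 0x1 /* EPOLLIN-like bit */);         // e.g. from an IRQ handler
	waiter.join();
}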

View File

@@ -98,7 +98,9 @@ namespace Kernel
 	private:
 		ThreadBlocker m_thread_blocker;
+		SpinLock m_ready_lock;
 		BAN::HashMap<BAN::RefPtr<Inode>, uint32_t, InodeRefPtrHash> m_ready_events;
+		BAN::HashMap<BAN::RefPtr<Inode>, uint32_t, InodeRefPtrHash> m_processing_events;
 		BAN::HashMap<BAN::RefPtr<Inode>, ListenEventList, InodeRefPtrHash> m_listening_events;
 	};
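In short, the header now splits epoll's bookkeeping across two locks: m_ready_events is the only map notify() touches and is guarded solely by the new m_ready_lock, while m_processing_events and m_listening_events stay behind the existing epoll mutex and are only used by ctl() and wait(). A compact restatement of that ownership split follows, with the member names from the diff but the container and lock types swapped for standard stand-ins; this is illustrative only, not the kernel's declarations.

#include <atomic>
#include <cstdint>
#include <unordered_map>

struct Inode;           // opaque here; the kernel keys these maps by BAN::RefPtr<Inode>
struct ListenEventList; // per-inode (fd -> epoll_event) table, as in the diff

struct EpollStateSketch
{
	// Written by notify(), possibly from an interrupt handler; protected only by
	// this spin lock, so notify() never has to take the epoll mutex.
	std::atomic_flag m_ready_lock;
	std::unordered_map<Inode*, uint32_t> m_ready_events;

	// Owned by ctl()/wait() and protected by the epoll mutex (not shown here);
	// wait() drains m_ready_events into m_processing_events and works from there.
	std::unordered_map<Inode*, uint32_t> m_processing_events;
	std::unordered_map<Inode*, ListenEventList*> m_listening_events; // held by value in the kernel
};

int main()
{
	EpollStateSketch state;
	(void)state;
}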

View File

@@ -29,18 +29,26 @@ namespace Kernel
 		{
 			case EPOLL_CTL_ADD:
 			{
-				if (it == m_listening_events.end())
+				bool contains_inode = (it != m_listening_events.end());
+				if (!contains_inode)
 					it = TRY(m_listening_events.emplace(inode));
 				if (it->value.has_fd(fd))
 					return BAN::Error::from_errno(EEXIST);
-				TRY(m_ready_events.reserve(m_listening_events.size()));
-				TRY(inode->add_epoll(this));
+				{
+					SpinLockGuard _(m_ready_lock);
+					TRY(m_ready_events.reserve(m_listening_events.size()));
+				}
+				TRY(m_processing_events.reserve(m_listening_events.size()));
+				if (!contains_inode)
+					TRY(inode->add_epoll(this));
 				it->value.add_fd(fd, event);
-				auto ready_it = m_ready_events.find(inode);
-				if (ready_it == m_ready_events.end())
-					ready_it = MUST(m_ready_events.insert(inode, 0));
-				ready_it->value |= event.events;
+				auto processing_it = m_processing_events.find(inode);
+				if (processing_it == m_processing_events.end())
+					processing_it = MUST(m_processing_events.insert(inode, 0));
+				processing_it->value |= event.events;
 				return {};
 			}
@@ -50,12 +58,13 @@ namespace Kernel
 					return BAN::Error::from_errno(ENOENT);
 				if (!it->value.has_fd(fd))
 					return BAN::Error::from_errno(ENOENT);
 				it->value.events[fd] = event;
-				auto ready_it = m_ready_events.find(inode);
-				if (ready_it == m_ready_events.end())
-					ready_it = MUST(m_ready_events.insert(inode, 0));
-				ready_it->value |= event.events;
+				auto processing_it = m_processing_events.find(inode);
+				if (processing_it == m_processing_events.end())
+					processing_it = MUST(m_processing_events.insert(inode, 0));
+				processing_it->value |= event.events;
 				return {};
 			}
@@ -68,7 +77,10 @@ namespace Kernel
 				it->value.remove_fd(fd);
 				if (it->value.empty())
 				{
+					inode->del_epoll(this);
 					m_listening_events.remove(it);
+					m_processing_events.remove(inode);
+					SpinLockGuard _(m_ready_lock);
 					m_ready_events.remove(inode);
 				}
 				return {};
@@ -83,53 +95,76 @@ namespace Kernel
 		if (event_span.empty())
 			return BAN::Error::from_errno(EINVAL);
-		size_t count = 0;
+		size_t event_count = 0;
 		for (;;)
 		{
-			bool failed_lock = false;
 			{
 				LockGuard _(m_mutex);
-				for (auto it = m_ready_events.begin(); it != m_ready_events.end() && count < event_span.size();)
+				{
+					SpinLockGuard _(m_ready_lock);
+					while (!m_ready_events.empty())
+					{
+						auto [inode, events] = *m_ready_events.begin();
+						m_ready_events.remove(m_ready_events.begin());
+						ASSERT(events);
+						if (auto it = m_processing_events.find(inode); it != m_processing_events.end())
+							it->value |= events;
+						else
+							MUST(m_processing_events.insert(inode, events));
+					}
+				}
+				for (auto it = m_processing_events.begin(); it != m_processing_events.end() && event_count < event_span.size();)
 				{
 					auto& [inode, events] = *it;
-					auto& listen = m_listening_events[inode];
-					uint32_t listen_mask = EPOLLERR | EPOLLHUP;
-					for (int fd = 0; fd < OPEN_MAX; fd++)
-						if (listen.has_fd(fd))
-							listen_mask |= listen.events[fd].events;
-					events &= listen_mask;
-					// This prevents a possible deadlock
-					if (!inode->m_mutex.try_lock())
+#define REMOVE_IT_AND_CONTINUE() \
+					({ \
+						m_processing_events.remove(it); \
+						it = m_processing_events.begin(); \
+						continue; \
+					})
+					auto listen_it = m_listening_events.find(inode);
+					if (listen_it == m_listening_events.end())
+						REMOVE_IT_AND_CONTINUE();
+					auto& listen = listen_it->value;
 					{
-						failed_lock = true;
-						continue;
+						uint32_t listen_mask = EPOLLERR | EPOLLHUP;
+						for (size_t fd = 0; fd < listen.events.size(); fd++)
+							if (listen.has_fd(fd))
+								listen_mask |= listen.events[fd].events;
+						events &= listen_mask;
 					}
-#define CHECK_EVENT_BIT(mask, func) \
-					if ((events & mask) && !inode->func()) \
-						events &= ~mask;
-					CHECK_EVENT_BIT(EPOLLIN, can_read);
-					CHECK_EVENT_BIT(EPOLLOUT, can_write);
-					CHECK_EVENT_BIT(EPOLLERR, has_error);
-					CHECK_EVENT_BIT(EPOLLHUP, has_hungup);
-#undef CHECK_EVENT_BIT
-					inode->m_mutex.unlock();
 					if (events == 0)
+						REMOVE_IT_AND_CONTINUE();
 					{
-						m_ready_events.remove(it);
-						it = m_ready_events.begin();
-						continue;
+						LockGuard inode_locker(inode->m_mutex);
+#define CHECK_EVENT_BIT(mask, func) \
+						if ((events & mask) && !inode->func()) \
+							events &= ~mask;
+						CHECK_EVENT_BIT(EPOLLIN, can_read);
+						CHECK_EVENT_BIT(EPOLLOUT, can_write);
+						CHECK_EVENT_BIT(EPOLLERR, has_error);
+						CHECK_EVENT_BIT(EPOLLHUP, has_hungup);
+#undef CHECK_EVENT_BIT
 					}
-					for (int fd = 0; fd < OPEN_MAX && count < event_span.size(); fd++)
+					if (events == 0)
+						REMOVE_IT_AND_CONTINUE();
+#undef REMOVE_IT_AND_CONTINUE
+					for (size_t fd = 0; fd < listen.events.size() && event_count < event_span.size(); fd++)
 					{
 						if (!listen.has_fd(fd))
 							continue;
@@ -139,7 +174,7 @@ namespace Kernel
 						if (new_events == 0)
 							continue;
-						event_span[count++] = {
+						event_span[event_count++] = {
 							.events = new_events,
 							.data = listen_event.data,
 						};
@@ -155,32 +190,29 @@ namespace Kernel
 				}
 			}
-			if (count)
+			if (event_count > 0)
 				break;
 			const uint64_t current_ns = SystemTimer::get().ns_since_boot();
 			if (current_ns >= waketime_ns)
 				break;
-			if (failed_lock)
-				continue;
 			const uint64_t timeout_ns = BAN::Math::min<uint64_t>(100'000'000, waketime_ns - current_ns);
 			TRY(Thread::current().block_or_eintr_or_timeout_ns(m_thread_blocker, timeout_ns, false));
 		}
-		return count;
+		return event_count;
 	}

 	void Epoll::notify(BAN::RefPtr<Inode> inode, uint32_t event)
 	{
-		LockGuard _(m_mutex);
-		if (!m_listening_events.contains(inode))
-			return;
-		auto ready_it = m_ready_events.find(inode);
-		if (ready_it == m_ready_events.end())
-			ready_it = MUST(m_ready_events.insert(inode, 0));
-		ready_it->value |= event;
+		ASSERT(event);
+		SpinLockGuard _(m_ready_lock);
+		if (auto it = m_ready_events.find(inode); it != m_ready_events.end())
+			it->value |= event;
+		else
+			MUST(m_ready_events.insert(inode, event));
 		m_thread_blocker.unblock();
 	}
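For context, the paths touched above map onto the usual epoll userspace sequence: EPOLL_CTL_ADD lands in Epoll::ctl(), the blocking call sits in Epoll::wait(), and the wakeup arrives through Epoll::notify() when a driver reports new data. Assuming banan-os exposes the Linux-style epoll_create1 / epoll_ctl / epoll_wait interface (an assumption on the userspace API, not something this diff shows), a minimal client exercising those paths looks like this:

#include <cstdio>
#include <sys/epoll.h>
#include <unistd.h>

int main()
{
	// epoll_create1() creates the Epoll object; EPOLL_CTL_ADD goes through
	// Epoll::ctl() and registers stdin's inode in m_listening_events.
	int epfd = epoll_create1(0);
	if (epfd < 0)
	{
		perror("epoll_create1");
		return 1;
	}

	epoll_event ev{};
	ev.events = EPOLLIN;
	ev.data.fd = STDIN_FILENO;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) < 0)
	{
		perror("epoll_ctl");
		return 1;
	}

	// Blocks in Epoll::wait() until the inode behind stdin reports readiness,
	// which reaches this epoll through Epoll::notify() with EPOLLIN set.
	epoll_event out[8];
	int n = epoll_wait(epfd, out, 8, 5000 /* ms timeout */);
	std::printf("epoll_wait returned %d\n", n);

	close(epfd);
	return 0;
}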