Kernel: Rewrite epoll notifying system
This removes the need to lock epoll's mutex when notifying epoll. This prevents a ton of deadlocks when epoll is notified from an interrupt handler or otherwise with interrupts disabled
This commit is contained in:
parent
e9f8471a28
commit
9883fb7bf6
|
@ -98,7 +98,9 @@ namespace Kernel
|
||||||
|
|
||||||
private:
|
private:
|
||||||
ThreadBlocker m_thread_blocker;
|
ThreadBlocker m_thread_blocker;
|
||||||
|
SpinLock m_ready_lock;
|
||||||
BAN::HashMap<BAN::RefPtr<Inode>, uint32_t, InodeRefPtrHash> m_ready_events;
|
BAN::HashMap<BAN::RefPtr<Inode>, uint32_t, InodeRefPtrHash> m_ready_events;
|
||||||
|
BAN::HashMap<BAN::RefPtr<Inode>, uint32_t, InodeRefPtrHash> m_processing_events;
|
||||||
BAN::HashMap<BAN::RefPtr<Inode>, ListenEventList, InodeRefPtrHash> m_listening_events;
|
BAN::HashMap<BAN::RefPtr<Inode>, ListenEventList, InodeRefPtrHash> m_listening_events;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -29,18 +29,26 @@ namespace Kernel
|
||||||
{
|
{
|
||||||
case EPOLL_CTL_ADD:
|
case EPOLL_CTL_ADD:
|
||||||
{
|
{
|
||||||
if (it == m_listening_events.end())
|
bool contains_inode = (it != m_listening_events.end());
|
||||||
|
if (!contains_inode)
|
||||||
it = TRY(m_listening_events.emplace(inode));
|
it = TRY(m_listening_events.emplace(inode));
|
||||||
if (it->value.has_fd(fd))
|
if (it->value.has_fd(fd))
|
||||||
return BAN::Error::from_errno(EEXIST);
|
return BAN::Error::from_errno(EEXIST);
|
||||||
|
|
||||||
|
{
|
||||||
|
SpinLockGuard _(m_ready_lock);
|
||||||
TRY(m_ready_events.reserve(m_listening_events.size()));
|
TRY(m_ready_events.reserve(m_listening_events.size()));
|
||||||
|
}
|
||||||
|
TRY(m_processing_events.reserve(m_listening_events.size()));
|
||||||
|
|
||||||
|
if (!contains_inode)
|
||||||
TRY(inode->add_epoll(this));
|
TRY(inode->add_epoll(this));
|
||||||
it->value.add_fd(fd, event);
|
it->value.add_fd(fd, event);
|
||||||
|
|
||||||
auto ready_it = m_ready_events.find(inode);
|
auto processing_it = m_processing_events.find(inode);
|
||||||
if (ready_it == m_ready_events.end())
|
if (processing_it == m_processing_events.end())
|
||||||
ready_it = MUST(m_ready_events.insert(inode, 0));
|
processing_it = MUST(m_processing_events.insert(inode, 0));
|
||||||
ready_it->value |= event.events;
|
processing_it->value |= event.events;
|
||||||
|
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
@ -50,12 +58,13 @@ namespace Kernel
|
||||||
return BAN::Error::from_errno(ENOENT);
|
return BAN::Error::from_errno(ENOENT);
|
||||||
if (!it->value.has_fd(fd))
|
if (!it->value.has_fd(fd))
|
||||||
return BAN::Error::from_errno(ENOENT);
|
return BAN::Error::from_errno(ENOENT);
|
||||||
|
|
||||||
it->value.events[fd] = event;
|
it->value.events[fd] = event;
|
||||||
|
|
||||||
auto ready_it = m_ready_events.find(inode);
|
auto processing_it = m_processing_events.find(inode);
|
||||||
if (ready_it == m_ready_events.end())
|
if (processing_it == m_processing_events.end())
|
||||||
ready_it = MUST(m_ready_events.insert(inode, 0));
|
processing_it = MUST(m_processing_events.insert(inode, 0));
|
||||||
ready_it->value |= event.events;
|
processing_it->value |= event.events;
|
||||||
|
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
@ -68,7 +77,10 @@ namespace Kernel
|
||||||
it->value.remove_fd(fd);
|
it->value.remove_fd(fd);
|
||||||
if (it->value.empty())
|
if (it->value.empty())
|
||||||
{
|
{
|
||||||
|
inode->del_epoll(this);
|
||||||
m_listening_events.remove(it);
|
m_listening_events.remove(it);
|
||||||
|
m_processing_events.remove(inode);
|
||||||
|
SpinLockGuard _(m_ready_lock);
|
||||||
m_ready_events.remove(inode);
|
m_ready_events.remove(inode);
|
||||||
}
|
}
|
||||||
return {};
|
return {};
|
||||||
|
@ -83,34 +95,60 @@ namespace Kernel
|
||||||
if (event_span.empty())
|
if (event_span.empty())
|
||||||
return BAN::Error::from_errno(EINVAL);
|
return BAN::Error::from_errno(EINVAL);
|
||||||
|
|
||||||
size_t count = 0;
|
size_t event_count = 0;
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
bool failed_lock = false;
|
|
||||||
|
|
||||||
{
|
{
|
||||||
LockGuard _(m_mutex);
|
LockGuard _(m_mutex);
|
||||||
|
|
||||||
for (auto it = m_ready_events.begin(); it != m_ready_events.end() && count < event_span.size();)
|
{
|
||||||
|
SpinLockGuard _(m_ready_lock);
|
||||||
|
|
||||||
|
while (!m_ready_events.empty())
|
||||||
|
{
|
||||||
|
auto [inode, events] = *m_ready_events.begin();
|
||||||
|
m_ready_events.remove(m_ready_events.begin());
|
||||||
|
|
||||||
|
ASSERT(events);
|
||||||
|
|
||||||
|
if (auto it = m_processing_events.find(inode); it != m_processing_events.end())
|
||||||
|
it->value |= events;
|
||||||
|
else
|
||||||
|
MUST(m_processing_events.insert(inode, events));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto it = m_processing_events.begin(); it != m_processing_events.end() && event_count < event_span.size();)
|
||||||
{
|
{
|
||||||
auto& [inode, events] = *it;
|
auto& [inode, events] = *it;
|
||||||
|
|
||||||
auto& listen = m_listening_events[inode];
|
#define REMOVE_IT_AND_CONTINUE() \
|
||||||
|
({ \
|
||||||
|
m_processing_events.remove(it); \
|
||||||
|
it = m_processing_events.begin(); \
|
||||||
|
continue; \
|
||||||
|
})
|
||||||
|
|
||||||
|
auto listen_it = m_listening_events.find(inode);
|
||||||
|
if (listen_it == m_listening_events.end())
|
||||||
|
REMOVE_IT_AND_CONTINUE();
|
||||||
|
auto& listen = listen_it->value;
|
||||||
|
|
||||||
|
{
|
||||||
uint32_t listen_mask = EPOLLERR | EPOLLHUP;
|
uint32_t listen_mask = EPOLLERR | EPOLLHUP;
|
||||||
for (int fd = 0; fd < OPEN_MAX; fd++)
|
for (size_t fd = 0; fd < listen.events.size(); fd++)
|
||||||
if (listen.has_fd(fd))
|
if (listen.has_fd(fd))
|
||||||
listen_mask |= listen.events[fd].events;
|
listen_mask |= listen.events[fd].events;
|
||||||
events &= listen_mask;
|
events &= listen_mask;
|
||||||
|
|
||||||
// This prevents a possible deadlock
|
|
||||||
if (!inode->m_mutex.try_lock())
|
|
||||||
{
|
|
||||||
failed_lock = true;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (events == 0)
|
||||||
|
REMOVE_IT_AND_CONTINUE();
|
||||||
|
|
||||||
|
{
|
||||||
|
LockGuard inode_locker(inode->m_mutex);
|
||||||
|
|
||||||
#define CHECK_EVENT_BIT(mask, func) \
|
#define CHECK_EVENT_BIT(mask, func) \
|
||||||
if ((events & mask) && !inode->func()) \
|
if ((events & mask) && !inode->func()) \
|
||||||
events &= ~mask;
|
events &= ~mask;
|
||||||
|
@ -119,17 +157,14 @@ namespace Kernel
|
||||||
CHECK_EVENT_BIT(EPOLLERR, has_error);
|
CHECK_EVENT_BIT(EPOLLERR, has_error);
|
||||||
CHECK_EVENT_BIT(EPOLLHUP, has_hungup);
|
CHECK_EVENT_BIT(EPOLLHUP, has_hungup);
|
||||||
#undef CHECK_EVENT_BIT
|
#undef CHECK_EVENT_BIT
|
||||||
|
|
||||||
inode->m_mutex.unlock();
|
|
||||||
|
|
||||||
if (events == 0)
|
|
||||||
{
|
|
||||||
m_ready_events.remove(it);
|
|
||||||
it = m_ready_events.begin();
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int fd = 0; fd < OPEN_MAX && count < event_span.size(); fd++)
|
if (events == 0)
|
||||||
|
REMOVE_IT_AND_CONTINUE();
|
||||||
|
|
||||||
|
#undef REMOVE_IT_AND_CONTINUE
|
||||||
|
|
||||||
|
for (size_t fd = 0; fd < listen.events.size() && event_count < event_span.size(); fd++)
|
||||||
{
|
{
|
||||||
if (!listen.has_fd(fd))
|
if (!listen.has_fd(fd))
|
||||||
continue;
|
continue;
|
||||||
|
@ -139,7 +174,7 @@ namespace Kernel
|
||||||
if (new_events == 0)
|
if (new_events == 0)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
event_span[count++] = {
|
event_span[event_count++] = {
|
||||||
.events = new_events,
|
.events = new_events,
|
||||||
.data = listen_event.data,
|
.data = listen_event.data,
|
||||||
};
|
};
|
||||||
|
@ -155,32 +190,29 @@ namespace Kernel
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (count)
|
if (event_count > 0)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
const uint64_t current_ns = SystemTimer::get().ns_since_boot();
|
const uint64_t current_ns = SystemTimer::get().ns_since_boot();
|
||||||
if (current_ns >= waketime_ns)
|
if (current_ns >= waketime_ns)
|
||||||
break;
|
break;
|
||||||
if (failed_lock)
|
|
||||||
continue;
|
|
||||||
const uint64_t timeout_ns = BAN::Math::min<uint64_t>(100'000'000, waketime_ns - current_ns);
|
const uint64_t timeout_ns = BAN::Math::min<uint64_t>(100'000'000, waketime_ns - current_ns);
|
||||||
TRY(Thread::current().block_or_eintr_or_timeout_ns(m_thread_blocker, timeout_ns, false));
|
TRY(Thread::current().block_or_eintr_or_timeout_ns(m_thread_blocker, timeout_ns, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
return count;
|
return event_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Epoll::notify(BAN::RefPtr<Inode> inode, uint32_t event)
|
void Epoll::notify(BAN::RefPtr<Inode> inode, uint32_t event)
|
||||||
{
|
{
|
||||||
LockGuard _(m_mutex);
|
ASSERT(event);
|
||||||
|
|
||||||
if (!m_listening_events.contains(inode))
|
SpinLockGuard _(m_ready_lock);
|
||||||
return;
|
|
||||||
|
|
||||||
auto ready_it = m_ready_events.find(inode);
|
if (auto it = m_ready_events.find(inode); it != m_ready_events.end())
|
||||||
if (ready_it == m_ready_events.end())
|
it->value |= event;
|
||||||
ready_it = MUST(m_ready_events.insert(inode, 0));
|
else
|
||||||
ready_it->value |= event;
|
MUST(m_ready_events.insert(inode, event));
|
||||||
|
|
||||||
m_thread_blocker.unblock();
|
m_thread_blocker.unblock();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue