Kernel: Make epoll work with different fds pointing to same inode

This commit is contained in:
Bananymous 2025-05-30 22:06:12 +03:00
parent 857bac4b78
commit 9b875fb930
3 changed files with 111 additions and 37 deletions

View File

@ -1,10 +1,12 @@
#pragma once
#include <BAN/Array.h>
#include <BAN/CircularQueue.h>
#include <BAN/HashMap.h>
#include <BAN/HashSet.h>
#include <kernel/FS/Inode.h>
#include <limits.h>
#include <sys/epoll.h>
namespace Kernel
@ -16,7 +18,7 @@ namespace Kernel
static BAN::ErrorOr<BAN::RefPtr<Epoll>> create();
~Epoll();
BAN::ErrorOr<void> ctl(int op, BAN::RefPtr<Inode> inode, epoll_event event);
BAN::ErrorOr<void> ctl(int op, int fd, BAN::RefPtr<Inode> inode, epoll_event event);
BAN::ErrorOr<size_t> wait(BAN::Span<epoll_event> events, uint64_t waketime_ns);
void notify(BAN::RefPtr<Inode> inode, uint32_t event);
@ -59,10 +61,45 @@ namespace Kernel
}
};
struct ListenEventList
{
BAN::Array<epoll_event, OPEN_MAX> events;
uint32_t bitmap[(OPEN_MAX + 31) / 32] {};
bool has_fd(int fd) const
{
if (fd < 0 || static_cast<size_t>(fd) >= events.size())
return false;
return bitmap[fd / 32] & (1u << (fd % 32));
}
bool empty() const
{
for (auto val : bitmap)
if (val != 0)
return false;
return true;
}
void add_fd(int fd, epoll_event event)
{
ASSERT(!has_fd(fd));
bitmap[fd / 32] |= (1u << (fd % 32));
events[fd] = event;
}
void remove_fd(int fd)
{
ASSERT(has_fd(fd));
bitmap[fd / 32] &= ~(1u << (fd % 32));
events[fd] = {};
}
};
private:
ThreadBlocker m_thread_blocker;
BAN::HashMap<BAN::RefPtr<Inode>, uint32_t, InodeRefPtrHash> m_ready_events;
BAN::HashMap<BAN::RefPtr<Inode>, epoll_event, InodeRefPtrHash> m_listening_events;
BAN::HashMap<BAN::RefPtr<Inode>, uint32_t, InodeRefPtrHash> m_ready_events;
BAN::HashMap<BAN::RefPtr<Inode>, ListenEventList, InodeRefPtrHash> m_listening_events;
};
}

View File

@ -19,7 +19,7 @@ namespace Kernel
inode->del_epoll(this);
}
BAN::ErrorOr<void> Epoll::ctl(int op, BAN::RefPtr<Inode> inode, epoll_event event)
BAN::ErrorOr<void> Epoll::ctl(int op, int fd, BAN::RefPtr<Inode> inode, epoll_event event)
{
LockGuard _(m_mutex);
@ -28,27 +28,51 @@ namespace Kernel
switch (op)
{
case EPOLL_CTL_ADD:
if (it != m_listening_events.end())
{
if (it == m_listening_events.end())
it = TRY(m_listening_events.emplace(inode));
if (it->value.has_fd(fd))
return BAN::Error::from_errno(EEXIST);
TRY(m_listening_events.reserve(m_listening_events.size() + 1));
TRY(m_ready_events.reserve(m_listening_events.size() + 1));
TRY(m_ready_events.reserve(m_listening_events.size()));
TRY(inode->add_epoll(this));
MUST(m_listening_events.insert(inode, event));
MUST(m_ready_events.insert(inode, event.events));
it->value.add_fd(fd, event);
auto ready_it = m_ready_events.find(inode);
if (ready_it == m_ready_events.end())
ready_it = MUST(m_ready_events.insert(inode, 0));
ready_it->value |= event.events;
return {};
}
case EPOLL_CTL_MOD:
{
if (it == m_listening_events.end())
return BAN::Error::from_errno(ENOENT);
MUST(m_ready_events.emplace_or_assign(inode, event.events));
it->value = event;
if (!it->value.has_fd(fd))
return BAN::Error::from_errno(ENOENT);
it->value.events[fd] = event;
auto ready_it = m_ready_events.find(inode);
if (ready_it == m_ready_events.end())
ready_it = MUST(m_ready_events.insert(inode, 0));
ready_it->value |= event.events;
return {};
}
case EPOLL_CTL_DEL:
{
if (it == m_listening_events.end())
return BAN::Error::from_errno(ENOENT);
m_listening_events.remove(it);
m_ready_events.remove(inode);
inode->del_epoll(this);
if (!it->value.has_fd(fd))
return BAN::Error::from_errno(ENOENT);
it->value.remove_fd(fd);
if (it->value.empty())
{
m_listening_events.remove(it);
m_ready_events.remove(inode);
}
return {};
}
}
return BAN::Error::from_errno(EINVAL);
@ -56,6 +80,9 @@ namespace Kernel
BAN::ErrorOr<size_t> Epoll::wait(BAN::Span<epoll_event> event_span, uint64_t waketime_ns)
{
if (event_span.empty())
return BAN::Error::from_errno(EINVAL);
size_t count = 0;
for (;;)
@ -64,13 +91,17 @@ namespace Kernel
{
LockGuard _(m_mutex);
for (auto it = m_ready_events.begin(); it != m_ready_events.end() && count < event_span.size();)
{
auto& [inode, events] = *it;
auto& listen = m_listening_events[inode];
const uint32_t listen_mask = (listen.events & (EPOLLIN | EPOLLOUT)) | EPOLLERR | EPOLLHUP;
uint32_t listen_mask = EPOLLERR | EPOLLHUP;
for (int fd = 0; fd < OPEN_MAX; fd++)
if (listen.has_fd(fd))
listen_mask |= listen.events[fd].events;
events &= listen_mask;
// This prevents a possible deadlock
@ -98,16 +129,27 @@ namespace Kernel
continue;
}
event_span[count++] = {
.events = events,
.data = listen.data,
};
for (int fd = 0; fd < OPEN_MAX && count < event_span.size(); fd++)
{
if (!listen.has_fd(fd))
continue;
auto& listen_event = listen.events[fd];
if (listen.events & EPOLLONESHOT)
listen.events = 0;
const auto new_events = listen_event.events & events;
if (new_events == 0)
continue;
if (listen.events & EPOLLET)
events &= ~listen_mask;
event_span[count++] = {
.events = new_events,
.data = listen_event.data,
};
if (listen_event.events & EPOLLONESHOT)
listen_event.events = 0;
// this doesn't work with multiple of the same inode
if (listen_event.events & EPOLLET)
events &= ~new_events;
}
it++;
}
@ -132,18 +174,13 @@ namespace Kernel
{
LockGuard _(m_mutex);
auto listen_it = m_listening_events.find(inode);
if (listen_it == m_listening_events.end())
if (!m_listening_events.contains(inode))
return;
event &= (listen_it->value.events & (EPOLLIN | EPOLLOUT)) | EPOLLERR | EPOLLHUP;
if (event == 0)
return;
if (auto ready_it = m_ready_events.find(inode); ready_it != m_ready_events.end())
ready_it->value |= event;
else
MUST(m_ready_events.insert(inode, event));
auto ready_it = m_ready_events.find(inode);
if (ready_it == m_ready_events.end())
ready_it = MUST(m_ready_events.insert(inode, 0));
ready_it->value |= event;
m_thread_blocker.unblock();
}

View File

@ -1583,7 +1583,7 @@ namespace Kernel
}
auto epoll = TRY(Epoll::create());
for (int fd = 0; fd < user_arguments->nfds; fd++)
for (int fd = 0; fd < arguments.nfds; fd++)
{
uint32_t events = 0;
if (arguments.readfds && FD_ISSET(fd, arguments.readfds))
@ -1599,11 +1599,11 @@ namespace Kernel
if (inode_or_error.is_error())
continue;
TRY(epoll->ctl(EPOLL_CTL_ADD, inode_or_error.release_value(), { .events = events, .data = { .fd = fd }}));
TRY(epoll->ctl(EPOLL_CTL_ADD, fd, inode_or_error.release_value(), { .events = events, .data = { .fd = fd }}));
}
BAN::Vector<epoll_event> event_buffer;
TRY(event_buffer.resize(user_arguments->nfds));
TRY(event_buffer.resize(arguments.nfds));
const size_t waited_events = TRY(epoll->wait(event_buffer.span(), waketime_ns));
@ -1663,7 +1663,7 @@ namespace Kernel
event = *user_event;
}
TRY(static_cast<Epoll*>(epoll_inode.ptr())->ctl(op, inode, event));
TRY(static_cast<Epoll*>(epoll_inode.ptr())->ctl(op, fd, inode, event));
return 0;
}