Compare commits

...

2 Commits

Author SHA1 Message Date
Bananymous 891b36397a Kernel: Fix TCP sending
- Fix race condition when adding packet to send buffer before other end
  has acknowledged it
- Allow sending multiple packets before receiving ACK for previous ones
2025-06-04 22:13:11 +03:00
Bananymous 448632cf11 Kernel: Add fast path to pselect and ppoll
If events are available right away, don't create epoll object to wait
on. Also fix pselect return value to the number of bits set. Previously
return value could've been greater than that if epoll return EPOLLERR or
EPOLLHUP for events that were not listened to
2025-06-04 17:51:22 +03:00
2 changed files with 90 additions and 30 deletions

View File

@ -717,27 +717,31 @@ namespace Kernel
auto* send_buffer = reinterpret_cast<uint8_t*>(m_send_window.buffer->vaddr()); auto* send_buffer = reinterpret_cast<uint8_t*>(m_send_window.buffer->vaddr());
memmove(send_buffer, send_buffer + acknowledged_bytes, m_send_window.data_size); memmove(send_buffer, send_buffer + acknowledged_bytes, m_send_window.data_size);
} }
else
{ m_send_window.sent_size -= acknowledged_bytes;
m_send_window.last_send_ms = 0;
} epoll_notify(EPOLLOUT);
dprintln_if(DEBUG_TCP, "Target acknowledged {} bytes", acknowledged_bytes); dprintln_if(DEBUG_TCP, "Target acknowledged {} bytes", acknowledged_bytes);
continue; continue;
} }
if (m_send_window.data_size > 0 && current_ms >= m_send_window.last_send_ms + retransmit_timeout_ms) const bool should_retransmit = m_send_window.data_size > 0 && current_ms >= m_send_window.last_send_ms + retransmit_timeout_ms;
if (m_send_window.data_size > m_send_window.sent_size || should_retransmit)
{ {
ASSERT(m_connection_info.has_value()); ASSERT(m_connection_info.has_value());
auto* target_address = reinterpret_cast<const sockaddr*>(&m_connection_info->address); auto* target_address = reinterpret_cast<const sockaddr*>(&m_connection_info->address);
auto target_address_len = m_connection_info->address_len; auto target_address_len = m_connection_info->address_len;
const uint32_t total_send = BAN::Math::min<uint32_t>(m_send_window.data_size, m_send_window.scaled_size()); const uint32_t send_base = should_retransmit ? 0 : m_send_window.sent_size;
const uint32_t total_send = BAN::Math::min<uint32_t>(m_send_window.data_size - send_base, m_send_window.scaled_size());
m_send_window.current_seq = m_send_window.start_seq; m_send_window.current_seq = m_send_window.start_seq;
auto* send_buffer = reinterpret_cast<const uint8_t*>(m_send_window.buffer->vaddr()); auto* send_buffer = reinterpret_cast<const uint8_t*>(m_send_window.buffer->vaddr() + send_base);
for (uint32_t i = 0; i < total_send;) for (uint32_t i = 0; i < total_send;)
{ {
const uint32_t to_send = BAN::Math::min(total_send - i, m_send_window.mss); const uint32_t to_send = BAN::Math::min(total_send - i, m_send_window.mss);
@ -753,10 +757,9 @@ namespace Kernel
dprintln_if(DEBUG_TCP, "Sent {} bytes", to_send); dprintln_if(DEBUG_TCP, "Sent {} bytes", to_send);
m_send_window.sent_size += to_send;
m_send_window.current_seq += to_send; m_send_window.current_seq += to_send;
i += to_send; i += to_send;
epoll_notify(EPOLLOUT);
} }
m_send_window.last_send_ms = current_ms; m_send_window.last_send_ms = current_ms;

View File

@ -1595,6 +1595,40 @@ namespace Kernel
arguments.timeout->tv_nsec; arguments.timeout->tv_nsec;
} }
{
fd_set rfds, wfds, efds;
FD_ZERO(&rfds);
FD_ZERO(&wfds);
FD_ZERO(&efds);
size_t return_value = 0;
for (int fd = 0; fd < arguments.nfds; fd++)
{
auto inode_or_error = m_open_file_descriptors.inode_of(fd);
if (inode_or_error.is_error())
continue;
auto inode = inode_or_error.release_value();
if (arguments.readfds && FD_ISSET(fd, arguments.readfds) && inode->can_read())
{ FD_SET(fd, &rfds); return_value++; }
if (arguments.writefds && FD_ISSET(fd, arguments.writefds) && inode->can_write())
{ FD_SET(fd, &wfds); return_value++; }
if (arguments.errorfds && FD_ISSET(fd, arguments.errorfds) && inode->has_error())
{ FD_SET(fd, &efds); return_value++; }
}
if (return_value || SystemTimer::get().ns_since_boot() >= waketime_ns)
{
if (arguments.readfds)
memcpy(arguments.readfds, &rfds, sizeof(fd_set));
if (arguments.writefds)
memcpy(arguments.writefds, &wfds, sizeof(fd_set));
if (arguments.errorfds)
memcpy(arguments.errorfds, &efds, sizeof(fd_set));
return return_value;
}
}
auto epoll = TRY(Epoll::create()); auto epoll = TRY(Epoll::create());
for (int fd = 0; fd < arguments.nfds; fd++) for (int fd = 0; fd < arguments.nfds; fd++)
{ {
@ -1627,18 +1661,19 @@ namespace Kernel
if (arguments.errorfds) if (arguments.errorfds)
FD_ZERO(arguments.errorfds); FD_ZERO(arguments.errorfds);
size_t return_value = 0;
for (size_t i = 0; i < waited_events; i++) for (size_t i = 0; i < waited_events; i++)
{ {
const int fd = event_buffer[i].data.fd; const int fd = event_buffer[i].data.fd;
if (arguments.readfds && event_buffer[i].events & (EPOLLIN | EPOLLHUP)) if (arguments.readfds && event_buffer[i].events & (EPOLLIN | EPOLLHUP))
FD_SET(fd, arguments.readfds); { FD_SET(fd, arguments.readfds); return_value++; }
if (arguments.writefds && event_buffer[i].events & (EPOLLOUT)) if (arguments.writefds && event_buffer[i].events & (EPOLLOUT))
FD_SET(fd, arguments.writefds); { FD_SET(fd, arguments.writefds); return_value++; }
if (arguments.errorfds && event_buffer[i].events & (EPOLLERR)) if (arguments.errorfds && event_buffer[i].events & (EPOLLERR))
FD_SET(fd, arguments.errorfds); { FD_SET(fd, arguments.errorfds); return_value++; }
} }
return waited_events; return return_value;
} }
BAN::ErrorOr<long> Process::sys_ppoll(pollfd* fds, nfds_t nfds, const timespec* timeout, const sigset_t* sigmask) BAN::ErrorOr<long> Process::sys_ppoll(pollfd* fds, nfds_t nfds, const timespec* timeout, const sigset_t* sigmask)
@ -1666,14 +1701,48 @@ namespace Kernel
timeout->tv_nsec; timeout->tv_nsec;
} }
uint32_t events_per_fd[OPEN_MAX] {}; size_t return_value = 0;
for (nfds_t i = 0; i < nfds; i++) for (nfds_t i = 0; i < nfds; i++)
{ {
if (fds[i].fd < 0 || fds[i].fd >= OPEN_MAX) fds[i].revents = 0;
if (fds[i].fd < 0)
continue; continue;
events_per_fd[fds[i].fd] |= fds[i].events; auto inode_or_error = m_open_file_descriptors.inode_of(fds[i].fd);
if (inode_or_error.is_error())
{
fds[i].revents |= POLLNVAL;
return_value++;
continue;
}
auto inode = inode_or_error.release_value();
if (inode->has_hungup())
fds[i].revents |= POLLHUP;
if (inode->has_error())
fds[i].revents |= POLLERR;
if ((fds[i].events & (POLLIN | POLLRDNORM)) && inode->can_read())
fds[i].revents |= fds[i].events & (POLLIN | POLLRDNORM);
if ((fds[i].events & (POLLOUT | POLLWRNORM)) && inode->can_write())
fds[i].revents |= fds[i].events & (POLLOUT | POLLWRNORM);
// POLLPRI
// POLLRDBAND
// POLLWRBAND
if (fds[i].revents)
return_value++;
} }
if (return_value || SystemTimer::get().ns_since_boot() >= waketime_ns)
return return_value;
uint32_t events_per_fd[OPEN_MAX] {};
for (nfds_t i = 0; i < nfds; i++)
if (fds[i].fd >= 0 && fds[i].fd < OPEN_MAX)
events_per_fd[fds[i].fd] |= fds[i].events;
size_t fd_count = 0; size_t fd_count = 0;
auto epoll = TRY(Epoll::create()); auto epoll = TRY(Epoll::create());
@ -1682,9 +1751,7 @@ namespace Kernel
if (events_per_fd[fd] == 0) if (events_per_fd[fd] == 0)
continue; continue;
auto inode_or_error = m_open_file_descriptors.inode_of(fd); auto inode = TRY(m_open_file_descriptors.inode_of(fd));
if (inode_or_error.is_error())
continue;
uint32_t events = 0; uint32_t events = 0;
if (events_per_fd[fd] & (POLLIN | POLLRDNORM)) if (events_per_fd[fd] & (POLLIN | POLLRDNORM))
@ -1696,7 +1763,7 @@ namespace Kernel
// POLLRDBAND // POLLRDBAND
// POLLWRBAND // POLLWRBAND
TRY(epoll->ctl(EPOLL_CTL_ADD, fd, inode_or_error.release_value(), { .events = events, .data = { .fd = fd }})); TRY(epoll->ctl(EPOLL_CTL_ADD, fd, inode, { .events = events, .data = { .fd = fd }}));
fd_count++; fd_count++;
} }
@ -1706,21 +1773,11 @@ namespace Kernel
const size_t waited_events = TRY(epoll->wait(event_buffer.span(), waketime_ns)); const size_t waited_events = TRY(epoll->wait(event_buffer.span(), waketime_ns));
size_t return_value = 0;
for (size_t i = 0; i < nfds; i++) for (size_t i = 0; i < nfds; i++)
{ {
fds[i].revents = 0;
if (fds[i].fd < 0) if (fds[i].fd < 0)
continue; continue;
if (m_open_file_descriptors.inode_of(fds[i].fd).is_error())
{
fds[i].revents = POLLNVAL;
return_value++;
continue;
}
for (size_t j = 0; j < waited_events; j++) for (size_t j = 0; j < waited_events; j++)
{ {
if (fds[i].fd != event_buffer[j].data.fd) if (fds[i].fd != event_buffer[j].data.fd)