Kernel: Add syscall-less clock_gettime

If the processor has invariant TSC it can be used to measure time. We
keep track of the last nanosecond and TSC values and offset them based
on the current TSC. This allows getting current time in userspace.

The implementation maps a single RO page to every process's address
space. The page contains the TSC info which gets updated every 100 ms.
If the processor does not have invariant TSC, this page will not
indicate the capability for TSC based timing.

There was the problem of how a thread knows which CPU it is running
on without doing a syscall. TSC counters may or may not be
synchronized between cores, so we need a separate TSC info for each
processor. I ended up adding a sequence of bytes 0..255 at the start of
the shared page. When a scheduler gets a new thread, it updates the
thread's gs/fs segment to point to the byte corresponding to the current
cpu.

This TSC based timing is also used in the kernel. With 64 bit HPET this
probably does not bring much of a benefit, but on PIT or 32 bit HPET
this removes the need to acquire a spinlock to get the current time.

This change does force userspace to not use gs/fs itself, as
they are both now reserved. One is used for TLS (this can be
technically used if user does not call libc code) and the other for
the current processor index (cannot be used as kernel unconditionally
resets it after each load balance).

I was looking at how many times timer's current time was polled
(userspace and kernel combined). When idling in window manager, it was
around 8k times/s. When running doom it peaked at over 1 million times
per second when loading and settled at ~30k times/s.
This commit is contained in:
2026-01-08 13:30:04 +02:00
parent ee57cf3e9a
commit 9eb3834ae5
20 changed files with 448 additions and 15 deletions

View File

@@ -75,6 +75,16 @@ namespace CPUID
return buffer[3] & (1 << 26);
}
// Reports whether CPUID advertises an invariant TSC.
// Leaf 0x80000000 is queried first and its first output word is compared
// against 0x80000007; when that leaf is not available the feature is
// reported as missing.
bool has_invariant_tsc()
{
    constexpr uint32_t extended_base_leaf = 0x80000000;
    constexpr uint32_t invariant_tsc_leaf = 0x80000007;

    uint32_t regs[4] {};

    get_cpuid(extended_base_leaf, regs);
    const bool leaf_available = regs[0] >= invariant_tsc_leaf;
    if (leaf_available)
    {
        get_cpuid(invariant_tsc_leaf, regs);
        // bit 8 of the fourth output word flags the invariant TSC
        return regs[3] & (1 << 8);
    }

    return false;
}
const char* feature_string_ecx(uint32_t feat)
{
switch (feat)

View File

@@ -152,6 +152,20 @@ namespace Kernel
}));
}
process->m_shared_page_vaddr = process->page_table().reserve_free_page(process->m_mapped_regions.back()->vaddr(), USERSPACE_END);
if (process->m_shared_page_vaddr == 0)
return BAN::Error::from_errno(ENOMEM);
process->page_table().map_page_at(
Processor::shared_page_paddr(),
process->m_shared_page_vaddr,
PageTable::UserSupervisor | PageTable::Present
);
TRY(auxiliary_vector.push_back({
.a_type = LibELF::AT_SHARED_PAGE,
.a_un = { .a_ptr = reinterpret_cast<void*>(process->m_shared_page_vaddr) },
}));
TRY(auxiliary_vector.push_back({
.a_type = LibELF::AT_NULL,
.a_un = { .a_val = 0 },
@@ -683,6 +697,13 @@ namespace Kernel
for (auto& mapped_region : m_mapped_regions)
MUST(mapped_regions.push_back(TRY(mapped_region->clone(*page_table))));
const vaddr_t shared_page_vaddr = m_shared_page_vaddr;
page_table->map_page_at(
Processor::shared_page_paddr(),
shared_page_vaddr,
PageTable::UserSupervisor | PageTable::Present
);
Process* forked = create_process(m_credentials, m_pid, m_sid, m_pgrp);
forked->m_controlling_terminal = m_controlling_terminal;
forked->m_working_directory = BAN::move(working_directory);
@@ -691,6 +712,7 @@ namespace Kernel
forked->m_environ = BAN::move(environ);
forked->m_executable = BAN::move(executable);
forked->m_page_table = BAN::move(page_table);
forked->m_shared_page_vaddr = BAN::move(shared_page_vaddr);
forked->m_open_file_descriptors = BAN::move(*open_file_descriptors);
forked->m_mapped_regions = BAN::move(mapped_regions);
forked->m_has_called_exec = false;
@@ -766,6 +788,20 @@ namespace Kernel
}));
}
const vaddr_t shared_page_vaddr = new_page_table->reserve_free_page(new_mapped_regions.back()->vaddr(), USERSPACE_END);
if (shared_page_vaddr == 0)
return BAN::Error::from_errno(ENOMEM);
new_page_table->map_page_at(
Processor::shared_page_paddr(),
shared_page_vaddr,
PageTable::UserSupervisor | PageTable::Present
);
TRY(auxiliary_vector.push_back({
.a_type = LibELF::AT_SHARED_PAGE,
.a_un = { .a_ptr = reinterpret_cast<void*>(shared_page_vaddr) },
}));
TRY(auxiliary_vector.push_back({
.a_type = LibELF::AT_NULL,
.a_un = { .a_val = 0 },
@@ -837,6 +873,9 @@ namespace Kernel
m_mapped_regions = BAN::move(new_mapped_regions);
m_page_table = BAN::move(new_page_table);
m_shared_page_vaddr = shared_page_vaddr;
m_threads.front()->update_processor_index_address();
execfd_guard.disable();
m_cmdline = BAN::move(str_argv);

View File

@@ -15,10 +15,12 @@ namespace Kernel
static constexpr uint32_t MSR_IA32_KERNEL_GS_BASE = 0xC0000102;
#endif
ProcessorID Processor::s_bsp_id { PROCESSOR_NONE };
BAN::Atomic<uint8_t> Processor::s_processor_count { 0 };
BAN::Atomic<bool> Processor::s_is_smp_enabled { false };
BAN::Atomic<bool> Processor::s_should_print_cpu_load { false };
ProcessorID Processor::s_bsp_id { PROCESSOR_NONE };
BAN::Atomic<uint8_t> Processor::s_processor_count { 0 };
BAN::Atomic<bool> Processor::s_is_smp_enabled { false };
BAN::Atomic<bool> Processor::s_should_print_cpu_load { false };
paddr_t Processor::s_shared_page_paddr { 0 };
vaddr_t Processor::s_shared_page_vaddr { 0 };
static BAN::Atomic<uint8_t> s_processors_created { 0 };
@@ -128,6 +130,33 @@ namespace Kernel
processor.m_smp_free = smp_storage;
}
// Allocates one physical page, maps it read-write into the kernel page table
// and writes the initial contents of the shared page into it.
void Processor::initialize_shared_page()
{
    // number of per-cpu entries that fit in the page next to the SharedPage header
    [[maybe_unused]] constexpr size_t max_processors = (PAGE_SIZE - sizeof(API::SharedPage)) / sizeof(decltype(*API::SharedPage::cpus));
    ASSERT(s_processors_created < max_processors);

    s_shared_page_paddr = Heap::get().take_free_page();
    ASSERT(s_shared_page_paddr);

    s_shared_page_vaddr = PageTable::kernel().reserve_free_page(KERNEL_OFFSET);
    ASSERT(s_shared_page_vaddr);

    // the kernel keeps a writable mapping of the page
    const auto kernel_mapping_flags = PageTable::ReadWrite | PageTable::Present;
    PageTable::kernel().map_page_at(s_shared_page_paddr, s_shared_page_vaddr, kernel_mapping_flags);

    // start from an all-zero page
    void* const page_start = reinterpret_cast<void*>(s_shared_page_vaddr);
    memset(page_start, 0, PAGE_SIZE);

    auto& shared_page = *static_cast<volatile API::SharedPage*>(page_start);

    // __sequence[n] holds the value n for n = 0..255
    for (size_t value = 0; value < 0x100; value++)
        shared_page.__sequence[value] = value;

    // no features are advertised yet
    shared_page.features = 0;

    ASSERT(Processor::count() + sizeof(Kernel::API::SharedPage) <= PAGE_SIZE);
}
ProcessorID Processor::id_from_index(size_t index)
{
ASSERT(index < s_processor_count);
@@ -142,8 +171,11 @@ namespace Kernel
// wait until bsp is ready
if (current_is_bsp())
{
initialize_shared_page();
s_processor_count = 1;
s_processor_ids[0] = current_id();
s_processors[current_id().as_u32()].m_index = 0;
// single processor system
if (s_processors_created == 1)
@@ -167,9 +199,10 @@ namespace Kernel
while (s_processor_count == 0)
__builtin_ia32_pause();
auto lookup_index = s_processor_count++;
ASSERT(s_processor_ids[lookup_index] == PROCESSOR_NONE);
s_processor_ids[lookup_index] = current_id();
const auto index = s_processor_count++;
ASSERT(s_processor_ids[index] == PROCESSOR_NONE);
s_processor_ids[index] = current_id();
s_processors[current_id().as_u32()].m_index = index;
uint32_t expected = static_cast<uint32_t>(-1);
s_first_ap_ready_ms.compare_exchange(expected, SystemTimer::get().ms_since_boot());
@@ -191,6 +224,76 @@ namespace Kernel
}
}
// Publishes the TSC conversion parameters in the shared page, makes every
// processor record its first (nanoseconds, TSC) reference pair and only then
// sets the SPF_GETTIME feature bit.
//
// shift, mult       values stored into gettime_shared; ns_since_boot_tsc()
//                   computes (tsc_delta * mult) >> shift with them
// realtime_seconds  stored as-is into gettime_shared.realtime_seconds
void Processor::initialize_tsc(uint8_t shift, uint64_t mult, uint64_t realtime_seconds)
{
    auto& shared_page = Processor::shared_page();

    shared_page.gettime_shared.shift = shift;
    shared_page.gettime_shared.mult = mult;
    shared_page.gettime_shared.realtime_seconds = realtime_seconds;

    // sample the current processor, then ask all the others to do the same
    update_tsc();
    broadcast_smp_message({
        .type = SMPMessage::Type::UpdateTSC,
        .dummy = 0,
    });

    // update_tsc() moves a processor's seq away from 0, so a zero seq means
    // that processor has not handled the UpdateTSC message yet
    bool everyone_initialized { false };
    while (!everyone_initialized)
    {
        everyone_initialized = true;
        for (size_t i = 0; i < count(); i++)
        {
            if (shared_page.cpus[i].gettime_local.seq != 0)
                continue;
            everyone_initialized = false;
            // spin-wait hint, same as the other busy loops in the kernel
            Processor::pause();
            break;
        }
    }

    // advertise the capability only after every per-cpu entry is filled in
    shared_page.features |= API::SPF_GETTIME;
}
// Refreshes the calling processor's gettime_local entry in the shared page
// with a new reference pair: the timer's nanoseconds since boot and the TSC
// value read right after it.
void Processor::update_tsc()
{
// reads the 64 bit time stamp counter; rdtsc returns the high half in edx
// and the low half in eax
// NOTE(review): the lfence presumably keeps rdtsc from being executed ahead
// of earlier instructions - confirm
const auto read_tsc =
[]() -> uint64_t {
uint32_t high, low;
asm volatile("lfence; rdtsc" : "=d"(high), "=a"(low));
return (static_cast<uint64_t>(high) << 32) | low;
};
auto& sgettime = shared_page().cpus[current_index()].gettime_local;
// seq is incremented once before and once after the fields are rewritten.
// The statement order below must not change.
// NOTE(review): readers are not in this view - presumably they compare seq
// before and after reading to detect a concurrent update; confirm
sgettime.seq = sgettime.seq + 1;
// the reference time comes from the underlying timer, not from the TSC
sgettime.last_ns = SystemTimer::get().ns_since_boot_no_tsc();
sgettime.last_tsc = read_tsc();
sgettime.seq = sgettime.seq + 1;
}
// Returns nanoseconds since boot computed from the current processor's last
// (last_ns, last_tsc) reference pair and the TSC ticks elapsed since then:
//   last_ns + ((tsc_now - last_tsc) * mult) >> shift
uint64_t Processor::ns_since_boot_tsc()
{
    // reads the 64 bit time stamp counter (edx:eax)
    const auto read_tsc =
        []() -> uint64_t {
            uint32_t high, low;
            asm volatile("lfence; rdtsc" : "=d"(high), "=a"(low));
            return (static_cast<uint64_t>(high) << 32) | low;
        };

    const auto& shared_page = Processor::shared_page();
    const auto& sgettime = shared_page.gettime_shared;

    const auto state = get_interrupt_state();
    set_interrupt_state(InterruptState::Disabled);

    // The per-cpu entry has to be picked only after interrupts are disabled.
    // If current_index() were evaluated earlier, the thread could be moved to
    // another processor before rdtsc runs and the TSC of one processor would
    // be combined with the reference pair of another one.
    // NOTE(review): assumes a thread cannot change processors while
    // interrupts are disabled - confirm against the scheduler
    const auto& lgettime = shared_page.cpus[current_index()].gettime_local;
    const auto current_ns = lgettime.last_ns + (((read_tsc() - lgettime.last_tsc) * sgettime.mult) >> sgettime.shift);

    set_interrupt_state(state);

    return current_ns;
}
void Processor::handle_ipi()
{
handle_smp_messages();
@@ -240,6 +343,9 @@ namespace Kernel
case SMPMessage::Type::UnblockThread:
processor.m_scheduler->unblock_thread(message->unblock_thread);
break;
case SMPMessage::Type::UpdateTSC:
update_tsc();
break;
#if WITH_PROFILING
case SMPMessage::Type::StartProfiling:
processor.start_profiling();
@@ -375,13 +481,14 @@ namespace Kernel
if (!is_smp_enabled())
return;
auto state = get_interrupt_state();
const auto state = get_interrupt_state();
set_interrupt_state(InterruptState::Disabled);
const auto current_id = Processor::current_id();
for (size_t i = 0; i < Processor::count(); i++)
{
auto processor_id = s_processor_ids[i];
if (processor_id != current_id())
const auto processor_id = s_processor_ids[i];
if (processor_id != current_id)
send_smp_message(processor_id, message, false);
}

View File

@@ -387,6 +387,9 @@ namespace Kernel
else
m_block_queue.add_thread_with_wake_time(node);
if (auto* thread = node->thread; thread->is_userspace() && thread->has_process())
thread->update_processor_index_address();
m_thread_count++;
Processor::set_interrupt_state(state);

View File

@@ -295,6 +295,20 @@ namespace Kernel
m_cpu_time_start_ns = UINT64_MAX;
}
// Points the thread's gs base (x86_64) or fs base (i686) at the address
// shared_page_vaddr() + current processor index inside the owning process.
// Does nothing for threads that are not userspace threads with a process.
void Thread::update_processor_index_address()
{
    const bool is_user_thread_with_process = is_userspace() && has_process();
    if (!is_user_thread_with_process)
        return;

    const vaddr_t index_byte_vaddr = process().shared_page_vaddr() + Processor::current_index();

#if ARCH(x86_64)
    set_gsbase(index_byte_vaddr);
#elif ARCH(i686)
    set_fsbase(index_byte_vaddr);
#endif
}
BAN::ErrorOr<Thread*> Thread::pthread_create(entry_t entry, void* arg)
{
auto* thread = TRY(create_userspace(m_process, m_process->page_table()));

View File

@@ -272,6 +272,8 @@ namespace Kernel
m_last_ticks = current_ticks;
}
SystemTimer::get().update_tsc();
if (should_invoke_scheduler())
Processor::scheduler().timer_interrupt();
}

View File

@@ -58,6 +58,8 @@ namespace Kernel
m_system_time_ms++;
}
SystemTimer::get().update_tsc();
if (should_invoke_scheduler())
Processor::scheduler().timer_interrupt();
}

View File

@@ -1,3 +1,6 @@
#include <BAN/Sort.h>
#include <kernel/CPUID.h>
#include <kernel/Scheduler.h>
#include <kernel/Timer/HPET.h>
#include <kernel/Timer/PIT.h>
@@ -54,19 +57,107 @@ namespace Kernel
Kernel::panic("Could not initialize any timer");
}
uint64_t SystemTimer::ms_since_boot() const
void SystemTimer::initialize_tsc()
{
return m_timer->ms_since_boot();
if (!CPUID::has_invariant_tsc())
{
dwarnln("CPU does not have an invariant TSC");
return;
}
const uint64_t tsc_freq = get_tsc_frequency();
dprintln("Initialized invariant TSC ({} Hz)", tsc_freq);
const uint8_t tsc_shift = 22;
const uint64_t tsc_mult = (static_cast<uint64_t>(1'000'000'000) << tsc_shift) / tsc_freq;
Processor::initialize_tsc(tsc_shift, tsc_mult, m_boot_time);
m_has_invariant_tsc = true;
}
uint64_t SystemTimer::ns_since_boot() const
// Measures the TSC frequency in Hz against m_timer.
// Five samples of roughly 50 ms each are taken and the median one is returned.
uint64_t SystemTimer::get_tsc_frequency() const
{
    constexpr size_t tsc_sample_count = 5;
    constexpr size_t tsc_sample_ns = 50'000'000;

    // reads the 64 bit time stamp counter (edx:eax)
    const auto read_tsc =
        []() -> uint64_t {
            uint32_t high, low;
            asm volatile("lfence; rdtsc" : "=d"(high), "=a"(low));
            return (static_cast<uint64_t>(high) << 32) | low;
        };

    uint64_t tsc_freq_samples[tsc_sample_count];
    for (auto& sample : tsc_freq_samples)
    {
        const auto ns_begin = m_timer->ns_since_boot();
        const auto tsc_begin = read_tsc();

        // busy wait until the sample window has passed
        const auto ns_deadline = ns_begin + tsc_sample_ns;
        while (m_timer->ns_since_boot() < ns_deadline)
            Processor::pause();

        const auto tsc_end = read_tsc();
        const auto ns_end = m_timer->ns_since_boot();

        // ticks per measured nanosecond scaled to ticks per second
        sample = (tsc_end - tsc_begin) * 1'000'000'000 / (ns_end - ns_begin);
    }

    BAN::sort::sort(tsc_freq_samples, tsc_freq_samples + tsc_sample_count);

    return tsc_freq_samples[tsc_sample_count / 2];
}
void SystemTimer::update_tsc() const
{
if (!m_has_invariant_tsc)
return;
// only update every 100 ms
if (++m_timer_ticks < 100)
return;
m_timer_ticks = 0;
Processor::update_tsc();
Processor::broadcast_smp_message({
.type = Processor::SMPMessage::Type::UpdateTSC,
.dummy = 0,
});
}
// Nanoseconds since boot read straight from the underlying timer, never
// from the TSC.
uint64_t SystemTimer::ns_since_boot_no_tsc() const
{
    const auto timer_ns = m_timer->ns_since_boot();
    return timer_ns;
}
// Milliseconds since boot. Uses the TSC based time when it is available and
// falls back to the underlying timer otherwise.
uint64_t SystemTimer::ms_since_boot() const
{
    if (m_has_invariant_tsc)
    {
        const auto tsc_ns = Processor::ns_since_boot_tsc();
        return tsc_ns / 1'000'000;
    }

    return m_timer->ms_since_boot();
}
// Nanoseconds since boot. Uses the TSC based time when it is available and
// falls back to the underlying timer otherwise.
uint64_t SystemTimer::ns_since_boot() const
{
    if (m_has_invariant_tsc)
        return Processor::ns_since_boot_tsc();

    return m_timer->ns_since_boot();
}
timespec SystemTimer::time_since_boot() const
{
return m_timer->time_since_boot();
if (!m_has_invariant_tsc)
return m_timer->time_since_boot();
const auto ns_since_boot = Processor::ns_since_boot_tsc();
return {
.tv_sec = static_cast<time_t>(ns_since_boot / 1'000'000'000),
.tv_nsec = static_cast<long>(ns_since_boot % 1'000'000'000)
};
}
bool SystemTimer::pre_scheduler_sleep_needs_lock() const

View File

@@ -208,6 +208,8 @@ static void init2(void*)
dprintln("Scheduler started");
SystemTimer::get().initialize_tsc();
auto console = MUST(DevFileSystem::get().root_inode()->find_inode(cmdline.console));
ASSERT(console->is_tty());
static_cast<Kernel::TTY*>(console.ptr())->set_as_current();