Kernel: Use syscall/sysret for syscalls in x86_64

This commit is contained in:
Bananymous 2026-01-09 15:15:46 +02:00
parent 94bd74d0bb
commit a9ceab0415
19 changed files with 178 additions and 149 deletions

View File

@ -15,9 +15,7 @@ asm_syscall_handler:
andl $-16, %esp
# push arguments
subl $4, %esp
pushl %ebp
addl $24, (%esp)
subl $8, %esp
pushl %edi
pushl %esi
pushl %edx
@ -65,7 +63,7 @@ sys_fork_trampoline:
call read_ip
testl %eax, %eax
jz .reload_stack
jz .done
movl %esp, %ebx
@ -81,9 +79,3 @@ sys_fork_trampoline:
popl %ebx
popl %ebp
ret
.reload_stack:
call get_thread_start_sp
movl %eax, %esp
xorl %eax, %eax
jmp .done

View File

@ -1,6 +1,6 @@
.macro maybe_load_kernel_segments, n
cmpb $0x08, \n(%esp)
je 1f
testb $3, \n(%esp)
jz 1f; jnp 1f
movw $0x10, %ax
movw %ax, %ds
@ -13,8 +13,8 @@
.endm
.macro maybe_load_userspace_segments, n
cmpb $0x08, \n(%esp)
je 1f
testb $3, \n(%esp)
jz 1f; jnp 1f
movw $(0x20 | 3), %bx
movw %bx, %ds

View File

@ -1,50 +1,26 @@
// arguments in RAX, RBX, RCX, RDX, RSI, RDI
// System V ABI: RDI, RSI, RDX, RCX, R8, R9
.global asm_syscall_handler
asm_syscall_handler:
swapgs
pushq %rbx
pushq %rcx
pushq %rdx
pushq %rdi
pushq %rsi
pushq %rbp
pushq %r8
pushq %r9
pushq %r10
movq %rsp, %rax
movq %gs:8, %rsp
pushq $(0x20 | 3)
pushq %rax
pushq %r11
pushq %r12
pushq %r13
pushq %r14
pushq %r15
cld
pushq $(0x28 | 3)
pushq %rcx
subq $8, %rsp
movq %rsi, %r8
movq %rdi, %r9
movq %rax, %rdi
movq %rbx, %rsi
xchgq %rcx, %rdx
leaq 112(%rsp), %rbx
pushq %rbx
movq %r10, %rcx
call cpp_syscall_handler
addq $8, %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %r11
popq %r10
popq %r9
popq %r8
popq %rbp
popq %rsi
popq %rdi
popq %rdx
popq %rcx
popq %rbx
movq 8(%rsp), %rcx
movq 24(%rsp), %r11
movq 32(%rsp), %rsp
swapgs
iretq
sysretq
.global sys_fork_trampoline
sys_fork_trampoline:
@ -57,7 +33,7 @@ sys_fork_trampoline:
call read_ip
testq %rax, %rax
je .reload_stack
je .done
movq %rax, %rsi
movq %rsp, %rdi
@ -71,9 +47,3 @@ sys_fork_trampoline:
popq %rbp
popq %rbx
ret
.reload_stack:
call get_thread_start_sp
movq %rax, %rsp
xorq %rax, %rax
jmp .done

View File

@ -1,6 +1,6 @@
.macro swapgs_if_necessary, n
cmpb $0x08, \n(%rsp)
je 1f
testb $3, \n(%rsp)
jz 1f; jnp 1f
swapgs
1:
.endm

View File

@ -0,0 +1,44 @@
#pragma once
#include <kernel/Attributes.h>
#include <kernel/IDT.h>
#include <stdint.h>
#include <sys/syscall.h>
namespace Kernel
{
// Issue a system call and return the value the kernel leaves in the
// accumulator register (rax on x86_64, eax on i686).
//
// syscall      - system call number.
// arg1..arg5   - up to five word-sized arguments; omitted ones default to 0.
//
// Register convention used by each architecture branch below:
//   x86_64: number in rdi, arg1 in rsi, arg2 in rdx, arg3 in r10,
//           arg4 in r8, arg5 in r9; entered with the `syscall` instruction.
//   i686:   number in eax, arg1 in ebx, arg2 in ecx, arg3 in edx,
//           arg4 in esi, arg5 in edi; entered with `int IRQ_SYSCALL`.
//
// NOTE(review): there is no #else branch, so on any other architecture `ret`
// would be returned uninitialized — confirm only these two targets are built.
ALWAYS_INLINE long syscall(int syscall, uintptr_t arg1 = 0, uintptr_t arg2 = 0, uintptr_t arg3 = 0, uintptr_t arg4 = 0, uintptr_t arg5 = 0)
{
long ret;
#if ARCH(x86_64)
// r10, r8 and r9 have no dedicated constraint letters, so they are bound
// through explicit register variables and passed with the generic "r"
// constraint.
register uintptr_t r10 asm("r10") = arg3;
register uintptr_t r8 asm( "r8") = arg4;
register uintptr_t r9 asm( "r9") = arg5;
// All argument registers are declared read-write ("+"), which tells the
// compiler not to rely on their values after the instruction.
// rcx and r11 are listed as clobbers because the `syscall` instruction
// itself overwrites them (return address and saved flags respectively).
// The "memory" clobber keeps the compiler from caching memory contents
// across the call.
asm volatile(
"syscall"
: "=a"(ret)
, "+D"(syscall)
, "+S"(arg1)
, "+d"(arg2)
, "+r"(r10)
, "+r"(r8)
, "+r"(r9)
:: "rcx", "r11", "memory");
#elif ARCH(i686)
// Software-interrupt entry: the vector number is passed as an immediate
// operand, arguments are input-only, and only eax is written back.
asm volatile(
"int %[irq]"
: "=a"(ret)
: [irq]"i"(static_cast<int>(IRQ_SYSCALL)) // WTF GCC 15
, "a"(syscall)
, "b"(arg1)
, "c"(arg2)
, "d"(arg3)
, "S"(arg4)
, "D"(arg5)
: "memory");
#endif
return ret;
}
}

View File

@ -151,8 +151,8 @@ namespace Kernel
private:
#if ARCH(x86_64)
BAN::Array<SegmentDescriptor, 7> m_gdt; // null, kernel code, kernel data, user code, user data, tss low, tss high
static constexpr uint16_t m_tss_offset = 0x28;
BAN::Array<SegmentDescriptor, 8> m_gdt; // null, kernel code, kernel data, user code (32 bit), user data, user code (64 bit), tss low, tss high
static constexpr uint16_t m_tss_offset = 0x30;
#elif ARCH(i686)
BAN::Array<SegmentDescriptor, 9> m_gdt; // null, kernel code, kernel data, user code, user data, processor data, fsbase, gsbase, tss
static constexpr uint16_t m_tss_offset = 0x40;

View File

@ -18,7 +18,10 @@ namespace Kernel
constexpr uint8_t IRQ_VECTOR_BASE = 0x20;
constexpr uint8_t IRQ_MSI_BASE = 0x80;
constexpr uint8_t IRQ_MSI_END = 0xF0;
#if ARCH(i686)
constexpr uint8_t IRQ_SYSCALL = 0xF0;
#endif
constexpr uint8_t IRQ_YIELD = 0xF1;
constexpr uint8_t IRQ_IPI = 0xF2;
constexpr uint8_t IRQ_TIMER = 0xF3;

View File

@ -187,7 +187,7 @@ namespace Kernel::PCI
void initialize_impl();
private:
static constexpr uint8_t m_msi_count = IRQ_SYSCALL - IRQ_MSI_BASE;
static constexpr uint8_t m_msi_count = IRQ_MSI_END - IRQ_MSI_BASE;
using PCIBus = BAN::Array<BAN::Array<Device, 8>, 32>;
BAN::Array<PCIBus, 256> m_buses;
BAN::Array<paddr_t, 256> m_bus_pcie_paddr;

View File

@ -102,6 +102,8 @@ namespace Kernel
uintptr_t stack_bottom() const { return reinterpret_cast<uintptr_t>(m_stack); }
uintptr_t stack_top() const { return stack_bottom() + s_stack_size; }
static void set_thread_syscall_stack(vaddr_t vaddr) { write_gs_sized<vaddr_t>(offsetof(Processor, m_thread_syscall_stack), vaddr); }
static GDT& gdt() { return *read_gs_sized<GDT*>(offsetof(Processor, m_gdt)); }
static IDT& idt() { return *read_gs_sized<IDT*>(offsetof(Processor, m_idt)); }
@ -137,6 +139,13 @@ namespace Kernel
static void initialize_smp();
static void initialize_shared_page();
// Holder for compile-time layout checks; the body contains no runtime code.
// On x86_64 it pins m_thread_syscall_stack to byte offset 8 inside Processor,
// because the assembly syscall entry loads the kernel stack pointer from
// %gs:8 and would silently read the wrong field if the layout changed.
// NOTE(review): presumably wrapped in a member function because offsetof
// needs Processor to be a complete type — confirm before moving the assert.
static void dummy()
{
#if ARCH(x86_64)
static_assert(offsetof(Processor, m_thread_syscall_stack) == 8, "This is hardcoded in Syscall.S");
#endif
}
template<typename T>
static T read_gs_sized(uintptr_t offset) requires(sizeof(T) <= 8)
{
@ -180,6 +189,8 @@ namespace Kernel
ProcessorID m_id { 0 };
uint8_t m_index { 0xFF };
vaddr_t m_thread_syscall_stack;
static constexpr size_t s_stack_size { 4096 };
void* m_stack { nullptr };

View File

@ -1,27 +0,0 @@
#pragma once
#include <kernel/Attributes.h>
#include <kernel/IDT.h>
#include <stdint.h>
#include <sys/syscall.h>
namespace Kernel
{
// Issue a system call through the software interrupt vector IRQ_SYSCALL and
// return the value left in the accumulator register.
//
// syscall      - system call number, passed in the "a" register.
// arg1..arg5   - up to five word-sized arguments, passed in the "b", "c",
//                "d", "S" and "D" registers respectively; omitted ones
//                default to 0.
ALWAYS_INLINE long syscall(int syscall, uintptr_t arg1 = 0, uintptr_t arg2 = 0, uintptr_t arg3 = 0, uintptr_t arg4 = 0, uintptr_t arg5 = 0)
{
long ret;
// The vector is an immediate operand; all arguments are input-only and the
// "memory" clobber stops the compiler from caching memory across the call.
asm volatile("int %[irq]"
: "=a"(ret)
: [irq]"i"(static_cast<int>(IRQ_SYSCALL)) // WTF GCC 15
, "a"(syscall)
, "b"((uintptr_t)arg1)
, "c"((uintptr_t)arg2)
, "d"((uintptr_t)arg3)
, "S"((uintptr_t)arg4)
, "D"((uintptr_t)arg5)
: "memory");
return ret;
}
}

View File

@ -15,23 +15,23 @@ namespace Kernel
ASSERT(gdt);
#if ARCH(x86_64)
constexpr uint8_t code_flags = 0xA;
constexpr uint8_t data_flags = 0xC;
gdt->write_entry(0x00, 0x00000000, 0x00000, 0x00, 0x0); // null
gdt->write_entry(0x08, 0x00000000, 0xFFFFF, 0x9A, 0xA); // kernel code
gdt->write_entry(0x10, 0x00000000, 0xFFFFF, 0x92, 0xC); // kernel data
gdt->write_entry(0x18, 0x00000000, 0xFFFFF, 0xFA, 0xC); // user code (32 bit)
gdt->write_entry(0x20, 0x00000000, 0xFFFFF, 0xF2, 0xC); // user data
gdt->write_entry(0x28, 0x00000000, 0xFFFFF, 0xFA, 0xA); // user code (64 bit)
#elif ARCH(i686)
constexpr uint8_t code_flags = 0xC;
constexpr uint8_t data_flags = 0xC;
gdt->write_entry(0x00, 0x00000000, 0x00000, 0x00, 0x0); // null
gdt->write_entry(0x08, 0x00000000, 0xFFFFF, 0x9A, 0xC); // kernel code
gdt->write_entry(0x10, 0x00000000, 0xFFFFF, 0x92, 0xC); // kernel data
gdt->write_entry(0x18, 0x00000000, 0xFFFFF, 0xFA, 0xC); // user code
gdt->write_entry(0x20, 0x00000000, 0xFFFFF, 0xF2, 0xC); // user data
gdt->write_entry(0x28, reinterpret_cast<uint32_t>(processor), sizeof(Processor), 0x92, 0x4); // processor data
gdt->write_entry(0x30, 0x00000000, 0x00000, 0xF2, 0xC); // fsbase
gdt->write_entry(0x38, 0x00000000, 0x00000, 0xF2, 0xC); // gsbase
#endif
gdt->write_entry(0x00, 0x00000000, 0x00000, 0x00, 0x0); // null
gdt->write_entry(0x08, 0x00000000, 0xFFFFF, 0x9A, code_flags); // kernel code
gdt->write_entry(0x10, 0x00000000, 0xFFFFF, 0x92, data_flags); // kernel data
gdt->write_entry(0x18, 0x00000000, 0xFFFFF, 0xFA, code_flags); // user code
gdt->write_entry(0x20, 0x00000000, 0xFFFFF, 0xF2, data_flags); // user data
#if ARCH(i686)
gdt->write_entry(0x28, reinterpret_cast<uint32_t>(processor), sizeof(Processor), 0x92, 0x4); // processor data
gdt->write_entry(0x30, 0x00000000, 0x00000, 0xF2, data_flags); // fsbase
gdt->write_entry(0x38, 0x00000000, 0x00000, 0xF2, data_flags); // gsbase
#endif
gdt->write_tss();
return gdt;

View File

@ -18,8 +18,6 @@
X(160) X(161) X(162) X(163) X(164) X(165) X(166) X(167) X(168) X(169) X(170) X(171) X(172) X(173) X(174) X(175) X(176) X(177) X(178) X(179) X(180) X(181) X(182) X(183) X(184) X(185) X(186) X(187) X(188) X(189) X(190) X(191) \
X(192) X(193) X(194) X(195) X(196) X(197) X(198) X(199) X(200) X(201) X(202) X(203) X(204) X(205) X(206) X(207)
static_assert(Kernel::IRQ_SYSCALL == Kernel::IRQ_VECTOR_BASE + 208);
namespace Kernel
{
@ -446,7 +444,9 @@ namespace Kernel
extern "C" void asm_yield_handler();
extern "C" void asm_ipi_handler();
extern "C" void asm_timer_handler();
#if ARCH(i686)
extern "C" void asm_syscall_handler();
#endif
IDT* IDT::create()
{
@ -480,7 +480,9 @@ namespace Kernel
idt->register_interrupt_handler(IRQ_YIELD, asm_yield_handler);
idt->register_interrupt_handler(IRQ_IPI, asm_ipi_handler);
idt->register_interrupt_handler(IRQ_TIMER, asm_timer_handler);
#if ARCH(i686)
idt->register_syscall_handler(IRQ_SYSCALL, asm_syscall_handler);
#endif
return idt;
}

View File

@ -13,6 +13,11 @@ namespace Kernel
static constexpr uint32_t MSR_IA32_FS_BASE = 0xC0000100;
static constexpr uint32_t MSR_IA32_GS_BASE = 0xC0000101;
static constexpr uint32_t MSR_IA32_KERNEL_GS_BASE = 0xC0000102;
static constexpr uint32_t MSR_IA32_EFER = 0xC0000080;
static constexpr uint32_t MSR_IA32_STAR = 0xC0000081;
static constexpr uint32_t MSR_IA32_LSTAR = 0xC0000082;
static constexpr uint32_t MSR_IA32_FMASK = 0xC0000084;
#endif
ProcessorID Processor::s_bsp_id { PROCESSOR_NONE };
@ -30,6 +35,8 @@ namespace Kernel
static BAN::Array<Processor, 0xFF> s_processors;
static BAN::Array<ProcessorID, 0xFF> s_processor_ids { PROCESSOR_NONE };
extern "C" void asm_syscall_handler();
ProcessorID Processor::read_processor_id()
{
uint32_t id;
@ -87,13 +94,53 @@ namespace Kernel
// initialize GS
#if ARCH(x86_64)
// set gs base to pointer to this processor
uint64_t ptr = reinterpret_cast<uint64_t>(&processor);
uint32_t ptr_hi = ptr >> 32;
uint32_t ptr_lo = ptr & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(ptr_hi), "a"(ptr_lo), "c"(MSR_IA32_GS_BASE));
{
// set gs base to pointer to this processor
const uint64_t val = reinterpret_cast<uint64_t>(&processor);
const uint32_t val_hi = val >> 32;
const uint32_t val_lo = val & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(val_hi), "a"(val_lo), "c"(MSR_IA32_GS_BASE));
}
#elif ARCH(i686)
asm volatile("movw $0x28, %%ax; movw %%ax, %%gs" ::: "ax");
asm volatile("movw %0, %%gs" :: "r"(0x28));
#endif
#if ARCH(x86_64)
// enable syscall instruction
asm volatile("rdmsr; orb $1, %%al; wrmsr" :: "c"(MSR_IA32_EFER) : "eax", "edx");
{
union STAR
{
struct
{
uint32_t : 32;
uint16_t sel_ring0;
uint16_t sel_ring3;
};
uint64_t raw;
};
// set kernel and user segments
const uint64_t val = STAR { .sel_ring0 = 0x08, .sel_ring3 = 0x18 | 3 }.raw;
const uint32_t val_hi = val >> 32;
const uint32_t val_lo = val & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(val_hi), "a"(val_lo), "c"(MSR_IA32_STAR));
}
{
// set syscall handler address
const uint64_t val = reinterpret_cast<uint64_t>(&asm_syscall_handler);
const uint32_t val_hi = val >> 32;
const uint32_t val_lo = val & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(val_hi), "a"(val_lo), "c"(MSR_IA32_LSTAR));
}
{
// mask DF and IF
const uint64_t val = (1 << 10) | (1 << 9);
const uint32_t val_hi = val >> 32;
const uint32_t val_lo = val & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(val_hi), "a"(val_lo), "c"(MSR_IA32_FMASK));
}
#endif
ASSERT(processor.m_idt);
@ -372,36 +419,17 @@ namespace Kernel
void Processor::load_segments()
{
{
const auto addr = scheduler().current_thread().get_fsbase();
#if ARCH(x86_64)
uint32_t ptr_hi = addr >> 32;
uint32_t ptr_lo = addr & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(ptr_hi), "a"(ptr_lo), "c"(MSR_IA32_FS_BASE));
#elif ARCH(i686)
gdt().set_fsbase(addr);
#endif
}
{
const auto addr = scheduler().current_thread().get_gsbase();
#if ARCH(x86_64)
uint32_t ptr_hi = addr >> 32;
uint32_t ptr_lo = addr & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(ptr_hi), "a"(ptr_lo), "c"(MSR_IA32_KERNEL_GS_BASE));
#elif ARCH(i686)
gdt().set_gsbase(addr);
#endif
}
load_fsbase();
load_gsbase();
}
void Processor::load_fsbase()
{
const auto addr = scheduler().current_thread().get_fsbase();
#if ARCH(x86_64)
uint32_t ptr_hi = addr >> 32;
uint32_t ptr_lo = addr & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(ptr_hi), "a"(ptr_lo), "c"(MSR_IA32_FS_BASE));
const uint32_t addr_hi = addr >> 32;
const uint32_t addr_lo = addr & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(addr_hi), "a"(addr_lo), "c"(MSR_IA32_FS_BASE));
#elif ARCH(i686)
gdt().set_fsbase(addr);
#endif
@ -411,9 +439,9 @@ namespace Kernel
{
const auto addr = scheduler().current_thread().get_gsbase();
#if ARCH(x86_64)
uint32_t ptr_hi = addr >> 32;
uint32_t ptr_lo = addr & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(ptr_hi), "a"(ptr_lo), "c"(MSR_IA32_KERNEL_GS_BASE));
const uint32_t addr_hi = addr >> 32;
const uint32_t addr_lo = addr & 0xFFFFFFFF;
asm volatile("wrmsr" :: "d"(addr_hi), "a"(addr_lo), "c"(MSR_IA32_KERNEL_GS_BASE));
#elif ARCH(i686)
gdt().set_gsbase(addr);
#endif

View File

@ -284,9 +284,14 @@ namespace Kernel
thread->set_cpu_time_start();
}
Processor::gdt().set_tss_stack(thread->kernel_stack_top());
if (thread->is_userspace())
{
const vaddr_t kernel_stack_top = thread->kernel_stack_top();
Processor::gdt().set_tss_stack(kernel_stack_top);
Processor::set_thread_syscall_stack(kernel_stack_top);
Processor::load_segments();
}
*interrupt_stack = thread->interrupt_stack();
*interrupt_registers = thread->interrupt_registers();

View File

@ -1,9 +1,9 @@
#include <BAN/Bitcast.h>
#include <kernel/API/Syscall.h>
#include <kernel/Debug.h>
#include <kernel/InterruptStack.h>
#include <kernel/Process.h>
#include <kernel/Scheduler.h>
#include <kernel/Syscall.h>
#include <kernel/Timer/Timer.h>
#include <termios.h>
@ -40,10 +40,8 @@ namespace Kernel
static bool is_restartable_syscall(int syscall);
extern "C" long cpp_syscall_handler(int syscall, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, InterruptStack* interrupt_stack)
extern "C" long cpp_syscall_handler(int syscall, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
{
ASSERT(GDT::is_user_segment(interrupt_stack->cs));
Processor::set_interrupt_state(InterruptState::Enabled);
Process::current().wait_while_stopped();

View File

@ -490,7 +490,11 @@ namespace Kernel
write_to_stack(cur_sp, 0x20 | 3);
write_to_stack(cur_sp, sp);
write_to_stack(cur_sp, 0x202);
#if ARCH(x86_64)
write_to_stack(cur_sp, 0x28 | 3);
#elif ARCH(i686)
write_to_stack(cur_sp, 0x18 | 3);
#endif
write_to_stack(cur_sp, ip);
});

View File

@ -22,7 +22,6 @@
#include <kernel/Processor.h>
#include <kernel/Random.h>
#include <kernel/Scheduler.h>
#include <kernel/Syscall.h>
#include <kernel/Terminal/FramebufferTerminal.h>
#include <kernel/Terminal/Serial.h>
#include <kernel/Terminal/VirtualTTY.h>

View File

@ -5,8 +5,8 @@
#include <LibELF/AuxiliaryVector.h>
#include <kernel/API/SharedPage.h>
#include <kernel/API/Syscall.h>
#include <kernel/Memory/Types.h>
#include <kernel/Syscall.h>
#include <ctype.h>
#include <dlfcn.h>

View File

@ -1,7 +1,7 @@
#pragma once
#include <BAN/Traits.h>
#include <kernel/Syscall.h>
#include <kernel/API/Syscall.h>
#include <stddef.h>
#include <stdint.h>