Kernel: Rewrite the whole scheduler and re-architect SMP handling

Change Semaphore -> ThreadBlocker
  This was not a semaphore; I just named it one because I didn't know
  what a semaphore was. I have meant to change this sooner, but it was
  in no way urgent :D

Implement SMP events. Processors can now be sent SMP events through
IPIs. SMP events can be sent either to a single processor or broadcast
to every processor.

PageTable::{map_page,map_range,unmap_page,unmap_range}() now send SMP
event to invalidate TLB caches for the changed pages.

Scheduler no longer uses a global run queue. Each processor has its own
scheduler that keeps track of the load on that processor. Once every
second the schedulers do load balancing. Schedulers have no access to
other processors' schedulers; they only see approximate loads. If a
scheduler decides that it has too much load, it will send a thread to
another processor through an SMP event.

Schedulers are currently run using the timer interrupt on the BSP. This
should not be the case; each processor should instead use its own LAPIC
timer for interrupts. There is no reason to broadcast an SMP event to
all processors when the BSP gets a timer interrupt.

The old scheduler only achieved around 20% idle load on QEMU, probably
due to a very inefficient implementation. This new scheduler seems to
average around 1% idle load, which is much closer to what I would
expect. On my own laptop the idle load seems to be only around 0.5% on
each processor.
This commit is contained in:
2024-07-22 00:33:50 +03:00
parent 9f90eeab05
commit f8261c60c0
60 changed files with 1559 additions and 715 deletions

View File

@@ -602,7 +602,7 @@ acpi_release_global_lock:
{
if (IO::inw(fadt().pm1a_cnt_blk) & PM1_CNT_SCI_EN)
break;
SystemTimer::get().sleep(10);
SystemTimer::get().sleep_ms(10);
}
if (!(IO::inw(fadt().pm1a_cnt_blk) & PM1_CNT_SCI_EN))
@@ -761,7 +761,7 @@ acpi_release_global_lock:
// FIXME: this can cause missing of event if it happens between
// reading the status and blocking
m_event_semaphore.block_with_timeout(100);
m_event_thread_blocker.block_with_timeout_ms(100);
continue;
handle_event:
@@ -782,7 +782,7 @@ handle_event:
void ACPI::handle_irq()
{
m_event_semaphore.unblock();
m_event_thread_blocker.unblock();
}
}

View File

@@ -244,7 +244,7 @@ namespace Kernel
dprintln("System has {} processors", m_processors.size());
uint8_t bsp_id = Kernel::Processor::current_id();
uint8_t bsp_id = Kernel::Processor::current_id().as_u32();
dprintln("BSP lapic id: {}", bsp_id);
if (m_processors.size() == 1)
@@ -267,7 +267,7 @@ namespace Kernel
dprintln("Trying to enable processor (lapic id {})", processor.apic_id);
auto& proc = Kernel::Processor::create(processor.apic_id);
auto& proc = Kernel::Processor::create(ProcessorID(processor.apic_id));
PageTable::with_fast_page((paddr_t)g_ap_init_addr, [&] {
PageTable::fast_page_as_sized<uint32_t>(2) = V2P(proc.stack_top());
});
@@ -308,14 +308,24 @@ namespace Kernel
}
// give processor upto 100 * 100 us + 200 us to boot
for (int i = 0; *g_ap_stack_loaded == 0 && i < 100; i++)
for (int i = 0; i < 100; i++)
{
if (__atomic_load_n(&g_ap_stack_loaded[0], __ATOMIC_SEQ_CST))
break;
udelay(100);
}
}
*g_ap_startup_done = 1;
__atomic_store_n(&g_ap_startup_done[0], 1, __ATOMIC_SEQ_CST);
const size_t timeout_ms = SystemTimer::get().ms_since_boot() + 500;
while (__atomic_load_n(&g_ap_running_count[0], __ATOMIC_SEQ_CST) < m_processors.size() - 1)
{
if (SystemTimer::get().ms_since_boot() >= timeout_ms)
Kernel::panic("Could not start all processors");
__builtin_ia32_pause();
}
// give processors 100 us time to increment running count
udelay(100);
dprintln("{} processors started", *g_ap_running_count);
}
@@ -325,7 +335,7 @@ namespace Kernel
ASSERT(Kernel::Processor::get_interrupt_state() == InterruptState::Disabled);
while ((read_from_local_apic(LAPIC_ICR_LO_REG) & ICR_LO_delivery_status_send_pending) == ICR_LO_delivery_status_send_pending)
__builtin_ia32_pause();
write_to_local_apic(LAPIC_ICR_HI_REG, (read_from_local_apic(LAPIC_ICR_HI_REG) & 0x00FFFFFF) | (target << 24));
write_to_local_apic(LAPIC_ICR_HI_REG, (read_from_local_apic(LAPIC_ICR_HI_REG) & 0x00FFFFFF) | (target.as_u32() << 24));
write_to_local_apic(LAPIC_ICR_LO_REG,
(read_from_local_apic(LAPIC_ICR_LO_REG) & ICR_LO_reserved_mask)
| ICR_LO_delivery_mode_fixed
@@ -359,7 +369,6 @@ namespace Kernel
write_to_local_apic(LAPIC_SIV_REG, read_from_local_apic(LAPIC_SIV_REG) | 0x1FF);
}
uint32_t APIC::read_from_local_apic(ptrdiff_t offset)
{
return MMIO::read32(m_local_apic_vaddr + offset);
@@ -418,7 +427,7 @@ namespace Kernel
redir.vector = IRQ_VECTOR_BASE + irq;
redir.mask = 0;
// FIXME: distribute IRQs more evenly?
redir.destination = Kernel::Processor::bsb_id();
redir.destination = Kernel::Processor::bsb_id().as_u32();
ioapic->write(IOAPIC_REDIRS + gsi * 2, redir.lo_dword);
ioapic->write(IOAPIC_REDIRS + gsi * 2 + 1, redir.hi_dword);

View File

@@ -49,7 +49,7 @@ namespace Kernel
for (auto& device : s_instance->m_devices)
device->update();
}
SystemTimer::get().sleep(10);
SystemTimer::get().sleep_ms(10);
}
}, nullptr
);
@@ -65,7 +65,7 @@ namespace Kernel
while (!s_instance->m_should_sync)
{
LockFreeGuard _(s_instance->m_device_lock);
s_instance->m_sync_semaphore.block_indefinite();
s_instance->m_sync_thread_blocker.block_indefinite();
}
for (auto& device : s_instance->m_devices)
@@ -84,11 +84,11 @@ namespace Kernel
{
while (true)
{
SystemTimer::get().sleep(10000);
SystemTimer::get().sleep_ms(10'000);
LockGuard _(s_instance->m_device_lock);
s_instance->m_should_sync = true;
s_instance->m_sync_semaphore.unblock();
s_instance->m_sync_thread_blocker.unblock();
}
}, nullptr, sync_process
)));
@@ -101,7 +101,7 @@ namespace Kernel
{
LockGuard _(m_device_lock);
m_should_sync = true;
m_sync_semaphore.unblock();
m_sync_thread_blocker.unblock();
}
if (should_block)
m_sync_done.block_indefinite();

View File

@@ -37,7 +37,7 @@ namespace Kernel
ASSERT(m_writing_count > 0);
m_writing_count--;
if (m_writing_count == 0)
m_semaphore.unblock();
m_thread_blocker.unblock();
}
BAN::ErrorOr<size_t> Pipe::read_impl(off_t, BAN::ByteSpan buffer)
@@ -48,7 +48,7 @@ namespace Kernel
if (m_writing_count == 0)
return 0;
LockFreeGuard lock_free(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_thread_blocker));
}
size_t to_copy = BAN::Math::min<size_t>(buffer.size(), m_buffer.size());
@@ -59,7 +59,7 @@ namespace Kernel
m_atime = SystemTimer::get().real_time();
m_semaphore.unblock();
m_thread_blocker.unblock();
return to_copy;
}
@@ -77,7 +77,7 @@ namespace Kernel
m_mtime = current_time;
m_ctime = current_time;
m_semaphore.unblock();
m_thread_blocker.unblock();
return buffer.size();
}

View File

@@ -10,7 +10,7 @@
#include <kernel/Timer/PIT.h>
#define ISR_LIST_X X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) X(16) X(17) X(18) X(19) X(20) X(21) X(22) X(23) X(24) X(25) X(26) X(27) X(28) X(29) X(30) X(31)
#define IRQ_LIST_X X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) X(16) X(17) X(18) X(19) X(20) X(21) X(22) X(23) X(24) X(25) X(26) X(27) X(28) X(29) X(30) X(31) X(32)
#define IRQ_LIST_X X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) X(16) X(17) X(18) X(19) X(20) X(21) X(22) X(23) X(24) X(25) X(26) X(27) X(28) X(29) X(30) X(31)
namespace Kernel
{
@@ -168,8 +168,8 @@ namespace Kernel
asm volatile("cli; 1: hlt; jmp 1b");
}
pid_t tid = Scheduler::current_tid();
pid_t pid = tid ? Process::current().pid() : 0;
const pid_t tid = Thread::current_tid();
const pid_t pid = (tid && Thread::current().has_process()) ? Process::current().pid() : 0;
if (tid)
{
@@ -241,13 +241,13 @@ namespace Kernel
#if ARCH(x86_64)
dwarnln(
"{} (error code: 0x{8H}), pid {}, tid {}\r\n"
"CPU {}: {} (error code: 0x{8H}), pid {}, tid {}\r\n"
"Register dump\r\n"
"rax=0x{16H}, rbx=0x{16H}, rcx=0x{16H}, rdx=0x{16H}\r\n"
"rsp=0x{16H}, rbp=0x{16H}, rdi=0x{16H}, rsi=0x{16H}\r\n"
"rip=0x{16H}, rflags=0x{16H}\r\n"
"cr0=0x{16H}, cr2=0x{16H}, cr3=0x{16H}, cr4=0x{16H}",
isr_exceptions[isr], error, pid, tid,
Processor::current_id(), isr_exceptions[isr], error, pid, tid,
regs->rax, regs->rbx, regs->rcx, regs->rdx,
interrupt_stack->sp, regs->rbp, regs->rdi, regs->rsi,
interrupt_stack->ip, interrupt_stack->flags,
@@ -255,13 +255,13 @@ namespace Kernel
);
#elif ARCH(i686)
dwarnln(
"{} (error code: 0x{8H}), pid {}, tid {}\r\n"
"CPU {}: {} (error code: 0x{8H}), pid {}, tid {}\r\n"
"Register dump\r\n"
"eax=0x{8H}, ebx=0x{8H}, ecx=0x{8H}, edx=0x{8H}\r\n"
"esp=0x{8H}, ebp=0x{8H}, edi=0x{8H}, esi=0x{8H}\r\n"
"eip=0x{8H}, eflags=0x{8H}\r\n"
"cr0=0x{8H}, cr2=0x{8H}, cr3=0x{8H}, cr4=0x{8H}",
isr_exceptions[isr], error, pid, tid,
Processor::current_id(), isr_exceptions[isr], error, pid, tid,
regs->eax, regs->ebx, regs->ecx, regs->edx,
interrupt_stack->sp, regs->ebp, regs->edi, regs->esi,
interrupt_stack->ip, interrupt_stack->flags,
@@ -320,12 +320,17 @@ done:
extern "C" void cpp_yield_handler(InterruptStack* interrupt_stack, InterruptRegisters* interrupt_registers)
{
// yield is raised through kernel software interrupt
ASSERT(!InterruptController::get().is_in_service(IRQ_YIELD));
ASSERT(!GDT::is_user_segment(interrupt_stack->cs));
Processor::scheduler().reschedule(interrupt_stack, interrupt_registers);
}
Processor::enter_interrupt(interrupt_stack, interrupt_registers);
Scheduler::get().irq_reschedule();
Processor::leave_interrupt();
extern "C" void cpp_ipi_handler()
{
ASSERT(InterruptController::get().is_in_service(IRQ_IPI));
InterruptController::get().eoi(IRQ_IPI);
Processor::handle_ipi();
}
extern "C" void cpp_irq_handler(uint32_t irq)
@@ -349,8 +354,6 @@ done:
InterruptController::get().eoi(irq);
if (auto* handler = s_interruptables[irq])
handler->handle_irq();
else if (irq == IRQ_IPI)
Scheduler::get().yield();
else
dprintln("no handler for irq 0x{2H}", irq);
}
@@ -359,7 +362,7 @@ done:
if (current_thread.can_add_signal_to_execute())
current_thread.handle_signal();
Scheduler::get().reschedule_if_idling();
Processor::scheduler().reschedule_if_idle();
ASSERT(Thread::current().state() != Thread::State::Terminated);
@@ -405,6 +408,7 @@ done:
#undef X
extern "C" void asm_yield_handler();
extern "C" void asm_ipi_handler();
extern "C" void asm_syscall_handler();
IDT* IDT::create()
@@ -423,6 +427,7 @@ done:
#undef X
idt->register_interrupt_handler(IRQ_VECTOR_BASE + IRQ_YIELD, asm_yield_handler);
idt->register_interrupt_handler(IRQ_VECTOR_BASE + IRQ_IPI, asm_ipi_handler);
idt->register_syscall_handler(0x80, asm_syscall_handler);

View File

@@ -131,7 +131,7 @@ namespace Kernel
m_event_head = (m_event_head + 1) % m_max_event_count;
m_event_count++;
m_event_semaphore.unblock();
m_event_thread_blocker.unblock();
if (m_type == Type::Keyboard && s_keyboard_device)
s_keyboard_device->notify();
if (m_type == Type::Mouse && s_mouse_device)
@@ -149,7 +149,7 @@ namespace Kernel
m_event_lock.unlock(state);
{
LockFreeGuard _(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_event_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_event_thread_blocker));
}
state = m_event_lock.lock();
}
@@ -213,7 +213,7 @@ namespace Kernel
}
LockFreeGuard _(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_thread_blocker));
}
}
@@ -259,7 +259,7 @@ namespace Kernel
}
LockFreeGuard _(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_thread_blocker));
}
}

View File

@@ -79,7 +79,7 @@ namespace Kernel
if (it != m_arp_table.end())
return it->value;
}
Scheduler::get().yield();
Processor::yield();
}
return BAN::Error::from_errno(ETIMEDOUT);
@@ -150,7 +150,7 @@ namespace Kernel
while (m_pending_packets.empty())
{
m_pending_lock.unlock(state);
m_pending_semaphore.block_indefinite();
m_pending_thread_blocker.block_indefinite();
state = m_pending_lock.lock();
}
auto packet = m_pending_packets.front();
@@ -178,7 +178,7 @@ namespace Kernel
}
m_pending_packets.push({ .interface = interface, .packet = arp_packet });
m_pending_semaphore.unblock();
m_pending_thread_blocker.unblock();
}
}

View File

@@ -308,7 +308,7 @@ namespace Kernel
while (m_pending_packets.empty())
{
m_pending_lock.unlock(state);
m_pending_semaphore.block_indefinite();
m_pending_thread_blocker.block_indefinite();
state = m_pending_lock.lock();
}
auto packet = m_pending_packets.front();
@@ -367,7 +367,7 @@ namespace Kernel
m_pending_total_size += ipv4_header.total_length;
m_pending_packets.push({ .interface = interface });
m_pending_semaphore.unblock();
m_pending_thread_blocker.unblock();
}
}

View File

@@ -75,7 +75,7 @@ namespace Kernel
while (m_pending_connections.empty())
{
LockFreeGuard _(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_thread_blocker));
}
auto connection = m_pending_connections.front();
@@ -113,7 +113,7 @@ namespace Kernel
if (SystemTimer::get().ms_since_boot() >= wake_time_ms)
return BAN::Error::from_errno(ECONNABORTED);
LockFreeGuard free(m_mutex);
TRY(Thread::current().block_or_eintr_or_waketime(return_inode->m_semaphore, wake_time_ms, true));
TRY(Thread::current().block_or_eintr_or_waketime_ms(return_inode->m_thread_blocker, wake_time_ms, true));
}
if (address)
@@ -170,7 +170,7 @@ namespace Kernel
if (SystemTimer::get().ms_since_boot() >= wake_time_ms)
return BAN::Error::from_errno(ECONNREFUSED);
LockFreeGuard free(m_mutex);
TRY(Thread::current().block_or_eintr_or_waketime(m_semaphore, wake_time_ms, true));
TRY(Thread::current().block_or_eintr_or_waketime_ms(m_thread_blocker, wake_time_ms, true));
}
return {};
@@ -207,7 +207,7 @@ namespace Kernel
if (m_state != State::Established)
return return_with_maybe_zero();
LockFreeGuard free(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_thread_blocker));
}
const uint32_t to_recv = BAN::Math::min<uint32_t>(buffer.size(), m_recv_window.data_size);
@@ -249,7 +249,7 @@ namespace Kernel
if (m_send_window.data_size + message.size() <= m_send_window.buffer->size())
break;
LockFreeGuard free(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_thread_blocker));
}
{
@@ -259,14 +259,14 @@ namespace Kernel
}
const uint32_t target_ack = m_send_window.start_seq + m_send_window.data_size;
m_semaphore.unblock();
m_thread_blocker.unblock();
while (m_send_window.current_ack < target_ack)
{
if (m_state != State::Established)
return return_with_maybe_zero();
LockFreeGuard free(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_thread_blocker));
}
return message.size();
@@ -597,7 +597,7 @@ namespace Kernel
}
}
m_semaphore.unblock();
m_thread_blocker.unblock();
}
void TCPSocket::set_connection_as_closed()
@@ -743,11 +743,11 @@ namespace Kernel
}
}
m_semaphore.unblock();
m_semaphore.block_with_wake_time(current_ms + retransmit_timeout_ms);
m_thread_blocker.unblock();
m_thread_blocker.block_with_wake_time_ms(current_ms + retransmit_timeout_ms);
}
m_semaphore.unblock();
m_thread_blocker.unblock();
}
}

View File

@@ -70,7 +70,7 @@ namespace Kernel
m_packets.emplace(packet_info);
m_packet_total_size += payload.size();
m_packet_semaphore.unblock();
m_packet_thread_blocker.unblock();
}
BAN::ErrorOr<void> UDPSocket::bind_impl(const sockaddr* address, socklen_t address_len)
@@ -93,7 +93,7 @@ namespace Kernel
while (m_packets.empty())
{
m_packet_lock.unlock(state);
TRY(Thread::current().block_or_eintr_indefinite(m_packet_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_packet_thread_blocker));
state = m_packet_lock.lock();
}

View File

@@ -73,7 +73,7 @@ namespace Kernel
return BAN::Error::from_errno(EINVAL);
while (connection_info.pending_connections.empty())
TRY(Thread::current().block_or_eintr_indefinite(connection_info.pending_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(connection_info.pending_thread_blocker));
BAN::RefPtr<UnixDomainSocket> pending;
@@ -81,7 +81,7 @@ namespace Kernel
SpinLockGuard _(connection_info.pending_lock);
pending = connection_info.pending_connections.front();
connection_info.pending_connections.pop();
connection_info.pending_semaphore.unblock();
connection_info.pending_thread_blocker.unblock();
}
BAN::RefPtr<UnixDomainSocket> return_inode;
@@ -162,15 +162,15 @@ namespace Kernel
if (target_info.pending_connections.size() < target_info.pending_connections.capacity())
{
MUST(target_info.pending_connections.push(this));
target_info.pending_semaphore.unblock();
target_info.pending_thread_blocker.unblock();
break;
}
}
TRY(Thread::current().block_or_eintr_indefinite(target_info.pending_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(target_info.pending_thread_blocker));
}
while (!connection_info.connection_done)
Scheduler::get().yield();
Processor::yield();
return {};
}
@@ -241,7 +241,7 @@ namespace Kernel
while (m_packet_sizes.full() || m_packet_size_total + packet.size() > s_packet_buffer_size)
{
m_packet_lock.unlock(state);
TRY(Thread::current().block_or_eintr_indefinite(m_packet_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_packet_thread_blocker));
state = m_packet_lock.lock();
}
@@ -252,7 +252,7 @@ namespace Kernel
if (!is_streaming())
m_packet_sizes.push(packet.size());
m_packet_semaphore.unblock();
m_packet_thread_blocker.unblock();
m_packet_lock.unlock(state);
return {};
}
@@ -357,7 +357,7 @@ namespace Kernel
while (m_packet_size_total == 0)
{
m_packet_lock.unlock(state);
TRY(Thread::current().block_or_eintr_indefinite(m_packet_semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_packet_thread_blocker));
state = m_packet_lock.lock();
}
@@ -376,7 +376,7 @@ namespace Kernel
memmove(packet_buffer, packet_buffer + nread, m_packet_size_total - nread);
m_packet_size_total -= nread;
m_packet_semaphore.unblock();
m_packet_thread_blocker.unblock();
m_packet_lock.unlock(state);
return nread;

View File

@@ -82,12 +82,13 @@ namespace Kernel
void Process::register_to_scheduler()
{
// FIXME: Allow failing...
{
SpinLockGuard _(s_process_lock);
MUST(s_processes.push_back(this));
}
for (auto* thread : m_threads)
MUST(Scheduler::get().add_thread(thread));
MUST(Processor::scheduler().add_thread(thread));
}
Process* Process::create_kernel()
@@ -206,10 +207,10 @@ namespace Kernel
ProcFileSystem::get().on_process_delete(*this);
m_exit_status.exited = true;
m_exit_status.semaphore.unblock();
m_exit_status.thread_blocker.unblock();
while (m_exit_status.waiting > 0)
Scheduler::get().yield();
Processor::yield();
m_process_lock.lock();
@@ -250,16 +251,6 @@ namespace Kernel
m_exit_status.exit_code = __WGENEXITCODE(status, signal);
while (!m_threads.empty())
m_threads.front()->on_exit();
//for (auto* thread : m_threads)
// if (thread != &Thread::current())
// Scheduler::get().terminate_thread(thread);
//if (this == &Process::current())
//{
// m_threads.clear();
// Processor::set_interrupt_state(InterruptState::Disabled);
// Thread::current().setup_process_cleanup();
// Scheduler::get().yield();
//}
}
size_t Process::proc_meminfo(off_t offset, BAN::ByteSpan buffer) const
@@ -534,13 +525,13 @@ namespace Kernel
m_cmdline = BAN::move(str_argv);
m_environ = BAN::move(str_envp);
asm volatile("cli");
Processor::set_interrupt_state(InterruptState::Disabled);
}
m_has_called_exec = true;
m_threads.front()->setup_exec();
Scheduler::get().yield();
Processor::yield();
ASSERT_NOT_REACHED();
}
@@ -603,12 +594,12 @@ namespace Kernel
if (seconds == 0)
return 0;
uint64_t wake_time = SystemTimer::get().ms_since_boot() + seconds * 1000;
Scheduler::get().set_current_thread_sleeping(wake_time);
const uint64_t wake_time_ms = SystemTimer::get().ms_since_boot() + (seconds * 1000);
SystemTimer::get().sleep_ms(seconds * 1000);
uint64_t current_time = SystemTimer::get().ms_since_boot();
if (current_time < wake_time)
return BAN::Math::div_round_up<long>(wake_time - current_time, 1000);
const uint64_t current_ms = SystemTimer::get().ms_since_boot();
if (current_ms < wake_time_ms)
return BAN::Math::div_round_up<long>(wake_time_ms - current_ms, 1000);
return 0;
}
@@ -622,23 +613,21 @@ namespace Kernel
TRY(validate_pointer_access(rmtp, sizeof(timespec)));
}
uint64_t sleep_ms = rqtp->tv_sec * 1000 + BAN::Math::div_round_up<uint64_t>(rqtp->tv_nsec, 1'000'000);
if (sleep_ms == 0)
const uint64_t sleep_ns = (rqtp->tv_sec * 1'000'000'000) + rqtp->tv_nsec;
if (sleep_ns == 0)
return 0;
uint64_t wake_time_ms = SystemTimer::get().ms_since_boot() + sleep_ms;
const uint64_t wake_time_ns = SystemTimer::get().ns_since_boot() + sleep_ns;
SystemTimer::get().sleep_ns(sleep_ns);
Scheduler::get().set_current_thread_sleeping(wake_time_ms);
uint64_t current_ms = SystemTimer::get().ms_since_boot();
if (current_ms < wake_time_ms)
const uint64_t current_ns = SystemTimer::get().ns_since_boot();
if (current_ns < wake_time_ns)
{
if (rmtp)
{
uint64_t remaining_ms = wake_time_ms - current_ms;
rmtp->tv_sec = remaining_ms / 1000;
rmtp->tv_nsec = (remaining_ms % 1000) * 1'000'000;
const uint64_t remaining_ns = wake_time_ns - current_ns;
rmtp->tv_sec = remaining_ns / 1'000'000'000;
rmtp->tv_nsec = remaining_ns % 1'000'000'000;
}
return BAN::Error::from_errno(EINTR);
}
@@ -1140,7 +1129,7 @@ namespace Kernel
break;
LockFreeGuard free(m_process_lock);
SystemTimer::get().sleep(1);
SystemTimer::get().sleep_ms(1);
}
if (arguments->readfds)
@@ -1347,7 +1336,7 @@ namespace Kernel
TRY(validate_pointer_access(args, sizeof(sys_mmap_t)));
}
if (args->prot != PROT_NONE && args->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
if (args->prot != PROT_NONE && (args->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)))
return BAN::Error::from_errno(EINVAL);
if (args->flags & MAP_FIXED)
@@ -1628,7 +1617,7 @@ namespace Kernel
{
process.add_pending_signal(signal);
// FIXME: This feels hacky
Scheduler::get().unblock_thread(process.m_threads.front()->tid());
Processor::scheduler().unblock_thread(process.m_threads.front()->tid());
}
return (pid > 0) ? BAN::Iteration::Break : BAN::Iteration::Continue;
}

View File

@@ -1,17 +1,31 @@
#include <kernel/InterruptController.h>
#include <kernel/Memory/kmalloc.h>
#include <kernel/Processor.h>
#include <kernel/Terminal/TerminalDriver.h>
#include <kernel/Thread.h>
#include <kernel/Timer/Timer.h>
extern Kernel::TerminalDriver* g_terminal_driver;
namespace Kernel
{
static constexpr uint32_t MSR_IA32_GS_BASE = 0xC0000101;
ProcessorID Processor::s_bsb_id { PROCESSOR_NONE };
ProcessorID Processor::s_bsb_id { PROCESSOR_NONE };
BAN::Atomic<uint8_t> Processor::s_processor_count { 0 };
BAN::Atomic<bool> Processor::s_is_smp_enabled { false };
BAN::Atomic<bool> Processor::s_should_print_cpu_load { false };
static BAN::Array<Processor, 0xFF> s_processors;
static BAN::Atomic<uint8_t> s_processors_created { 0 };
static ProcessorID read_processor_id()
// 32 bit milli seconds are definitely enough as APs start on boot
static BAN::Atomic<uint32_t> s_first_ap_ready_ms { 0 };
static BAN::Array<Processor, 0xFF> s_processors;
static BAN::Array<ProcessorID, 0xFF> s_processor_ids { PROCESSOR_NONE };
ProcessorID Processor::read_processor_id()
{
uint32_t id;
asm volatile(
@@ -21,16 +35,18 @@ namespace Kernel
: "=b"(id)
:: "eax", "ecx", "edx"
);
return id;
return ProcessorID(id);
}
Processor& Processor::create(ProcessorID id)
{
// bsb is the first processor
if (s_bsb_id == PROCESSOR_NONE)
if (s_bsb_id == PROCESSOR_NONE && id == PROCESSOR_NONE)
s_bsb_id = id = read_processor_id();
if (s_bsb_id == PROCESSOR_NONE || id == PROCESSOR_NONE || id.m_id >= s_processors.size())
Kernel::panic("Trying to initialize invalid processor {}", id.m_id);
auto& processor = s_processors[id];
auto& processor = s_processors[id.m_id];
ASSERT(processor.m_id == PROCESSOR_NONE);
processor.m_id = id;
@@ -44,13 +60,27 @@ namespace Kernel
processor.m_idt = IDT::create();
ASSERT(processor.m_idt);
processor.m_scheduler = MUST(Scheduler::create());
ASSERT(processor.m_scheduler);
SMPMessage* smp_storage = new SMPMessage[0x1000];
ASSERT(smp_storage);
for (size_t i = 0; i < 0xFFF; i++)
smp_storage[i].next = &smp_storage[i + 1];
smp_storage[0xFFF].next = nullptr;
processor.m_smp_pending = nullptr;
processor.m_smp_free = smp_storage;
s_processors_created++;
return processor;
}
Processor& Processor::initialize()
{
auto id = read_processor_id();
auto& processor = s_processors[id];
auto& processor = s_processors[id.m_id];
ASSERT(processor.m_gdt);
processor.m_gdt->load();
@@ -72,41 +102,265 @@ namespace Kernel
return processor;
}
void Processor::allocate_idle_thread()
ProcessorID Processor::id_from_index(size_t index)
{
ASSERT(idle_thread() == nullptr);
auto* idle_thread = MUST(Thread::create_kernel([](void*) { for (;;) asm volatile("hlt"); }, nullptr, nullptr));
write_gs_ptr(offsetof(Processor, m_idle_thread), idle_thread);
ASSERT(index < s_processor_count);
ASSERT(s_processor_ids[index] != PROCESSOR_NONE);
return s_processor_ids[index];
}
void Processor::enter_interrupt(InterruptStack* interrupt_stack, InterruptRegisters* interrupt_registers)
void Processor::wait_until_processors_ready()
{
ASSERT(get_interrupt_state() == InterruptState::Disabled);
ASSERT(read_gs_ptr(offsetof(Processor, m_interrupt_stack)) == nullptr);
write_gs_ptr(offsetof(Processor, m_interrupt_stack), interrupt_stack);
write_gs_ptr(offsetof(Processor, m_interrupt_registers), interrupt_registers);
if (s_processors_created == 1)
{
ASSERT(current_is_bsb());
s_processor_count++;
s_processor_ids[0] = current_id();
}
// wait until bsb is ready
if (current_is_bsb())
{
s_processor_count = 1;
s_processor_ids[0] = current_id();
// single processor system
if (s_processors_created == 1)
return;
// wait until first AP is ready
const uint64_t timeout_ms = SystemTimer::get().ms_since_boot() + 1000;
while (s_first_ap_ready_ms == 0)
{
if (SystemTimer::get().ms_since_boot() >= timeout_ms)
{
dprintln("Could not initialize any APs :(");
return;
}
__builtin_ia32_pause();
}
}
else
{
// wait until bsb is ready, it shall get index 0
while (s_processor_count == 0)
__builtin_ia32_pause();
auto lookup_index = s_processor_count++;
ASSERT(s_processor_ids[lookup_index] == PROCESSOR_NONE);
s_processor_ids[lookup_index] = current_id();
uint32_t expected = 0;
s_first_ap_ready_ms.compare_exchange(expected, SystemTimer::get().ms_since_boot());
}
// wait until all processors are initialized
{
const uint32_t timeout_ms = s_first_ap_ready_ms + 1000;
while (s_processor_count < s_processors_created)
{
if (SystemTimer::get().ms_since_boot() >= timeout_ms)
{
if (current_is_bsb())
dprintln("Could not initialize {} processors :(", s_processors_created - s_processor_count);
break;
}
__builtin_ia32_pause();
}
}
s_is_smp_enabled = true;
}
void Processor::leave_interrupt()
void Processor::handle_ipi()
{
ASSERT(get_interrupt_state() == InterruptState::Disabled);
ASSERT(read_gs_ptr(offsetof(Processor, m_interrupt_stack)) != nullptr);
write_gs_ptr(offsetof(Processor, m_interrupt_stack), nullptr);
write_gs_ptr(offsetof(Processor, m_interrupt_registers), nullptr);
handle_smp_messages();
}
InterruptStack& Processor::get_interrupt_stack()
template<typename F>
void with_atomic_lock(BAN::Atomic<bool>& lock, F callback)
{
ASSERT(get_interrupt_state() == InterruptState::Disabled);
ASSERT(read_gs_ptr(offsetof(Processor, m_interrupt_stack)));
return *read_gs_sized<InterruptStack*>(offsetof(Processor, m_interrupt_stack));
bool expected = false;
while (!lock.compare_exchange(expected, true, BAN::MemoryOrder::memory_order_acquire))
{
__builtin_ia32_pause();
expected = false;
}
callback();
lock.store(false, BAN::MemoryOrder::memory_order_release);
}
InterruptRegisters& Processor::get_interrupt_registers()
void Processor::handle_smp_messages()
{
ASSERT(get_interrupt_state() == InterruptState::Disabled);
ASSERT(read_gs_ptr(offsetof(Processor, m_interrupt_registers)));
return *read_gs_sized<InterruptRegisters*>(offsetof(Processor, m_interrupt_registers));
auto state = get_interrupt_state();
set_interrupt_state(InterruptState::Disabled);
auto processor_id = current_id();
auto& processor = s_processors[processor_id.m_id];
SMPMessage* pending = nullptr;
with_atomic_lock(processor.m_smp_pending_lock,
[&]()
{
pending = processor.m_smp_pending;
processor.m_smp_pending = nullptr;
}
);
bool should_preempt = false;
if (pending)
{
// reverse smp message queue from LIFO to FIFO
{
SMPMessage* reversed = nullptr;
for (SMPMessage* message = pending; message;)
{
SMPMessage* next = message->next;
message->next = reversed;
reversed = message;
message = next;
}
pending = reversed;
}
SMPMessage* last_handled = nullptr;
// handle messages
for (auto* message = pending; message; message = message->next)
{
switch (message->type)
{
case SMPMessage::Type::FlushTLB:
for (size_t i = 0; i < message->flush_tlb.page_count; i++)
asm volatile("invlpg (%0)" :: "r"(message->flush_tlb.vaddr + i * PAGE_SIZE) : "memory");
break;
case SMPMessage::Type::NewThread:
processor.m_scheduler->handle_new_thread_request(message->new_thread);
break;
case SMPMessage::Type::UnblockThread:
processor.m_scheduler->handle_unblock_request(message->unblock_thread);
break;
case SMPMessage::Type::SchedulerPreemption:
should_preempt = true;
break;
}
last_handled = message;
}
with_atomic_lock(processor.m_smp_free_lock,
[&]()
{
last_handled->next = processor.m_smp_free;
processor.m_smp_free = pending;
}
);
}
if (should_preempt)
processor.m_scheduler->preempt();
set_interrupt_state(state);
}
// Queue an SMP message for another processor and optionally notify it with an IPI.
// The message is copied into a node taken from the target processor's free list
// and pushed onto its pending stack (LIFO; the receiving side reverses it back
// to FIFO before handling). Interrupts are disabled for the duration so the
// lock-protected list manipulation cannot be preempted on this CPU.
void Processor::send_smp_message(ProcessorID processor_id, const SMPMessage& message, bool send_ipi)
{
// messages to the current processor make no sense; handle them directly instead
ASSERT(processor_id != current_id());
auto state = get_interrupt_state();
set_interrupt_state(InterruptState::Disabled);
auto& processor = s_processors[processor_id.m_id];
// take free message slot
SMPMessage* storage = nullptr;
with_atomic_lock(processor.m_smp_free_lock,
[&]()
{
storage = processor.m_smp_free;
// NOTE: also asserts storage->next != nullptr, i.e. the very last free
// node is never handed out -- exhausting message storage is fatal here
ASSERT(storage && storage->next);
processor.m_smp_free = storage->next;
}
);
// write message
*storage = message;
// push message to pending queue
with_atomic_lock(processor.m_smp_pending_lock,
[&]()
{
storage->next = processor.m_smp_pending;
processor.m_smp_pending = storage;
}
);
// the IPI can be suppressed (send_ipi == false) when the caller follows
// up with a single broadcast IPI, as broadcast_smp_message() does
if (send_ipi)
InterruptController::get().send_ipi(processor_id);
set_interrupt_state(state);
}
// Queue an SMP message on every other processor and wake them all with one
// broadcast IPI. Silently a no-op until SMP initialization has completed.
void Processor::broadcast_smp_message(const SMPMessage& message)
{
if (!is_smp_enabled())
return;
auto state = get_interrupt_state();
set_interrupt_state(InterruptState::Disabled);
// enqueue on every processor except the current one; per-target IPIs are
// suppressed (send_ipi == false) since a single broadcast IPI follows below
for (size_t i = 0; i < Processor::count(); i++)
{
auto processor_id = s_processor_ids[i];
if (processor_id != current_id())
send_smp_message(processor_id, message, false);
}
InterruptController::get().broadcast_ipi();
set_interrupt_state(state);
}
// Voluntarily give up the CPU by raising the yield software interrupt.
// The stack pointer is temporarily switched to this processor's stack top so
// the interrupt frame is built at a known location; the caller's SP is kept
// in rcx/ecx across the int and restored right after the handler returns.
// Interrupts are disabled around the sequence so the SP switch cannot be
// preempted; the previous interrupt state is restored on exit.
void Processor::yield()
{
auto state = get_interrupt_state();
set_interrupt_state(InterruptState::Disabled);
#if ARCH(x86_64)
asm volatile(
"movq %%rsp, %%rcx;"
"movq %[load_sp], %%rsp;"
"int %[yield];"
"movq %%rcx, %%rsp;"
// NOTE: This is offset by 2 pointers since interrupt without PL change
// does not push SP and SS. This allows accessing "whole" interrupt stack.
:: [load_sp]"r"(Processor::current_stack_top() - 2 * sizeof(uintptr_t)),
[yield]"i"(IRQ_VECTOR_BASE + IRQ_YIELD)
: "memory", "rcx"
);
#elif ARCH(i686)
asm volatile(
"movl %%esp, %%ecx;"
"movl %[load_sp], %%esp;"
"int %[yield];"
"movl %%ecx, %%esp;"
// NOTE: This is offset by 2 pointers since interrupt without PL change
// does not push SP and SS. This allows accessing "whole" interrupt stack.
:: [load_sp]"r"(Processor::current_stack_top() - 2 * sizeof(uintptr_t)),
[yield]"i"(IRQ_VECTOR_BASE + IRQ_YIELD)
: "memory", "ecx"
);
#else
#error
#endif
Processor::set_interrupt_state(state);
}
}

View File

@@ -1,260 +1,715 @@
#include <kernel/Arch.h>
#include <kernel/Attributes.h>
#include <kernel/GDT.h>
#include <BAN/Optional.h>
#include <BAN/Sort.h>
#include <kernel/InterruptController.h>
#include <kernel/Process.h>
#include <kernel/Scheduler.h>
#include <kernel/Thread.h>
#include <kernel/Timer/Timer.h>
#define SCHEDULER_VERIFY_STACK 1
#define DEBUG_SCHEDULER 0
#define SCHEDULER_ASSERT 1
#if SCHEDULER_ASSERT == 0
#undef ASSERT
#define ASSERT(...)
#endif
namespace Kernel
{
static Scheduler* s_instance = nullptr;
static constexpr uint64_t s_reschedule_interval_ns = 10'000'000;
static constexpr uint64_t s_load_balance_interval_ns = 1'000'000'000;
static BAN::Atomic<uint8_t> s_schedulers_initialized { 0 };
struct ProcessorInfo
{
// Nanoseconds this processor reported idle during the last load balance
// window. Initialized to a full window so a fresh processor looks fully idle.
uint64_t idle_time_ns { s_load_balance_interval_ns };
// When the processor has no idle time, an estimate of how many maximally
// loaded threads it is running (computed in Scheduler::do_load_balancing()).
uint32_t max_load_threads { 0 };
};
static SpinLock s_processor_info_time_lock;
static BAN::Array<ProcessorInfo, 0xFF> s_processor_infos;
static BAN::Atomic<size_t> s_next_processor_index { 0 };
void SchedulerQueue::add_thread_to_back(Node* node)
{
	ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);

	// Append node as the new tail of the doubly linked list.
	node->next = nullptr;
	node->prev = m_tail;
	if (m_tail != nullptr)
		m_tail->next = node;
	else
		m_head = node;
	m_tail = node;
}
void SchedulerQueue::add_thread_with_wake_time(Node* node)
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
// Insert node keeping the queue sorted by wake_time_ns (earliest first).
// Fast path: empty queue, or node wakes no earlier than the current tail.
if (m_tail == nullptr || node->wake_time_ns >= m_tail->wake_time_ns)
return add_thread_to_back(node);
// Walk forward to the first node that wakes strictly later than the new one.
Node* next = m_head;
Node* prev = nullptr;
while (next && node->wake_time_ns > next->wake_time_ns)
{
prev = next;
next = next->next;
}
// Splice node between prev and next, fixing head/tail when at either end.
node->next = next;
node->prev = prev;
(next ? next->prev : m_tail) = node;
(prev ? prev->next : m_head) = node;
}
template<typename F>
SchedulerQueue::Node* SchedulerQueue::remove_with_condition(F callback)
{
	ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);

	// Scan front to back; unlink and return the first node accepted by the
	// callback, or nullptr when no node matches.
	Node* current = m_head;
	while (current != nullptr)
	{
		if (callback(current))
		{
			remove_node(current);
			return current;
		}
		current = current->next;
	}
	return nullptr;
}
void SchedulerQueue::remove_node(Node* node)
{
	// Unlink node from the doubly linked list, updating the head/tail
	// pointers when the node sits at either end.
	if (node->prev != nullptr)
		node->prev->next = node->next;
	else
		m_head = node->next;

	if (node->next != nullptr)
		node->next->prev = node->prev;
	else
		m_tail = node->prev;

	node->prev = nullptr;
	node->next = nullptr;
}
SchedulerQueue::Node* SchedulerQueue::front()
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
// Peek at the first node without removing it; the queue must not be empty.
ASSERT(!empty());
return m_head;
}
SchedulerQueue::Node* SchedulerQueue::pop_front()
{
	ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);

	// Detach and return the head node, or nullptr when the queue is empty.
	if (empty())
		return nullptr;

	Node* const popped = m_head;
	m_head = popped->next;
	if (m_head != nullptr)
		m_head->prev = nullptr;
	else
		m_tail = nullptr;
	popped->next = nullptr;
	return popped;
}
BAN::ErrorOr<Scheduler*> Scheduler::create()
{
auto* scheduler = new Scheduler();
if (scheduler == nullptr)
return BAN::Error::from_errno(ENOMEM);
return scheduler;
}
BAN::ErrorOr<void> Scheduler::initialize()
{
ASSERT(s_instance == nullptr);
s_instance = new Scheduler();
ASSERT(s_instance);
Processor::allocate_idle_thread();
m_idle_thread = TRY(Thread::create_kernel([](void*) { asm volatile("1: hlt; jmp 1b"); }, nullptr, nullptr));
ASSERT(m_idle_thread);
size_t processor_index = 0;
for (; processor_index < Processor::count(); processor_index++)
if (Processor::id_from_index(processor_index) == Processor::current_id())
break;
ASSERT(processor_index < Processor::count());
// each CPU does load balancing at a different time; this calculates the offset relative to other CPUs
m_last_load_balance_ns = s_load_balance_interval_ns * processor_index / Processor::count();
m_idle_ns = -m_last_load_balance_ns;
s_schedulers_initialized++;
while (s_schedulers_initialized < Processor::count())
__builtin_ia32_pause();
return {};
}
Scheduler& Scheduler::get()
{
ASSERT(s_instance);
return *s_instance;
}
void Scheduler::start()
void Scheduler::add_current_to_most_loaded(SchedulerQueue* target_queue)
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
ASSERT(!m_active_threads.empty());
// broadcast ipi (yield) for each processor
InterruptController::get().broadcast_ipi();
yield();
ASSERT_NOT_REACHED();
}
Thread& Scheduler::current_thread()
{
// Thread scheduled on this processor, or the idle thread when the
// processor has nothing scheduled.
auto* current = Processor::get_current_thread();
return current ? *current->thread : *Processor::idle_thread();
}
pid_t Scheduler::current_tid()
{
	// During early boot the scheduler does not exist yet; report tid 0.
	return (s_instance == nullptr) ? 0 : Scheduler::get().current_thread().tid();
}
void Scheduler::setup_next_thread()
{
ASSERT(m_lock.current_processor_has_lock());
if (auto* current = Processor::get_current_thread())
bool has_current = false;
for (auto& info : m_most_loaded_threads)
{
auto* thread = current->thread;
if (thread->state() == Thread::State::Terminated)
if (info.node == m_current)
{
PageTable::kernel().load();
delete thread;
delete current;
}
else
{
// thread->state() can be NotStarted when calling exec or cleaning up process
if (thread->state() != Thread::State::NotStarted)
{
thread->interrupt_stack() = Processor::get_interrupt_stack();
thread->interrupt_registers() = Processor::get_interrupt_registers();
}
if (current->should_block)
{
current->should_block = false;
m_blocking_threads.add_with_wake_time(current);
}
else
{
m_active_threads.push_back(current);
}
info.queue = target_queue;
has_current = true;
break;
}
}
SchedulerQueue::Node* node = nullptr;
while (!m_active_threads.empty())
if (!has_current)
{
node = m_active_threads.pop_front();
if (node->thread->state() != Thread::State::Terminated)
size_t index = 0;
for (; index < m_most_loaded_threads.size() - 1; index++)
if (m_most_loaded_threads[index].node == nullptr)
break;
m_most_loaded_threads[index].queue = target_queue;
m_most_loaded_threads[index].node = m_current;
}
BAN::sort::sort(m_most_loaded_threads.begin(), m_most_loaded_threads.end(),
[](const ThreadInfo& a, const ThreadInfo& b) -> bool
{
if (a.node == nullptr || b.node == nullptr)
return a.node;
return a.node->time_used_ns > b.node->time_used_ns;
}
);
}
void Scheduler::update_most_loaded_node_queue(SchedulerQueue::Node* node, SchedulerQueue* target_queue)
{
	ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);

	// If node is tracked as one of the most loaded threads, record which
	// queue it now lives on; otherwise this is a no-op.
	for (size_t i = 0; i < m_most_loaded_threads.size(); i++)
	{
		if (m_most_loaded_threads[i].node != node)
			continue;
		m_most_loaded_threads[i].queue = target_queue;
		return;
	}
}
void Scheduler::remove_node_from_most_loaded(SchedulerQueue::Node* node)
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
size_t i = 0;
for (; i < m_most_loaded_threads.size(); i++)
if (m_most_loaded_threads[i].node == node)
break;
PageTable::kernel().load();
delete node->thread;
delete node;
node = nullptr;
for (; i < m_most_loaded_threads.size() - 1; i++)
m_most_loaded_threads[i] = m_most_loaded_threads[i + 1];
m_most_loaded_threads.back().node = nullptr;
m_most_loaded_threads.back().queue = nullptr;
}
void Scheduler::reschedule(InterruptStack* interrupt_stack, InterruptRegisters* interrupt_registers)
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
// If there are no other threads in run queue, reschedule can be no-op :)
if (m_run_queue.empty() && !m_current_will_block && current_thread().state() == Thread::State::Executing)
return;
if (m_current == nullptr)
m_idle_ns += SystemTimer::get().ns_since_boot() - m_idle_start_ns;
else
{
switch (m_current->thread->state())
{
case Thread::State::Terminated:
remove_node_from_most_loaded(m_current);
PageTable::kernel().load();
delete m_current->thread;
delete m_current;
m_thread_count--;
break;
case Thread::State::Executing:
{
const uint64_t current_ns = SystemTimer::get().ns_since_boot();
m_current->thread->interrupt_stack() = *interrupt_stack;
m_current->thread->interrupt_registers() = *interrupt_registers;
m_current->time_used_ns += current_ns - m_current->last_start_ns;
add_current_to_most_loaded(m_current_will_block ? &m_block_queue : &m_run_queue);
if (!m_current_will_block)
m_run_queue.add_thread_to_back(m_current);
else
{
m_current_will_block = false;
m_block_queue.add_thread_with_wake_time(m_current);
}
break;
}
case Thread::State::NotStarted:
ASSERT(!m_current_will_block);
m_current->time_used_ns = 0;
remove_node_from_most_loaded(m_current);
m_run_queue.add_thread_to_back(m_current);
break;
}
}
Processor::set_current_thread(node);
auto* thread = node ? node->thread : Processor::idle_thread();
if (thread->has_process())
thread->process().page_table().load();
else
while ((m_current = m_run_queue.pop_front()))
{
if (m_current->thread->state() != Thread::State::Terminated)
break;
remove_node_from_most_loaded(m_current);
PageTable::kernel().load();
delete m_current->thread;
delete m_current;
m_thread_count--;
}
if (m_current == nullptr)
{
PageTable::kernel().load();
*interrupt_stack = m_idle_thread->interrupt_stack();
*interrupt_registers = m_idle_thread->interrupt_registers();
m_idle_thread->m_state = Thread::State::Executing;
m_idle_start_ns = SystemTimer::get().ns_since_boot();
return;
}
update_most_loaded_node_queue(m_current, nullptr);
auto* thread = m_current->thread;
auto& page_table = thread->has_process() ? thread->process().page_table() : PageTable::kernel();
page_table.load();
if (thread->state() == Thread::State::NotStarted)
thread->m_state = Thread::State::Executing;
Processor::gdt().set_tss_stack(thread->kernel_stack_top());
Processor::get_interrupt_stack() = thread->interrupt_stack();
Processor::get_interrupt_registers() = thread->interrupt_registers();
*interrupt_stack = thread->interrupt_stack();
*interrupt_registers = thread->interrupt_registers();
m_current->last_start_ns = SystemTimer::get().ns_since_boot();
}
void Scheduler::timer_reschedule()
void Scheduler::reschedule_if_idle()
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
// Only yield when this processor is idling (no current thread) and there
// is runnable work waiting; otherwise this is a cheap no-op.
if (!m_current && !m_run_queue.empty())
Processor::yield();
}
void Scheduler::preempt()
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
if (Processor::is_smp_enabled())
do_load_balancing();
{
SpinLockGuard _(m_lock);
m_blocking_threads.remove_with_wake_time(m_active_threads, SystemTimer::get().ms_since_boot());
const uint64_t current_ns = SystemTimer::get().ns_since_boot();
while (!m_block_queue.empty() && current_ns >= m_block_queue.front()->wake_time_ns)
{
auto* node = m_block_queue.pop_front();
update_most_loaded_node_queue(node, &m_run_queue);
m_run_queue.add_thread_to_back(node);
}
}
// Broadcast IPI to all other processors for them
// to perform reschedule
InterruptController::get().broadcast_ipi();
yield();
}
void Scheduler::yield()
{
// Voluntarily hand the CPU to the scheduler by raising the yield interrupt.
auto state = Processor::get_interrupt_state();
Processor::set_interrupt_state(InterruptState::Disabled);
// Yielding while holding the scheduler lock would deadlock in the handler.
ASSERT(!m_lock.current_processor_has_lock());
#if ARCH(x86_64)
// Save the stack pointer, switch to this processor's stack before raising
// the yield vector, then restore the original stack pointer.
asm volatile(
"movq %%rsp, %%rcx;"
"movq %[load_sp], %%rsp;"
"int %[yield];"
"movq %%rcx, %%rsp;"
// NOTE: This is offset by 2 pointers since interrupt without PL change
// does not push SP and SS. This allows accessing "whole" interrupt stack.
:: [load_sp]"r"(Processor::current_stack_top() - 2 * sizeof(uintptr_t)),
[yield]"i"(IRQ_VECTOR_BASE + IRQ_YIELD)
: "memory", "rcx"
);
#elif ARCH(i686)
// Same sequence as x86_64, using 32-bit registers.
asm volatile(
"movl %%esp, %%ecx;"
"movl %[load_sp], %%esp;"
"int %[yield];"
"movl %%ecx, %%esp;"
// NOTE: This is offset by 2 pointers since interrupt without PL change
// does not push SP and SS. This allows accessing "whole" interrupt stack.
:: [load_sp]"r"(Processor::current_stack_top() - 2 * sizeof(uintptr_t)),
[yield]"i"(IRQ_VECTOR_BASE + IRQ_YIELD)
: "memory", "ecx"
);
#else
#error
#endif
Processor::set_interrupt_state(state);
}
void Scheduler::irq_reschedule()
{
// Entry point from the yield interrupt handler: select the next thread
// to run while holding the scheduler lock.
SpinLockGuard _(m_lock);
setup_next_thread();
}
void Scheduler::reschedule_if_idling()
{
{
SpinLockGuard _(m_lock);
if (Processor::get_current_thread())
return;
if (m_active_threads.empty())
return;
const uint64_t current_ns = SystemTimer::get().ns_since_boot();
if (current_ns >= m_last_reschedule_ns + s_reschedule_interval_ns)
{
m_last_reschedule_ns = current_ns;
Processor::yield();
}
}
}
void Scheduler::timer_interrupt()
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
// FIXME: all processors should LAPIC for their preemption
if (Processor::is_smp_enabled())
{
ASSERT(Processor::current_is_bsb());
Processor::broadcast_smp_message({
.type = Processor::SMPMessage::Type::SchedulerPreemption,
.scheduler_preemption = 0 // dummy value
});
}
yield();
preempt();
}
void Scheduler::handle_unblock_request(const UnblockRequest& request)
{
	ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);

	// Dispatch an SMP unblock request to the matching do_unblock() overload.
	if (request.type == UnblockRequest::Type::ThreadBlocker)
		do_unblock(request.blocker);
	else if (request.type == UnblockRequest::Type::ThreadID)
		do_unblock(request.tid);
	else
		ASSERT_NOT_REACHED();
}
void Scheduler::handle_new_thread_request(const NewThreadRequest& reqeuest)
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
if (reqeuest.blocked)
m_block_queue.add_thread_with_wake_time(reqeuest.node);
else
m_run_queue.add_thread_to_back(reqeuest.node);
}
bool Scheduler::do_unblock(ThreadBlocker* blocker)
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
// Wake every thread on this scheduler waiting on blocker; returns true if
// at least one thread was unblocked.
// FIXME: This could _easily_ be O(1)
bool did_unblock = false;
// The running thread may be about to block on this blocker; cancel that.
if (m_current && m_current->blocker == blocker && m_current_will_block)
{
m_current_will_block = false;
did_unblock = true;
}
// Move every matching thread from the block queue back to the run queue.
SchedulerQueue::Node* match;
while ((match = m_block_queue.remove_with_condition([blocker](const auto* node) { return node->blocker == blocker; })))
{
dprintln_if(DEBUG_SCHEDULER, "CPU {}: unblock blocker {} (tid {})", Processor::current_id(), blocker, match->thread->tid());
update_most_loaded_node_queue(match, &m_run_queue);
m_run_queue.add_thread_to_back(match);
did_unblock = true;
}
return did_unblock;
}
bool Scheduler::do_unblock(pid_t tid)
{
	ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
	// FIXME: This could _easily_ be O(1)

	// The target may be the running thread that is about to block.
	if (m_current_will_block && m_current && m_current->thread->tid() == tid)
	{
		m_current_will_block = false;
		return true;
	}

	// Otherwise search the block queue for the thread.
	auto* node = m_block_queue.remove_with_condition(
		[tid](const auto* candidate) { return candidate->thread->tid() == tid; });
	if (node == nullptr)
		return false;

	dprintln_if(DEBUG_SCHEDULER, "CPU {}: unblock tid {}", Processor::current_id(), tid);
	update_most_loaded_node_queue(node, &m_run_queue);
	m_run_queue.add_thread_to_back(node);
	return true;
}
ProcessorID Scheduler::find_least_loaded_processor() const
{
// Pick the processor that reported the most idle time during the last
// load balance window (breaking ties via max_load_threads). Falls back to
// the current processor when no other candidate is at least as idle.
ProcessorID least_loaded_id = Processor::current_id();
uint64_t most_idle_ns = m_idle_ns;
uint32_t least_max_load_threads = static_cast<uint32_t>(-1);
// NOTE(review): loop index is uint8_t; assumes Processor::count() <= 0xFF
// (size of s_processor_infos) -- confirm.
for (uint8_t i = 0; i < Processor::count(); i++)
{
auto processor_id = Processor::id_from_index(i);
if (processor_id == Processor::current_id())
continue;
// NOTE(review): s_processor_infos is indexed by processor *index* here but
// by processor_id.as_u32() in do_load_balancing() -- verify id == index.
const auto& info = s_processor_infos[i];
// Skip processors that are less idle or more loaded than the best so far.
if (info.idle_time_ns < most_idle_ns || info.max_load_threads > least_max_load_threads)
continue;
least_loaded_id = processor_id;
most_idle_ns = info.idle_time_ns;
least_max_load_threads = info.max_load_threads;
}
return least_loaded_id;
}
void Scheduler::do_load_balancing()
{
ASSERT(Processor::get_interrupt_state() == InterruptState::Disabled);
// Runs at most once per load balance interval; called from preemption.
const uint64_t current_ns = SystemTimer::get().ns_since_boot();
if (current_ns < m_last_load_balance_ns + s_load_balance_interval_ns)
return;
// Account time since the last bookkeeping point either as idle time or as
// CPU time used by the currently running thread.
if (m_current == nullptr)
{
m_idle_ns += current_ns - m_idle_start_ns;
m_idle_start_ns = current_ns;
}
else
{
m_current->time_used_ns += current_ns - m_current->last_start_ns;
m_current->last_start_ns = current_ns;
add_current_to_most_loaded(nullptr);
}
// Debug dump of per-CPU load and per-thread usage. remove_with_condition()
// is only used to iterate the queues: the callbacks always return false,
// so nothing is actually removed.
if constexpr(DEBUG_SCHEDULER)
{
const uint64_t duration_ns = current_ns - m_last_load_balance_ns;
const uint64_t processing_ns = duration_ns - m_idle_ns;
{
const uint64_t load_percent_x1000 = BAN::Math::div_round_up<uint64_t>(processing_ns * 100'000, duration_ns);
dprintln("CPU {}: { 2}.{3}% ({} threads)", Processor::current_id(), load_percent_x1000 / 1000, load_percent_x1000 % 1000, m_thread_count);
}
if (m_current)
{
const char* name = "unknown";
if (m_current->thread->has_process() && m_current->thread->process().is_userspace() && m_current->thread->process().userspace_info().argv)
name = m_current->thread->process().userspace_info().argv[0];
const uint64_t load_percent_x1000 = BAN::Math::div_round_up<uint64_t>(m_current->time_used_ns * 100'000, processing_ns);
dprintln(" tid { 2}: { 3}.{3}% <{}> current", m_current->thread->tid(), load_percent_x1000 / 1000, load_percent_x1000 % 1000, name);
}
m_run_queue.remove_with_condition(
[&](SchedulerQueue::Node* node)
{
const uint64_t load_percent_x1000 = BAN::Math::div_round_up<uint64_t>(node->time_used_ns * 100'000, processing_ns);
dprintln(" tid { 2}: { 3}.{3}% active", node->thread->tid(), load_percent_x1000 / 1000, load_percent_x1000 % 1000);
return false;
}
);
m_block_queue.remove_with_condition(
[&](SchedulerQueue::Node* node)
{
const uint64_t load_percent_x1000 = BAN::Math::div_round_up<uint64_t>(node->time_used_ns * 100'000, processing_ns);
dprintln(" tid { 2}: { 3}.{3}% blocked", node->thread->tid(), load_percent_x1000 / 1000, load_percent_x1000 % 1000);
return false;
}
);
}
// The shared per-processor info table is only try-locked, never waited on;
// if another processor holds it, skip this round of balancing.
if (!s_processor_info_time_lock.try_lock_interrupts_disabled())
{
dprintln_if(DEBUG_SCHEDULER, "Load balancing cannot keep up");
return;
}
// Fully loaded processor: estimate how many "maximum load" threads it is
// running from the single most loaded thread's CPU usage.
if (m_idle_ns == 0 && m_should_calculate_max_load_threads)
{
const auto& most_loaded_thread = m_most_loaded_threads.front();
if (most_loaded_thread.node == nullptr || most_loaded_thread.node->time_used_ns == 0)
s_processor_infos[Processor::current_id().as_u32()].max_load_threads = 0;
else
{
const uint64_t duration_ns = current_ns - m_last_load_balance_ns;
const uint64_t max_thread_load_x1000 = 1000 * m_most_loaded_threads.front().node->time_used_ns / duration_ns;
// NOTE(review): if the most loaded thread used less than 1/1000 of the
// window, max_thread_load_x1000 truncates to 0 and the division below
// divides by zero -- confirm this cannot happen when m_idle_ns == 0.
const uint64_t max_load_thread_count = ((2000 / max_thread_load_x1000) + 1) / 2;
s_processor_infos[Processor::current_id().as_u32()].max_load_threads = max_load_thread_count;
}
}
constexpr auto absolute_difference_u64 = [](uint64_t a, uint64_t b) { return (a < b) ? (b - a) : (a - b); };
// Try to migrate the heaviest tracked threads (index 0, the very heaviest,
// and the currently running thread are never migrated) to the least loaded
// processor, updating the shared estimates as threads are handed over.
for (size_t i = 1; i < m_most_loaded_threads.size(); i++)
{
auto& thread_info = m_most_loaded_threads[i];
if (thread_info.node == nullptr)
break;
if (thread_info.node == m_current || thread_info.queue == nullptr)
continue;
auto least_loaded_id = find_least_loaded_processor();
if (least_loaded_id == Processor::current_id())
break;
auto& most_idle_info = s_processor_infos[least_loaded_id.as_u32()];
auto& my_info = s_processor_infos[Processor::current_id().as_u32()];
if (m_idle_ns == 0)
{
// This processor had no idle time: balance by max load thread counts.
if (my_info.max_load_threads == 0)
break;
if (most_idle_info.idle_time_ns == 0)
{
// Target is also fully loaded; move only if it strictly helps.
if (most_idle_info.max_load_threads + 1 > my_info.max_load_threads - 1)
break;
my_info.max_load_threads -= 1;
most_idle_info.max_load_threads += 1;
dprintln_if(DEBUG_SCHEDULER, "CPU {}: sending tid {} to CPU {} (max load)", Processor::current_id(), thread_info.node->thread->tid(), least_loaded_id);
}
else
{
my_info.max_load_threads -= 1;
most_idle_info.idle_time_ns = 0;
most_idle_info.max_load_threads = 1;
dprintln_if(DEBUG_SCHEDULER, "CPU {}: sending tid {} to CPU {}", Processor::current_id(), thread_info.node->thread->tid(), least_loaded_id);
}
}
else
{
// Partially idle processor: balance by estimated processing time.
const uint64_t my_current_proc_ns = s_load_balance_interval_ns - BAN::Math::min(s_load_balance_interval_ns, m_idle_ns);
const uint64_t other_current_proc_ns = s_load_balance_interval_ns - BAN::Math::min(s_load_balance_interval_ns, most_idle_info.idle_time_ns);
const uint64_t current_proc_diff_ns = absolute_difference_u64(my_current_proc_ns, other_current_proc_ns);
const uint64_t my_new_proc_ns = my_current_proc_ns - BAN::Math::min(thread_info.node->time_used_ns, my_current_proc_ns);
const uint64_t other_new_proc_ns = other_current_proc_ns + thread_info.node->time_used_ns;
const uint64_t new_proc_diff_ns = absolute_difference_u64(my_new_proc_ns, other_new_proc_ns);
// require 10% decrease between CPU loads to send thread to other CPU
if (new_proc_diff_ns >= current_proc_diff_ns || (100 * (current_proc_diff_ns - new_proc_diff_ns) / current_proc_diff_ns) < 10)
continue;
most_idle_info.idle_time_ns -= BAN::Math::min(thread_info.node->time_used_ns, most_idle_info.idle_time_ns);
m_idle_ns += thread_info.node->time_used_ns;
dprintln_if(DEBUG_SCHEDULER, "CPU {}: sending tid {} to CPU {}", Processor::current_id(), thread_info.node->thread->tid(), least_loaded_id);
}
thread_info.node->time_used_ns = 0;
// Unlink the thread from its local queue before handing it over.
{
auto& my_queue = (thread_info.queue == &m_run_queue) ? m_run_queue : m_block_queue;
my_queue.remove_node(thread_info.node);
m_thread_count--;
}
Processor::send_smp_message(least_loaded_id, {
.type = Processor::SMPMessage::Type::NewThread,
.new_thread = {
.node = thread_info.node,
.blocked = thread_info.queue == &m_block_queue
}
});
thread_info.node = nullptr;
thread_info.queue = nullptr;
if (m_idle_ns == 0)
break;
}
// Publish this processor's idle time for other schedulers to read.
s_processor_infos[Processor::current_id().as_u32()].idle_time_ns = m_idle_ns;
s_processor_info_time_lock.unlock(InterruptState::Disabled);
// Reset per-window accounting for the next load balance interval.
if (m_current)
m_current->time_used_ns = 0;
for (auto& thread_info : m_most_loaded_threads)
thread_info = {};
m_run_queue .remove_with_condition([&](SchedulerQueue::Node* node) { node->time_used_ns = 0; return false; });
m_block_queue.remove_with_condition([&](SchedulerQueue::Node* node) { node->time_used_ns = 0; return false; });
m_idle_ns = 0;
m_should_calculate_max_load_threads = true;
m_last_load_balance_ns += s_load_balance_interval_ns;
}
BAN::ErrorOr<void> Scheduler::add_thread(Thread* thread)
{
auto* node = new SchedulerQueue::Node(thread);
if (node == nullptr)
auto* new_node = new SchedulerQueue::Node(thread);
if (new_node == nullptr)
return BAN::Error::from_errno(ENOMEM);
SpinLockGuard _(m_lock);
m_active_threads.push_back(node);
const size_t processor_index = s_next_processor_index++ % Processor::count();
const auto processor_id = Processor::id_from_index(processor_index);
if (processor_id == Processor::current_id())
{
auto state = Processor::get_interrupt_state();
Processor::set_interrupt_state(InterruptState::Disabled);
m_run_queue.add_thread_to_back(new_node);
m_thread_count++;
Processor::set_interrupt_state(state);
}
else
{
Processor::send_smp_message(processor_id, {
.type = Processor::SMPMessage::Type::NewThread,
.new_thread = {
.node = new_node,
.blocked = false
}
});
}
return {};
}
void Scheduler::terminate_thread(Thread* thread)
void Scheduler::block_current_thread(ThreadBlocker* blocker, uint64_t wake_time_ns)
{
auto state = m_lock.lock();
auto state = Processor::get_interrupt_state();
Processor::set_interrupt_state(InterruptState::Disabled);
ASSERT(thread->state() == Thread::State::Executing);
thread->m_state = Thread::State::Terminated;
thread->interrupt_stack().sp = Processor::current_stack_top();
m_current->blocker = blocker;
m_current->wake_time_ns = wake_time_ns;
m_current_will_block = true;
Processor::yield();
m_lock.unlock(InterruptState::Disabled);
Processor::set_interrupt_state(state);
}
// actual deletion will be done while rescheduling
void Scheduler::unblock_threads(ThreadBlocker* blocker)
{
auto state = Processor::get_interrupt_state();
Processor::set_interrupt_state(InterruptState::Disabled);
if (&current_thread() == thread)
do_unblock(blocker);
Processor::broadcast_smp_message({
.type = Processor::SMPMessage::Type::UnblockThread,
.unblock_thread = {
.type = UnblockRequest::Type::ThreadBlocker,
.blocker = blocker
}
});
Processor::set_interrupt_state(state);
}
void Scheduler::unblock_thread(pid_t tid)
{
auto state = Processor::get_interrupt_state();
Processor::set_interrupt_state(InterruptState::Disabled);
if (!do_unblock(tid))
{
yield();
ASSERT_NOT_REACHED();
Processor::broadcast_smp_message({
.type = Processor::SMPMessage::Type::UnblockThread,
.unblock_thread = {
.type = UnblockRequest::Type::ThreadID,
.tid = tid
}
});
}
Processor::set_interrupt_state(state);
}
void Scheduler::set_current_thread_sleeping_impl(Semaphore* semaphore, uint64_t wake_time)
Thread& Scheduler::current_thread()
{
auto state = m_lock.lock();
auto* current = Processor::get_current_thread();
current->semaphore = semaphore;
current->wake_time = wake_time;
current->should_block = true;
m_lock.unlock(InterruptState::Disabled);
yield();
Processor::set_interrupt_state(state);
if (m_current)
return *m_current->thread;
return *m_idle_thread;
}
void Scheduler::set_current_thread_sleeping(uint64_t wake_time)
Thread& Scheduler::idle_thread()
{
set_current_thread_sleeping_impl(nullptr, wake_time);
return *m_idle_thread;
}
void Scheduler::block_current_thread(Semaphore* semaphore, uint64_t wake_time)
pid_t Scheduler::current_tid() const
{
set_current_thread_sleeping_impl(semaphore, wake_time);
return m_current ? m_current->thread->tid() : 0;
}
void Scheduler::unblock_threads(Semaphore* semaphore)
bool Scheduler::is_idle() const
{
SpinLockGuard _(m_lock);
m_blocking_threads.remove_with_condition(m_active_threads, [&](auto* node) { return node->semaphore == semaphore; });
}
void Scheduler::unblock_thread(pid_t tid)
{
SpinLockGuard _(m_lock);
m_blocking_threads.remove_with_condition(m_active_threads, [&](auto* node) { return node->thread->tid() == tid; });
return m_current == nullptr;
}
}

View File

@@ -1,28 +0,0 @@
#include <kernel/Scheduler.h>
#include <kernel/Semaphore.h>
#include <kernel/Timer/Timer.h>
namespace Kernel
{
// Semaphore: thin wrappers around the global scheduler's block/unblock
// interface. Wake times are absolute milliseconds since boot.
void Semaphore::block_indefinite()
{
// ~0 as the wake time: block until explicitly unblocked.
Scheduler::get().block_current_thread(this, ~(uint64_t)0);
}
void Semaphore::block_with_timeout(uint64_t timeout_ms)
{
// Relative timeout converted to an absolute wake time.
Scheduler::get().block_current_thread(this, SystemTimer::get().ms_since_boot() + timeout_ms);
}
void Semaphore::block_with_wake_time(uint64_t wake_time)
{
Scheduler::get().block_current_thread(this, wake_time);
}
void Semaphore::unblock()
{
// Wake every thread currently blocked on this semaphore.
Scheduler::get().unblock_threads(this);
}
}

View File

@@ -8,7 +8,7 @@
namespace Kernel
{
static constexpr uint64_t s_ata_timeout = 1000;
static constexpr uint64_t s_ata_timeout_ms = 1000;
static void start_cmd(volatile HBAPortMemorySpace* port)
{
@@ -118,9 +118,9 @@ namespace Kernel
command.c = 1;
command.command = ATA_COMMAND_IDENTIFY;
uint64_t timeout = SystemTimer::get().ms_since_boot() + s_ata_timeout;
const uint64_t timeout_ms = SystemTimer::get().ms_since_boot() + s_ata_timeout_ms;
while (m_port->tfd & (ATA_STATUS_BSY | ATA_STATUS_DRQ))
if (SystemTimer::get().ms_since_boot() >= timeout)
if (SystemTimer::get().ms_since_boot() >= timeout_ms)
return BAN::Error::from_errno(ETIMEDOUT);
m_port->ci = 1 << slot.value();
@@ -158,17 +158,17 @@ namespace Kernel
{
static constexpr uint64_t poll_timeout_ms = 10;
auto start_time = SystemTimer::get().ms_since_boot();
const auto start_time_ms = SystemTimer::get().ms_since_boot();
while (SystemTimer::get().ms_since_boot() < start_time + poll_timeout_ms)
while (SystemTimer::get().ms_since_boot() < start_time_ms + poll_timeout_ms)
if (!(m_port->ci & (1 << command_slot)))
return {};
// FIXME: This should actually block once semaphores support blocking with timeout.
// FIXME: This should actually block once ThreadBlocker support blocking with timeout.
// This doesn't allow scheduler to go properly idle.
while (SystemTimer::get().ms_since_boot() < start_time + s_ata_timeout)
while (SystemTimer::get().ms_since_boot() < start_time_ms + s_ata_timeout_ms)
{
Scheduler::get().yield();
Processor::yield();
if (!(m_port->ci & (1 << command_slot)))
return {};
}

View File

@@ -67,9 +67,7 @@ namespace Kernel
static void select_delay()
{
auto time = SystemTimer::get().ns_since_boot();
while (SystemTimer::get().ns_since_boot() < time + 400)
continue;
SystemTimer::get().sleep_ns(400);
}
void ATABus::select_device(bool secondary)
@@ -106,7 +104,7 @@ namespace Kernel
io_write(ATA_PORT_CONTROL, ATA_CONTROL_nIEN);
io_write(ATA_PORT_COMMAND, ATA_COMMAND_IDENTIFY);
SystemTimer::get().sleep(1);
SystemTimer::get().sleep_ms(1);
// No device on port
if (io_read(ATA_PORT_STATUS) == 0)
@@ -130,7 +128,7 @@ namespace Kernel
}
io_write(ATA_PORT_COMMAND, ATA_COMMAND_IDENTIFY_PACKET);
SystemTimer::get().sleep(1);
SystemTimer::get().sleep_ms(1);
if (auto res = wait(true); res.is_error())
{

View File

@@ -1,6 +1,6 @@
#include <kernel/Lock/LockGuard.h>
#include <kernel/Scheduler.h>
#include <kernel/Storage/NVMe/Queue.h>
#include <kernel/Thread.h>
#include <kernel/Timer/Timer.h>
namespace Kernel
@@ -44,7 +44,7 @@ namespace Kernel
m_doorbell.cq_head = m_cq_head;
m_semaphore.unblock();
m_thread_blocker.unblock();
}
uint16_t NVMeQueue::submit_command(NVMe::SubmissionQueueEntry& sqe)
@@ -66,15 +66,15 @@ namespace Kernel
m_doorbell.sq_tail = m_sq_tail;
}
const uint64_t start_time = SystemTimer::get().ms_since_boot();
while (!(m_done_mask & cid_mask) && SystemTimer::get().ms_since_boot() < start_time + s_nvme_command_poll_timeout_ms)
const uint64_t start_time_ms = SystemTimer::get().ms_since_boot();
while (!(m_done_mask & cid_mask) && SystemTimer::get().ms_since_boot() < start_time_ms + s_nvme_command_poll_timeout_ms)
continue;
// FIXME: Here is a possible race condition if done mask is set before
// scheduler has put the current thread blocking.
// EINTR should also be handled here.
while (!(m_done_mask & cid_mask) && SystemTimer::get().ms_since_boot() < start_time + s_nvme_command_timeout_ms)
Scheduler::get().block_current_thread(&m_semaphore, start_time + s_nvme_command_timeout_ms);
while (!(m_done_mask & cid_mask) && SystemTimer::get().ms_since_boot() < start_time_ms + s_nvme_command_timeout_ms)
m_thread_blocker.block_with_wake_time_ms(start_time_ms + s_nvme_command_timeout_ms);
if (m_done_mask & cid_mask)
{
@@ -93,7 +93,7 @@ namespace Kernel
while (~m_used_mask == 0)
{
m_lock.unlock(state);
m_semaphore.block_with_timeout(s_nvme_command_timeout_ms);
m_thread_blocker.block_with_timeout_ms(s_nvme_command_timeout_ms);
state = m_lock.lock();
}

View File

@@ -57,7 +57,7 @@ namespace Kernel
if ((flags & TTY_FLAG_ENABLE_INPUT) && !m_tty_ctrl.receive_input)
{
m_tty_ctrl.receive_input = true;
m_tty_ctrl.semaphore.unblock();
m_tty_ctrl.thread_blocker.unblock();
}
if (flags & TTY_FLAG_ENABLE_OUTPUT)
m_tty_ctrl.draw_graphics = true;
@@ -94,7 +94,7 @@ namespace Kernel
while (true)
{
while (!TTY::current()->m_tty_ctrl.receive_input)
TTY::current()->m_tty_ctrl.semaphore.block_indefinite();
TTY::current()->m_tty_ctrl.thread_blocker.block_indefinite();
LibInput::RawKeyEvent event;
size_t read = MUST(inode->read(0, BAN::ByteSpan::from(event)));
@@ -237,7 +237,7 @@ namespace Kernel
if (ch == '\x04' && m_termios.canonical)
{
m_output.flush = true;
m_output.semaphore.unblock();
m_output.thread_blocker.unblock();
return;
}
@@ -279,7 +279,7 @@ namespace Kernel
if (ch == '\n' || !m_termios.canonical)
{
m_output.flush = true;
m_output.semaphore.unblock();
m_output.thread_blocker.unblock();
}
}
@@ -338,7 +338,7 @@ namespace Kernel
while (!m_output.flush)
{
LockFreeGuard _(m_mutex);
TRY(Thread::current().block_or_eintr_indefinite(m_output.semaphore));
TRY(Thread::current().block_or_eintr_indefinite(m_output.thread_blocker));
}
if (m_output.bytes == 0)
@@ -356,7 +356,7 @@ namespace Kernel
if (m_output.bytes == 0)
m_output.flush = false;
m_output.semaphore.unblock();
m_output.thread_blocker.unblock();
return to_copy;
}

View File

@@ -120,7 +120,12 @@ namespace Kernel
Thread& Thread::current()
{
return Scheduler::get().current_thread();
return Processor::scheduler().current_thread();
}
pid_t Thread::current_tid()
{
// Tid of the thread running on the current processor's scheduler.
return Processor::scheduler().current_tid();
}
Process& Thread::process()
@@ -396,36 +401,36 @@ namespace Kernel
{
m_signal_pending_mask |= mask;
if (this != &Thread::current())
Scheduler::get().unblock_thread(tid());
Processor::scheduler().unblock_thread(tid());
return true;
}
return false;
}
// Block on `thread_blocker` with no timeout; returns EINTR if a signal is
// pending either before or after the block (renamed from Semaphore overload).
BAN::ErrorOr<void> Thread::block_or_eintr_indefinite(Semaphore& semaphore)
BAN::ErrorOr<void> Thread::block_or_eintr_indefinite(ThreadBlocker& thread_blocker)
{
if (is_interrupted_by_signal())
return BAN::Error::from_errno(EINTR);
semaphore.block_indefinite();
thread_blocker.block_indefinite();
if (is_interrupted_by_signal())
return BAN::Error::from_errno(EINTR);
return {};
}
// Relative-timeout variant: converts `timeout_ns` to an absolute wake time
// and delegates to the waketime overload below.
BAN::ErrorOr<void> Thread::block_or_eintr_or_timeout(Semaphore& semaphore, uint64_t timeout_ms, bool etimedout)
BAN::ErrorOr<void> Thread::block_or_eintr_or_timeout_ns(ThreadBlocker& thread_blocker, uint64_t timeout_ns, bool etimedout)
{
uint64_t wake_time_ms = SystemTimer::get().ms_since_boot() + timeout_ms;
return block_or_eintr_or_waketime(semaphore, wake_time_ms, etimedout);
const uint64_t wake_time_ns = SystemTimer::get().ns_since_boot() + timeout_ns;
return block_or_eintr_or_waketime_ns(thread_blocker, wake_time_ns, etimedout);
}
// Absolute-waketime variant; `etimedout` selects whether expiry maps to
// ETIMEDOUT or is reported as success.
BAN::ErrorOr<void> Thread::block_or_eintr_or_waketime(Semaphore& semaphore, uint64_t wake_time_ms, bool etimedout)
BAN::ErrorOr<void> Thread::block_or_eintr_or_waketime_ns(ThreadBlocker& thread_blocker, uint64_t wake_time_ns, bool etimedout)
{
if (is_interrupted_by_signal())
return BAN::Error::from_errno(EINTR);
semaphore.block_with_wake_time(wake_time_ms);
// BUG(review): `wake_time_ns` is already absolute, but block_with_timeout_ns()
// treats its argument as relative and adds ns_since_boot() again (see
// ThreadBlocker.cpp), roughly doubling the wait. Should call
// block_with_wake_time_ns(wake_time_ns) instead.
thread_blocker.block_with_timeout_ns(wake_time_ns);
if (is_interrupted_by_signal())
return BAN::Error::from_errno(EINTR);
if (etimedout && SystemTimer::get().ms_since_boot() >= wake_time_ms)
// BUG(review): unit mismatch — ms_since_boot() (milliseconds) is compared
// against wake_time_ns (nanoseconds), so the ETIMEDOUT check is effectively
// off by a factor of 1'000'000. Should compare ns_since_boot() >= wake_time_ns.
if (etimedout && SystemTimer::get().ms_since_boot() >= wake_time_ns)
return BAN::Error::from_errno(ETIMEDOUT);
return {};
}
@@ -444,15 +449,12 @@ namespace Kernel
{
Processor::set_interrupt_state(InterruptState::Disabled);
setup_process_cleanup();
Scheduler::get().yield();
Processor::yield();
ASSERT_NOT_REACHED();
}
else
Scheduler::get().terminate_thread(this);
}
else
{
Scheduler::get().terminate_thread(this);
}
m_state = State::Terminated;
Processor::yield();
ASSERT_NOT_REACHED();
}

View File

@@ -0,0 +1,28 @@
#include <kernel/Processor.h>
#include <kernel/ThreadBlocker.h>
#include <kernel/Timer/Timer.h>
namespace Kernel
{
// ThreadBlocker replaces the old misnamed "Semaphore": threads park on it
// through the current processor's scheduler and are woken by unblock().
// Block the calling thread until unblock(); the all-ones wake time acts as
// the scheduler's "no timeout" sentinel.
void ThreadBlocker::block_indefinite()
{
Processor::scheduler().block_current_thread(this, ~static_cast<uint64_t>(0));
}
// Block with a RELATIVE timeout: converted here to an absolute wake time
// (ns since boot) before handing off to the scheduler. Callers must not
// pass an absolute time to this overload.
void ThreadBlocker::block_with_timeout_ns(uint64_t timeout_ns)
{
Processor::scheduler().block_current_thread(this, SystemTimer::get().ns_since_boot() + timeout_ns);
}
// Block until an ABSOLUTE wake time (ns since boot) or until unblock(),
// whichever comes first.
void ThreadBlocker::block_with_wake_time_ns(uint64_t wake_time_ns)
{
Processor::scheduler().block_current_thread(this, wake_time_ns);
}
// Wake all threads parked on this blocker. NOTE(review): only the local
// processor's scheduler is consulted here; threads blocked on this object
// from another processor are presumably reached via the SMP-event path —
// TODO confirm.
void ThreadBlocker::unblock()
{
Processor::scheduler().unblock_threads(this);
}
}

View File

@@ -4,7 +4,7 @@
#include <kernel/InterruptController.h>
#include <kernel/Memory/PageTable.h>
#include <kernel/MMIO.h>
#include <kernel/Scheduler.h>
#include <kernel/Processor.h>
#include <kernel/Timer/HPET.h>
#define HPET_PERIOD_MAX 0x05F5E100
@@ -272,7 +272,7 @@ namespace Kernel
m_last_ticks = current_ticks;
}
Scheduler::get().timer_reschedule();
Processor::scheduler().timer_interrupt();
}
uint64_t HPET::ms_since_boot() const

View File

@@ -1,7 +1,7 @@
#include <kernel/IDT.h>
#include <kernel/InterruptController.h>
#include <kernel/IO.h>
#include <kernel/Scheduler.h>
#include <kernel/Processor.h>
#include <kernel/Timer/PIT.h>
#define PIT_IRQ 0
@@ -58,7 +58,7 @@ namespace Kernel
SpinLockGuard _(m_lock);
m_system_time++;
}
Kernel::Scheduler::get().timer_reschedule();
Processor::scheduler().timer_interrupt();
}
uint64_t PIT::read_counter() const

View File

@@ -69,15 +69,17 @@ namespace Kernel
return m_timer->time_since_boot();
}
// Sleep the current thread for at least `ns` nanoseconds (renamed from the
// millisecond-based sleep()). A zero duration returns immediately.
void SystemTimer::sleep(uint64_t ms) const
void SystemTimer::sleep_ns(uint64_t ns) const
{
if (ms == 0)
if (ns == 0)
return;
uint64_t wake_time = ms_since_boot() + ms;
Scheduler::get().set_current_thread_sleeping(wake_time);
uint64_t current_time = ms_since_boot();
if (current_time < wake_time)
dwarnln("sleep woke {} ms too soon", wake_time - current_time);
const uint64_t wake_time_ns = ns_since_boot() + ns;
// nullptr blocker: a pure timed sleep — nothing can unblock() us early,
// only the scheduler's wake-time expiry.
Processor::scheduler().block_current_thread(nullptr, wake_time_ns);
// NOTE(review): the early-wake diagnostic below was carried over from the
// old implementation but left disabled — re-enable once wake-time accuracy
// of the new per-processor scheduler is trusted.
//const uint64_t current_time_ns = ns_since_boot();
//if (current_time_ns < wake_time_ns)
// dwarnln("sleep woke {} ms too soon", BAN::Math::div_round_up<uint64_t>(wake_time_ns - current_time_ns, 1'000'000));
}
timespec SystemTimer::real_time() const

View File

@@ -250,7 +250,7 @@ namespace Kernel
bool expected { true };
while (!m_port_changed.compare_exchange(expected, false))
{
m_port_semaphore.block_with_timeout(100);
m_port_thread_blocker.block_with_timeout_ms(100);
expected = true;
}
}
@@ -482,7 +482,7 @@ namespace Kernel
break;
}
m_port_changed = true;
m_port_semaphore.unblock();
m_port_thread_blocker.unblock();
break;
}
case XHCI::TRBType::BandwidthRequestEvent:

View File

@@ -108,7 +108,7 @@ extern "C" void kernel_main(uint32_t boot_magic, uint32_t boot_info)
parse_boot_info(boot_magic, boot_info);
dprintln("boot info parsed");
Processor::create(0);
Processor::create(PROCESSOR_NONE);
Processor::initialize();
dprintln("BSP initialized");
@@ -167,12 +167,11 @@ extern "C" void kernel_main(uint32_t boot_magic, uint32_t boot_info)
Random::initialize();
dprintln("RNG initialized");
MUST(Scheduler::initialize());
dprintln("Scheduler initialized");
Processor::wait_until_processors_ready();
MUST(Processor::scheduler().initialize());
Scheduler& scheduler = Scheduler::get();
Process::create_kernel(init2, nullptr);
scheduler.start();
Processor::yield();
ASSERT_NOT_REACHED();
}
@@ -233,14 +232,11 @@ extern "C" void ap_main()
Processor::initialize();
PageTable::kernel().initial_load();
Processor::allocate_idle_thread();
InterruptController::get().enable();
dprintln("ap{} initialized", Processor::current_id());
Processor::wait_until_processors_ready();
MUST(Processor::scheduler().initialize());
// wait until scheduler is started and we get irq for reschedule
Processor::set_interrupt_state(InterruptState::Enabled);
while (true)
asm volatile("hlt");
asm volatile("sti; 1: hlt; jmp 1b");
ASSERT_NOT_REACHED();
}