From 54a92293da7b812da201d5fe982aa8d5bdffea1b Mon Sep 17 00:00:00 2001 From: Bananymous Date: Sun, 14 Jan 2024 01:39:48 +0200 Subject: [PATCH] Kernel: Implement NVMe driver I'm actually able to boot this OS fine on my own laptop now! --- kernel/CMakeLists.txt | 3 + kernel/include/kernel/Memory/Types.h | 1 + .../include/kernel/Storage/NVMe/Controller.h | 52 +++ .../include/kernel/Storage/NVMe/Definitions.h | 295 +++++++++++++++++ .../include/kernel/Storage/NVMe/Namespace.h | 41 +++ kernel/include/kernel/Storage/NVMe/Queue.h | 37 +++ kernel/kernel/PCI.cpp | 9 + kernel/kernel/Storage/NVMe/Controller.cpp | 310 ++++++++++++++++++ kernel/kernel/Storage/NVMe/Namespace.cpp | 119 +++++++ kernel/kernel/Storage/NVMe/Queue.cpp | 82 +++++ script/qemu.sh | 9 +- 11 files changed, 956 insertions(+), 2 deletions(-) create mode 100644 kernel/include/kernel/Storage/NVMe/Controller.h create mode 100644 kernel/include/kernel/Storage/NVMe/Definitions.h create mode 100644 kernel/include/kernel/Storage/NVMe/Namespace.h create mode 100644 kernel/include/kernel/Storage/NVMe/Queue.h create mode 100644 kernel/kernel/Storage/NVMe/Controller.cpp create mode 100644 kernel/kernel/Storage/NVMe/Namespace.cpp create mode 100644 kernel/kernel/Storage/NVMe/Queue.cpp diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d6b7f3aee4..c91ed7695a 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -65,6 +65,9 @@ set(KERNEL_SOURCES kernel/Storage/ATA/ATAController.cpp kernel/Storage/ATA/ATADevice.cpp kernel/Storage/DiskCache.cpp + kernel/Storage/NVMe/Controller.cpp + kernel/Storage/NVMe/Namespace.cpp + kernel/Storage/NVMe/Queue.cpp kernel/Storage/Partition.cpp kernel/Storage/StorageDevice.cpp kernel/Syscall.cpp diff --git a/kernel/include/kernel/Memory/Types.h b/kernel/include/kernel/Memory/Types.h index 279ae7caab..ad883dfcef 100644 --- a/kernel/include/kernel/Memory/Types.h +++ b/kernel/include/kernel/Memory/Types.h @@ -11,6 +11,7 @@ #endif #define PAGE_SIZE ((uintptr_t)4096) 
+#define PAGE_SIZE_SHIFT 12 #define PAGE_ADDR_MASK (~(uintptr_t)0xFFF) namespace Kernel diff --git a/kernel/include/kernel/Storage/NVMe/Controller.h b/kernel/include/kernel/Storage/NVMe/Controller.h new file mode 100644 index 0000000000..6152799a35 --- /dev/null +++ b/kernel/include/kernel/Storage/NVMe/Controller.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace Kernel +{ + + class NVMeController final : public StorageController, public CharacterDevice + { + BAN_NON_COPYABLE(NVMeController); + BAN_NON_MOVABLE(NVMeController); + + public: + static BAN::ErrorOr> create(PCI::Device&); + ~NVMeController() { ASSERT_NOT_REACHED(); } + + NVMeQueue& io_queue() { return *m_io_queue; } + + virtual dev_t rdev() const override { return m_rdev; } + virtual BAN::StringView name() const override { return m_name; } + + private: + NVMeController(PCI::Device& pci_device); + virtual BAN::ErrorOr initialize() override; + + BAN::ErrorOr identify_controller(); + BAN::ErrorOr identify_namespaces(); + + BAN::ErrorOr wait_until_ready(bool expected_value); + BAN::ErrorOr create_admin_queue(); + BAN::ErrorOr create_io_queue(); + + private: + PCI::Device& m_pci_device; + BAN::UniqPtr m_bar0; + volatile NVMe::ControllerRegisters* m_controller_registers; + + BAN::UniqPtr m_admin_queue; + BAN::UniqPtr m_io_queue; + + BAN::Vector> m_namespaces; + + char m_name[20]; + const dev_t m_rdev; + }; + +} diff --git a/kernel/include/kernel/Storage/NVMe/Definitions.h b/kernel/include/kernel/Storage/NVMe/Definitions.h new file mode 100644 index 0000000000..28151d6dc6 --- /dev/null +++ b/kernel/include/kernel/Storage/NVMe/Definitions.h @@ -0,0 +1,295 @@ +#pragma once + +#include + +namespace Kernel::NVMe +{ + + struct CAP + { + uint64_t mqes : 16; + uint64_t cqr : 1; + uint64_t ams : 2; + uint64_t __reserved0 : 5; + uint64_t to : 8; + uint64_t dstrd : 4; + uint64_t nssrs : 1; + uint64_t css : 8; + uint64_t bps : 1; + uint64_t cps : 2; + uint64_t 
mpsmin : 4; + uint64_t mpsmax : 4; + uint64_t pmrs : 1; + uint64_t cmpbs : 1; + uint64_t nsss : 1; + uint64_t crms : 2; + uint64_t __reserved1 : 3; + }; + static_assert(sizeof(CAP) == sizeof(uint64_t)); + + enum CAP_CSS + { + CAP_CSS_NVME = 1 << 0, + CAP_CSS_IO = 1 << 6, + CAP_CSS_ADMIN = 1 << 7, + }; + + struct VS + { + uint32_t tertiary : 8; + uint32_t minor : 8; + uint32_t major : 16; + }; + static_assert(sizeof(VS) == sizeof(uint32_t)); + + struct CC + { + uint32_t en : 1; + uint32_t __reserved0 : 3; + uint32_t css : 3; + uint32_t mps : 4; + uint32_t ams : 3; + uint32_t shn : 2; + uint32_t iosqes : 4; + uint32_t iocqes : 4; + uint32_t crime : 1; + uint32_t __reserved1 : 7; + }; + static_assert(sizeof(CC) == sizeof(uint32_t)); + + struct CSTS + { + uint32_t rdy : 1; + uint32_t cfs : 1; + uint32_t shts : 2; + uint32_t nssro : 1; + uint32_t pp : 1; + uint32_t st : 1; + uint32_t __reserved : 25; + }; + static_assert(sizeof(CSTS) == sizeof(uint32_t)); + + struct AQA + { + uint32_t asqs : 12; + uint32_t __reserved0 : 4; + uint32_t acqs : 12; + uint32_t __reserved1 : 4; + }; + static_assert(sizeof(AQA) == sizeof(uint32_t)); + + // BAR0 + struct ControllerRegisters + { + CAP cap; + VS vs; + uint32_t intms; + uint32_t intmc; + CC cc; + uint8_t __reserved0[4]; + CSTS csts; + uint32_t nssr; + AQA aqa; + uint64_t asq; + uint64_t acq; + + static constexpr uint32_t SQ0TDBL = 0x1000; + }; + static_assert(sizeof(ControllerRegisters) == 0x38); + + struct DoorbellRegisters + { + uint32_t sq_tail; + uint32_t cq_head; + } __attribute__((packed)); + + struct CompletionQueueEntry + { + uint32_t dontcare[3]; + uint16_t cid; + uint16_t sts; + } __attribute__((packed)); + static_assert(sizeof(CompletionQueueEntry) == 16); + + struct DataPtr + { + union + { + struct + { + uint64_t prp1; + uint64_t prp2; + }; + uint8_t sgl1[16]; + }; + }; + + struct CommandGeneric + { + uint32_t nsid; + uint32_t cdw2; + uint32_t cdw3; + uint64_t mptr; + DataPtr dptr; + uint32_t cdw10; + uint32_t cdw11; + 
uint32_t cdw12; + uint32_t cdw13; + uint32_t cdw14; + uint32_t cdw15; + } __attribute__((packed)); + static_assert(sizeof(CommandGeneric) == 15 * sizeof(uint32_t)); + + struct CommandIdentify + { + uint32_t nsid; + uint64_t __reserved0[2]; + DataPtr dptr; + // dword 10 + uint8_t cns; + uint8_t __reserved1; + uint16_t cntid; + // dword 11 + uint16_t cnsid; + uint8_t __reserved2; + uint8_t csi; + // dword 12-15 + uint32_t __reserved3[4]; + } __attribute__((packed)); + static_assert(sizeof(CommandIdentify) == 15 * sizeof(uint32_t)); + + struct CommandCreateCQ + { + uint32_t __reserved0; + uint64_t __reserved1[2]; + DataPtr dptr; + // dword 10 + uint16_t qid; + uint16_t qsize; + // dword 11 + uint16_t pc : 1; + uint16_t ien : 1; + uint16_t __reserved2 : 14; + uint16_t iv; + // dword 12-15 + uint32_t __reserved4[4]; + } __attribute__((packed)); + static_assert(sizeof(CommandCreateCQ) == 15 * sizeof(uint32_t)); + + struct CommandCreateSQ + { + uint32_t __reserved0; + uint64_t __reserved1[2]; + DataPtr dptr; + // dword 10 + uint16_t qid; + uint16_t qsize; + // dword 11 + uint16_t pc : 1; + uint16_t qprio : 2; + uint16_t __reserved2 : 13; + uint16_t cqid; + // dword 12 + uint16_t nvmsetid; + uint16_t __reserved4; + // dword 13-15 + uint32_t __reserved5[3]; + } __attribute__((packed)); + static_assert(sizeof(CommandCreateSQ) == 15 * sizeof(uint32_t)); + + + struct CommandRead + { + uint32_t nsid; + uint64_t __reserved0; + uint64_t mptr; + DataPtr dptr; + // dword 10-11 + uint64_t slba; + // dword 12 + uint16_t nlb; + uint16_t __reserved1; + // dword 13-15 + uint32_t __reserved2[3]; + } __attribute__((packed)); + static_assert(sizeof(CommandRead) == 15 * sizeof(uint32_t)); + + struct SubmissionQueueEntry + { + uint8_t opc; + uint8_t fuse : 2; + uint8_t __reserved : 4; + uint8_t psdt : 2; + uint16_t cid; + union + { + CommandGeneric generic; + CommandIdentify identify; + CommandCreateCQ create_cq; + CommandCreateSQ create_sq; + CommandRead read; + }; + } 
__attribute__((packed)); + static_assert(sizeof(SubmissionQueueEntry) == 64); + + enum OPC : uint8_t + { + OPC_ADMIN_CREATE_SQ = 0x01, + OPC_ADMIN_CREATE_CQ = 0x05, + OPC_ADMIN_IDENTIFY = 0x06, + OPC_IO_WRITE = 0x01, + OPC_IO_READ = 0x02, + }; + + enum CNS : uint8_t + { + CNS_INDENTIFY_NAMESPACE = 0x00, + CNS_INDENTIFY_CONTROLLER = 0x01, + CNS_INDENTIFY_ACTIVE_NAMESPACES = 0x02, + }; + + struct NamespaceIdentify + { + uint64_t nsze; + uint64_t ncap; + uint64_t nuse; + uint8_t nsfeat; + uint8_t nlbaf; + uint8_t flbas; + uint8_t mc; + uint8_t dpc; + uint8_t dps; + uint8_t nmic; + uint8_t rescap; + uint8_t fpi; + uint8_t dlfeat; + uint16_t nawun; + uint16_t nawupf; + uint16_t nacwu; + uint16_t nabsn; + uint16_t nabo; + uint16_t nabspf; + uint16_t noiob; + uint64_t nvmcap[2]; + uint16_t npwg; + uint16_t npwa; + uint16_t npdg; + uint16_t npda; + uint16_t nows; + uint16_t mssrl; + uint32_t mcl; + uint8_t msrc; + uint8_t __reserved0[11]; + uint32_t adagrpid; + uint8_t __reserved1[3]; + uint8_t nsattr; + uint16_t nvmsetid; + uint16_t endgid; + uint64_t nguid[2]; + uint64_t eui64; + uint32_t lbafN[64]; + uint8_t vendor_specific[3712]; + } __attribute__((packed)); + static_assert(sizeof(NamespaceIdentify) == 0x1000); + +} \ No newline at end of file diff --git a/kernel/include/kernel/Storage/NVMe/Namespace.h b/kernel/include/kernel/Storage/NVMe/Namespace.h new file mode 100644 index 0000000000..635477ed96 --- /dev/null +++ b/kernel/include/kernel/Storage/NVMe/Namespace.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +namespace Kernel +{ + + class NVMeController; + + class NVMeNamespace : public StorageDevice + { + public: + static BAN::ErrorOr> create(NVMeController&, uint32_t nsid, uint64_t block_count, uint32_t block_size); + + virtual uint32_t sector_size() const override { return m_block_size; } + virtual uint64_t total_size() const override { return m_block_size * m_block_count; } + + virtual dev_t rdev() const override { return m_rdev; } + virtual 
BAN::StringView name() const { return m_name; } + + private: + NVMeNamespace(NVMeController&, uint32_t nsid, uint64_t block_count, uint32_t block_size); + BAN::ErrorOr initialize(); + + virtual BAN::ErrorOr read_sectors_impl(uint64_t lba, uint64_t sector_count, BAN::ByteSpan) override; + virtual BAN::ErrorOr write_sectors_impl(uint64_t lba, uint64_t sector_count, BAN::ConstByteSpan) override; + + private: + NVMeController& m_controller; + BAN::UniqPtr m_dma_region; + + const uint32_t m_nsid; + const uint32_t m_block_size; + const uint64_t m_block_count; + + char m_name[10] {}; + const dev_t m_rdev; + }; + +} diff --git a/kernel/include/kernel/Storage/NVMe/Queue.h b/kernel/include/kernel/Storage/NVMe/Queue.h new file mode 100644 index 0000000000..357a3a0af8 --- /dev/null +++ b/kernel/include/kernel/Storage/NVMe/Queue.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace Kernel +{ + + class NVMeQueue : public Interruptable + { + public: + NVMeQueue(BAN::UniqPtr&& cq, BAN::UniqPtr&& sq, volatile NVMe::DoorbellRegisters& db, uint32_t qdepth, uint8_t irq); + + uint16_t submit_command(NVMe::SubmissionQueueEntry& sqe); + + virtual void handle_irq() final override; + + private: + SpinLock m_lock; + BAN::UniqPtr m_completion_queue; + BAN::UniqPtr m_submission_queue; + volatile NVMe::DoorbellRegisters& m_doorbell; + const uint32_t m_qdepth; + uint32_t m_sq_tail { 0 }; + uint32_t m_cq_head { 0 }; + uint16_t m_cq_valid_phase { 1 }; + + Semaphore m_semaphore; + volatile uint16_t m_status; + volatile bool m_done { false }; + }; + +} diff --git a/kernel/kernel/PCI.cpp b/kernel/kernel/PCI.cpp index 5b50d1108a..f93d9425aa 100644 --- a/kernel/kernel/PCI.cpp +++ b/kernel/kernel/PCI.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #define INVALID_VENDOR 0xFFFF #define MULTI_FUNCTION 0x80 @@ -170,6 +171,14 @@ namespace Kernel::PCI if (auto res = ATAController::create(pci_device); res.is_error()) dprintln("ATA: {}", 
res.error()); break; + case 0x08: + // FIXME: HACK if inode initialization fails before it attaches to DevFS, + // it will kernel panic. This is used to make nvme eternal + if (auto res = NVMeController::create(pci_device); res.is_error()) + dprintln("NVMe: {}", res.error()); + else + res.value()->ref(); + break; default: dprintln("unsupported storage device (pci {2H}.{2H}.{2H})", pci_device.class_code(), pci_device.subclass(), pci_device.prog_if()); break; diff --git a/kernel/kernel/Storage/NVMe/Controller.cpp b/kernel/kernel/Storage/NVMe/Controller.cpp new file mode 100644 index 0000000000..de995bed25 --- /dev/null +++ b/kernel/kernel/Storage/NVMe/Controller.cpp @@ -0,0 +1,310 @@ +#include +#include +#include +#include +#include + +#include + +#define DEBUG_NVMe 1 + +namespace Kernel +{ + + static dev_t get_ctrl_dev_major() + { + static dev_t major = DevFileSystem::get().get_next_dev(); + return major; + } + + static dev_t get_ctrl_dev_minor() + { + static dev_t minor = 0; + return minor++; + } + + BAN::ErrorOr> NVMeController::create(PCI::Device& pci_device) + { + auto* controller_ptr = new NVMeController(pci_device); + if (controller_ptr == nullptr) + return BAN::Error::from_errno(ENOMEM); + auto controller = BAN::RefPtr::adopt(controller_ptr); + TRY(controller->initialize()); + return controller; + } + + NVMeController::NVMeController(PCI::Device& pci_device) + : CharacterDevice(0600, 0, 0) + , m_pci_device(pci_device) + , m_rdev(makedev(get_ctrl_dev_major(), get_ctrl_dev_minor())) + { + ASSERT(minor(m_rdev) < 10); + strcpy(m_name, "nvmeX"); + m_name[4] = '0' + minor(m_rdev); + } + + BAN::ErrorOr NVMeController::initialize() + { + // See NVM express base specification section 3.5.1 + m_pci_device.enable_bus_mastering(); + m_pci_device.enable_memory_space(); + + m_bar0 = TRY(m_pci_device.allocate_bar_region(0)); + if (m_bar0->type() != PCI::BarType::MEM) + { + dwarnln("NVMe controller BAR0 is not MEM"); + return BAN::Error::from_errno(EINVAL); + } + if 
(m_bar0->size() < 0x1000) + { + dwarnln("NVMe controller BAR0 is too small {} bytes", m_bar0->size()); + return BAN::Error::from_errno(EINVAL); + } + + m_controller_registers = reinterpret_cast(m_bar0->vaddr()); + + const auto& vs = m_controller_registers->vs; + if (vs.major != 1) + { + dwarnln("NVMe controller has unsupported version {}.{}", (uint16_t)vs.major, (uint8_t)vs.minor); + return BAN::Error::from_errno(ENOTSUP); + } + + dprintln_if(DEBUG_NVMe, "NVMe controller"); + dprintln_if(DEBUG_NVMe, " version: {}.{}", (uint16_t)vs.major, (uint8_t)vs.minor); + + auto& cap = m_controller_registers->cap; + if (!(cap.css & NVMe::CAP_CSS_NVME)) + { + dwarnln("NVMe controller does not support NVMe command set"); + return BAN::Error::from_errno(ECANCELED); + } + + const uint64_t min_page_size = 1ull << (12 + cap.mpsmin); + const uint64_t max_page_size = 1ull << (12 + cap.mpsmax); + if (PAGE_SIZE < min_page_size || PAGE_SIZE > max_page_size) + { + dwarnln("NVMe controller does not support {} byte pages, only {}-{} byte pages are supported", PAGE_SIZE, min_page_size, max_page_size); + return BAN::Error::from_errno(ECANCELED); + } + + // One for aq and one for ioq + TRY(m_pci_device.reserve_irqs(2)); + + auto& cc = m_controller_registers->cc; + + if (cc.en) + TRY(wait_until_ready(true)); + cc.en = 0; + TRY(wait_until_ready(false)); + dprintln_if(DEBUG_NVMe, " controller reset"); + + TRY(create_admin_queue()); + dprintln_if(DEBUG_NVMe, " created admin queue"); + + // Configure controller + cc.ams = 0; + cc.mps = PAGE_SIZE_SHIFT - 12; + cc.css = 0b000; + + cc.en = 1; + TRY(wait_until_ready(true)); + dprintln_if(DEBUG_NVMe, " controller enabled"); + + TRY(identify_controller()); + + cc.iocqes = 4; static_assert(1 << 4 == sizeof(NVMe::CompletionQueueEntry)); + cc.iosqes = 6; static_assert(1 << 6 == sizeof(NVMe::SubmissionQueueEntry)); + TRY(create_io_queue()); + dprintln_if(DEBUG_NVMe, " created io queue"); + + TRY(identify_namespaces()); + + 
DevFileSystem::get().add_device(this); + + return {}; + } + + BAN::ErrorOr NVMeController::wait_until_ready(bool expected_value) + { + const auto& cap = m_controller_registers->cap; + const auto& csts = m_controller_registers->csts; + + uint64_t timeout = SystemTimer::get().ms_since_boot() + 500 * cap.to; + while (csts.rdy != expected_value) + { + if (SystemTimer::get().ms_since_boot() >= timeout) + { + dwarnln("NVMe controller reset timedout"); + return BAN::Error::from_errno(ETIMEDOUT); + } + } + + return {}; + } + + BAN::ErrorOr NVMeController::identify_controller() + { + auto dma_page = TRY(DMARegion::create(PAGE_SIZE)); + + NVMe::SubmissionQueueEntry sqe {}; + sqe.opc = NVMe::OPC_ADMIN_IDENTIFY; + sqe.identify.dptr.prp1 = dma_page->paddr(); + sqe.identify.cns = NVMe::CNS_INDENTIFY_CONTROLLER; + if (uint16_t status = m_admin_queue->submit_command(sqe)) + { + dwarnln("NVMe controller identify failed (status {4H})", status); + return BAN::Error::from_errno(EFAULT); + } + + if (*reinterpret_cast(dma_page->vaddr()) != m_pci_device.vendor_id()) + { + dwarnln("NVMe controller vendor id does not match with the one in PCI"); + return BAN::Error::from_errno(EFAULT); + } + + dprintln_if(DEBUG_NVMe, " model: '{}'", BAN::StringView { (char*)dma_page->vaddr() + 24, 20 }); + + return {}; + } + + BAN::ErrorOr NVMeController::identify_namespaces() + { + auto dma_page = TRY(DMARegion::create(PAGE_SIZE)); + + BAN::Vector namespace_ids; + TRY(namespace_ids.resize(PAGE_SIZE / sizeof(uint32_t))); + + { + NVMe::SubmissionQueueEntry sqe {}; + sqe.opc = NVMe::OPC_ADMIN_IDENTIFY; + sqe.identify.dptr.prp1 = dma_page->paddr(); + sqe.identify.cns = NVMe::CNS_INDENTIFY_ACTIVE_NAMESPACES; + if (uint16_t status = m_admin_queue->submit_command(sqe)) + { + dwarnln("NVMe active namespace identify failed (status {4H})", status); + return BAN::Error::from_errno(EFAULT); + } + memcpy(namespace_ids.data(), reinterpret_cast(dma_page->vaddr()), PAGE_SIZE); + } + + for (uint32_t nsid : namespace_ids) 
+ { + if (nsid == 0) + break; + dprintln(" found namespace {}", nsid); + + NVMe::SubmissionQueueEntry sqe {}; + sqe.opc = NVMe::OPC_ADMIN_IDENTIFY; + sqe.identify.nsid = nsid; + sqe.identify.dptr.prp1 = dma_page->paddr(); + sqe.identify.cns = NVMe::CNS_INDENTIFY_NAMESPACE; + if (uint16_t status = m_admin_queue->submit_command(sqe)) + { + dwarnln("NVMe namespace {} identify failed (status {4H})", nsid , status); + return BAN::Error::from_errno(EFAULT); + } + + auto& namespace_info = *reinterpret_cast(dma_page->vaddr()); + + const uint64_t block_count = namespace_info.nsze; + + const uint64_t format = namespace_info.lbafN[namespace_info.flbas & 0x0F]; + const uint64_t block_size = 1u << ((format >> 16) & 0xFF); + + dprintln(" block count {}", block_count); + dprintln(" block size {} B", block_size); + dprintln(" total {} MiB", block_count * block_size / (1 << 20)); + + auto ns = TRY(NVMeNamespace::create(*this, nsid, block_count, block_size)); + TRY(m_namespaces.push_back(BAN::move(ns))); + } + + return {}; + } + + BAN::ErrorOr NVMeController::create_admin_queue() + { + const uint32_t admin_queue_depth = BAN::Math::min(PAGE_SIZE / sizeof(NVMe::CompletionQueueEntry), PAGE_SIZE / sizeof(NVMe::SubmissionQueueEntry)); + auto& aqa = m_controller_registers->aqa; + aqa.acqs = admin_queue_depth - 1; + aqa.asqs = admin_queue_depth - 1; + dprintln_if(DEBUG_NVMe, " admin queue depth is {}", admin_queue_depth); + + const uint32_t completion_queue_size = admin_queue_depth * sizeof(NVMe::CompletionQueueEntry); + auto completion_queue = TRY(DMARegion::create(completion_queue_size)); + memset((void*)completion_queue->vaddr(), 0x00, completion_queue->size()); + + const uint32_t submission_queue_size = admin_queue_depth * sizeof(NVMe::SubmissionQueueEntry); + auto submission_queue = TRY(DMARegion::create(submission_queue_size)); + memset((void*)submission_queue->vaddr(), 0x00, submission_queue->size()); + + m_controller_registers->acq = completion_queue->paddr(); + 
m_controller_registers->asq = submission_queue->paddr(); + + uint8_t irq = m_pci_device.get_irq(0); + dprintln_if(DEBUG_NVMe, " admin queue using irq {}", irq); + + auto& doorbell = *reinterpret_cast(m_bar0->vaddr() + NVMe::ControllerRegisters::SQ0TDBL); + + m_admin_queue = TRY(BAN::UniqPtr::create(BAN::move(completion_queue), BAN::move(submission_queue), doorbell, admin_queue_depth, irq)); + + return {}; + } + + BAN::ErrorOr NVMeController::create_io_queue() + { + constexpr uint32_t queue_size = PAGE_SIZE; + constexpr uint32_t queue_elems = queue_size / BAN::Math::max(sizeof(NVMe::CompletionQueueEntry), sizeof(NVMe::SubmissionQueueEntry)); + auto completion_queue = TRY(DMARegion::create(queue_size)); + memset((void*)completion_queue->vaddr(), 0x00, completion_queue->size()); + + auto submission_queue = TRY(DMARegion::create(queue_size)); + memset((void*)submission_queue->vaddr(), 0x00, submission_queue->size()); + + { + NVMe::SubmissionQueueEntry sqe {}; + sqe.opc = NVMe::OPC_ADMIN_CREATE_CQ; + sqe.create_cq.dptr.prp1 = completion_queue->paddr(); + sqe.create_cq.qsize = queue_elems - 1; + sqe.create_cq.qid = 1; + sqe.create_cq.iv = 1; + sqe.create_cq.ien = 1; + sqe.create_cq.pc = 1; + if (uint16_t status = m_admin_queue->submit_command(sqe)) + { + dwarnln("NVMe io completion queue creation failed (status {4H})", status); + return BAN::Error::from_errno(EFAULT); + } + } + + { + NVMe::SubmissionQueueEntry sqe {}; + sqe.opc = NVMe::OPC_ADMIN_CREATE_SQ; + sqe.create_sq.dptr.prp1 = submission_queue->paddr(); + sqe.create_sq.qsize = queue_elems - 1; + sqe.create_sq.qid = 1; + sqe.create_sq.cqid = 1; + sqe.create_sq.qprio = 0; + sqe.create_sq.pc = 1; + sqe.create_sq.nvmsetid = 0; + if (uint16_t status = m_admin_queue->submit_command(sqe)) + { + dwarnln("NVMe io submission queue creation failed (status {4H})", status); + return BAN::Error::from_errno(EFAULT); + } + } + + uint8_t irq = m_pci_device.get_irq(1); + dprintln_if(DEBUG_NVMe, " io queue using irq {}", irq); + + 
const uint32_t doorbell_stride = 1 << (2 + m_controller_registers->cap.dstrd); + const uint32_t doorbell_offset = 2 * doorbell_stride; + auto& doorbell = *reinterpret_cast(m_bar0->vaddr() + NVMe::ControllerRegisters::SQ0TDBL + doorbell_offset); + + m_io_queue = TRY(BAN::UniqPtr::create(BAN::move(completion_queue), BAN::move(submission_queue), doorbell, queue_elems, irq)); + + return {}; + } + +} diff --git a/kernel/kernel/Storage/NVMe/Namespace.cpp b/kernel/kernel/Storage/NVMe/Namespace.cpp new file mode 100644 index 0000000000..1c4bbe795b --- /dev/null +++ b/kernel/kernel/Storage/NVMe/Namespace.cpp @@ -0,0 +1,119 @@ +#include +#include +#include + +#include + +namespace Kernel +{ + + static dev_t get_ns_dev_major() + { + static dev_t major = DevFileSystem::get().get_next_dev(); + return major; + } + + static dev_t get_ns_dev_minor() + { + static dev_t minor = 0; + return minor++; + } + + BAN::ErrorOr> NVMeNamespace::create(NVMeController& controller, uint32_t nsid, uint64_t block_count, uint32_t block_size) + { + auto* namespace_ptr = new NVMeNamespace(controller, nsid, block_count, block_size); + if (namespace_ptr == nullptr) + return BAN::Error::from_errno(ENOMEM); + auto ns = BAN::RefPtr::adopt(namespace_ptr); + TRY(ns->initialize()); + return ns; + } + + NVMeNamespace::NVMeNamespace(NVMeController& controller, uint32_t nsid, uint64_t block_count, uint32_t block_size) + : m_controller(controller) + , m_nsid(nsid) + , m_block_size(block_size) + , m_block_count(block_count) + , m_rdev(makedev(get_ns_dev_major(), get_ns_dev_minor())) + { + ASSERT(minor(m_rdev) < 10); + ASSERT(m_controller.name().size() + 2 < sizeof(m_name)); + memcpy(m_name, m_controller.name().data(), m_controller.name().size()); + m_name[m_controller.name().size() + 0] = 'n'; + m_name[m_controller.name().size() + 1] = '1' + minor(m_rdev); + m_name[m_controller.name().size() + 2] = '\0'; + } + + BAN::ErrorOr NVMeNamespace::initialize() + { + m_dma_region = TRY(DMARegion::create(PAGE_SIZE)); + + 
add_disk_cache(); + + DevFileSystem::get().add_device(this); + + char name_prefix[20]; + strcpy(name_prefix, m_name); + strcat(name_prefix, "p"); + if (auto res = initialize_partitions(name_prefix); res.is_error()) + dprintln("{}", res.error()); + + return {}; + } + + BAN::ErrorOr NVMeNamespace::read_sectors_impl(uint64_t lba, uint64_t sector_count, BAN::ByteSpan buffer) + { + ASSERT(buffer.size() >= sector_count * m_block_size); + + for (uint64_t i = 0; i < sector_count;) + { + uint16_t count = BAN::Math::min(sector_count - i, m_dma_region->size() / m_block_size); + + NVMe::SubmissionQueueEntry sqe {}; + sqe.opc = NVMe::OPC_IO_READ; + sqe.read.nsid = m_nsid; + sqe.read.dptr.prp1 = m_dma_region->paddr(); + sqe.read.slba = lba + i; + sqe.read.nlb = count - 1; + if (uint16_t status = m_controller.io_queue().submit_command(sqe)) + { + dwarnln("NVMe read failed (status {4H})", status); + return BAN::Error::from_errno(EIO); + } + memcpy(buffer.data() + i * m_block_size, reinterpret_cast(m_dma_region->vaddr()), count * m_block_size); + + i += count; + } + + return {}; + } + + BAN::ErrorOr NVMeNamespace::write_sectors_impl(uint64_t lba, uint64_t sector_count, BAN::ConstByteSpan buffer) + { + ASSERT(buffer.size() >= sector_count * m_block_size); + + for (uint64_t i = 0; i < sector_count;) + { + uint16_t count = BAN::Math::min(sector_count - i, m_dma_region->size() / m_block_size); + + memcpy(reinterpret_cast(m_dma_region->vaddr()), buffer.data() + i * m_block_size, count * m_block_size); + + NVMe::SubmissionQueueEntry sqe {}; + sqe.opc = NVMe::OPC_IO_WRITE; + sqe.read.nsid = m_nsid; + sqe.read.dptr.prp1 = m_dma_region->paddr(); + sqe.read.slba = lba + i; + sqe.read.nlb = count - 1; + if (uint16_t status = m_controller.io_queue().submit_command(sqe)) + { + dwarnln("NVMe write failed (status {4H})", status); + return BAN::Error::from_errno(EIO); + } + + i += count; + } + + return {}; + } + +} diff --git a/kernel/kernel/Storage/NVMe/Queue.cpp 
b/kernel/kernel/Storage/NVMe/Queue.cpp new file mode 100644 index 0000000000..995f3485c0 --- /dev/null +++ b/kernel/kernel/Storage/NVMe/Queue.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include + +namespace Kernel +{ + + static constexpr uint64_t s_nvme_command_timeout_ms = 1000; + static constexpr uint64_t s_nvme_command_poll_timeout_ms = 20; + + NVMeQueue::NVMeQueue(BAN::UniqPtr&& cq, BAN::UniqPtr&& sq, volatile NVMe::DoorbellRegisters& db, uint32_t qdepth, uint8_t irq) + : m_completion_queue(BAN::move(cq)) + , m_submission_queue(BAN::move(sq)) + , m_doorbell(db) + , m_qdepth(qdepth) + { + set_irq(irq); + enable_interrupt(); + } + + void NVMeQueue::handle_irq() + { + auto* cq_ptr = reinterpret_cast(m_completion_queue->vaddr()); + + while ((cq_ptr[m_cq_head].sts & 1) == m_cq_valid_phase) + { + uint16_t sts = cq_ptr[m_cq_head].sts >> 1; + uint16_t cid = cq_ptr[m_cq_head].cid; + ASSERT(cid == 0); + + ASSERT(!m_done); + m_status = sts; + m_done = true; + m_semaphore.unblock(); + + m_cq_head = (m_cq_head + 1) % m_qdepth; + if (m_cq_head == 0) + m_cq_valid_phase ^= 1; + } + + m_doorbell.cq_head = m_cq_head; + } + + uint16_t NVMeQueue::submit_command(NVMe::SubmissionQueueEntry& sqe) + { + LockGuard _(m_lock); + + ASSERT(m_done == false); + m_status = 0; + + sqe.cid = 0; + + auto* sqe_ptr = reinterpret_cast(m_submission_queue->vaddr()); + memcpy(&sqe_ptr[m_sq_tail], &sqe, sizeof(NVMe::SubmissionQueueEntry)); + m_sq_tail = (m_sq_tail + 1) % m_qdepth; + m_doorbell.sq_tail = m_sq_tail; + + const uint64_t start_time = SystemTimer::get().ms_since_boot(); + while (SystemTimer::get().ms_since_boot() < start_time + s_nvme_command_poll_timeout_ms) + { + if (!m_done) + continue; + m_done = false; + return m_status; + } + + while (SystemTimer::get().ms_since_boot() < start_time + s_nvme_command_timeout_ms) + { + if (!m_done) + { + m_semaphore.block(); + continue; + } + m_done = false; + return m_status; + } + + return 0xFFFF; + } + +} diff --git a/script/qemu.sh 
b/script/qemu.sh index 8683dbcd5c..38d34da87f 100755 --- a/script/qemu.sh +++ b/script/qemu.sh @@ -13,11 +13,16 @@ if (($BANAN_UEFI_BOOT)); then BIOS_ARGS="-bios $OVMF_PATH -net none" fi +if [[ $BANAN_DISK_TYPE == "NVME" ]]; then + DISK_ARGS="-device nvme,serial=deadbeef,drive=disk" +else + DISK_ARGS="-device ahci,id=ahci -device ide-hd,drive=disk,bus=ahci.0" +fi + qemu-system-$BANAN_ARCH \ -m 128 \ -smp 2 \ $BIOS_ARGS \ -drive format=raw,id=disk,file=${BANAN_DISK_IMAGE_PATH},if=none \ - -device ahci,id=ahci \ - -device ide-hd,drive=disk,bus=ahci.0 \ + $DISK_ARGS \ $@ \