From a4f30b4ce52dee21655b48d09f6b5e8f7801ce1c Mon Sep 17 00:00:00 2001
From: Russell Joyce
Date: Wed, 11 Apr 2018 19:50:50 +0100
Subject: [PATCH] Modified UNVMe driver to use UIO device and no-IOMMU VFIO

This is the main transition away from an IOMMU-based driver to one that
works with a statically mapped buffer defined by a UIO device, alongside
VFIO running in no-IOMMU mode for access to PCIe. This change allows the
driver to run on embedded devices without an IOMMU, such as the Xilinx
Zynq UltraScale+ MPSoC, while keeping the flexibility to allocate a chunk
of memory to UNVMe through the device tree.

The implementation is currently quite restricted: it requires a UIO
driver to be loaded at /dev/uio0, with its first memory map being a
1 GiB region of DMA-capable memory at address 0x40000000.
---
 Makefile.def     |   4 --
 src/unvme_core.c |  11 ++--
 src/unvme_vfio.c | 128 ++++++++++++++++-------------------------------
 src/unvme_vfio.h |   3 ++
 4 files changed, 53 insertions(+), 93 deletions(-)

diff --git a/Makefile.def b/Makefile.def
index 1beb4eb..439ea62 100644
--- a/Makefile.def
+++ b/Makefile.def
@@ -39,10 +39,6 @@ CPPFLAGS+=-D_GNU_SOURCE
 
 # To turn on debug message logging
 #CPPFLAGS+=-DUNVME_DEBUG
 
-# To use identity map for DMA address # for faster submission time
-# (but may cause DMA overlapping for applications with high memory usage)
-CPPFLAGS+=-DUNVME_IDENTITY_MAP_DMA
-
 # To build the ioengine modules, specify fio directory where header files are
 #FIODIR:=/opt/fio

diff --git a/src/unvme_core.c b/src/unvme_core.c
index b11c318..913bcd1 100644
--- a/src/unvme_core.c
+++ b/src/unvme_core.c
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include
 
 #include "rdtsc.h"
 #include "unvme_core.h"
@@ -199,9 +200,6 @@ static u16 unvme_get_cid(unvme_desc_t* desc)
 static u64 unvme_map_dma(const unvme_ns_t* ns, void* buf, u64 bufsz)
 {
     unvme_device_t* dev = ((unvme_session_t*)ns->ses)->dev;
-#ifdef UNVME_IDENTITY_MAP_DMA
-    u64 addr = (u64)buf & dev->vfiodev.iovamask;
-#else
     vfio_dma_t* dma = NULL;
     unvme_lockr(&dev->iomem.lock);
     int i;
@@ -215,7 +213,6 @@ static u64 unvme_map_dma(const unvme_ns_t* ns, void* buf, u64 bufsz)
     u64 addr = dma->addr + (u64)(buf - dma->buf);
     if ((addr + bufsz) > (dma->addr + dma->size)) FATAL("buffer overrun");
-#endif
     //if ((addr & (ns->blocksize - 1)) != 0)
     //    FATAL("unaligned buffer address");
     return addr;
@@ -494,7 +491,9 @@ unvme_ns_t* unvme_do_open(int pci, int nsid, int qcount, int qsize)
     vfio_dma_t* dma = vfio_dma_alloc(&dev->vfiodev, 4096);
     if (nvme_acmd_identify(&dev->nvmedev, 0, dma->addr, 0))
         FATAL("nvme_acmd_identify controller failed");
-    nvme_identify_ctlr_t* idc = (nvme_identify_ctlr_t*)dma->buf;
+    nvme_identify_ctlr_t* idc = malloc(sizeof(nvme_identify_ctlr_t));
+    memcpy(idc, dma->buf, sizeof(nvme_identify_ctlr_t));
+
     if (nsid > idc->nn) {
         ERROR("invalid %06x nsid %d (max %d)", pci, nsid, idc->nn);
         return NULL;
@@ -523,6 +522,7 @@ unvme_ns_t* unvme_do_open(int pci, int nsid, int qcount, int qsize)
         int mp = 2 << (idc->mdts - 1);
         if (ns->maxppio > mp) ns->maxppio = mp;
     }
+    free(idc);
     vfio_dma_free(dma);
 
     // get max number of queues supported
@@ -539,6 +539,7 @@ unvme_ns_t* unvme_do_open(int pci, int nsid, int qcount, int qsize)
     ns->qsize = qsize;
 
     // setup IO queues
+    DEBUG_FN("Creating %d IO queues (of max %d), queue size %d", qcount, maxqcount, qsize);
     dev->ioqs = zalloc(qcount * sizeof(unvme_queue_t));
     for (i = 0; i < qcount; i++) unvme_ioq_create(dev, i);
 }
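With the identity-map shortcut gone, unvme_map_dma() above always translates a user buffer pointer through the vfio_dma_t region it was allocated from. The following is an illustrative sketch of that translation, outside the patch itself; the dma_region_t type and the example base address are stand-ins rather than driver code:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in for vfio_dma_t: a CPU-visible buffer plus the device-visible
     * (here: physical) address of the region it was carved out of. */
    typedef struct {
        void*    buf;   /* CPU virtual address of the region */
        uint64_t addr;  /* device/physical address of the region */
        size_t   size;  /* region size in bytes */
    } dma_region_t;

    /* Mirror of the unvme_map_dma() arithmetic: device address = region
     * address + offset of the pointer within the region, with an overrun check. */
    static uint64_t dma_addr_of(const dma_region_t* r, const void* p, size_t len)
    {
        uint64_t off = (uint64_t)((const char*)p - (const char*)r->buf);
        if (off + len > r->size) return 0;  /* buffer overrun */
        return r->addr + off;
    }

    int main(void)
    {
        static char pool[4096];             /* pretend UIO-backed region */
        dma_region_t r = { pool, 0x40000000ULL, sizeof(pool) };
        printf("%#llx\n", (unsigned long long)dma_addr_of(&r, pool + 512, 64));
        return 0;                           /* prints 0x40000200 */
    }

Because there is no IOMMU, the address handed to the controller is simply the physical address inside the reserved region, so only buffers obtained from the driver's own allocations can be mapped.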
diff --git a/src/unvme_vfio.c b/src/unvme_vfio.c
index efc00e4..7f317e7 100644
--- a/src/unvme_vfio.c
+++ b/src/unvme_vfio.c
@@ -52,7 +52,9 @@
 #define FATAL(fmt, arg...) do { ERROR(fmt, ##arg); abort(); } while (0)
 
 /// Starting device DMA address
-#define VFIO_IOVA 0x800000000
+#define UIO_BASE 0x40000000
+/// Size of UIO buffer/device
+#define UIO_SIZE 0x40000000
 
 /// IRQ index names
 const char* vfio_irq_names[] = { "INTX", "MSI", "MSIX", "ERR", "REQ" };
@@ -102,31 +104,18 @@ static vfio_mem_t* vfio_mem_alloc(vfio_device_t* dev, size_t size, void* pmb)
     if (pmb) {
         mem->dma.buf = pmb;
     } else {
-        mem->dma.buf = mmap(0, size, PROT_READ|PROT_WRITE,
-                            MAP_PRIVATE|MAP_ANONYMOUS|MAP_LOCKED, -1, 0);
-        if (mem->dma.buf == MAP_FAILED)
-            FATAL("mmap: %s", strerror(errno));
-        mem->mmap = 1;
+        if (dev->uiobufoff + size > UIO_SIZE) {
+            ERROR("Out of UIO memory space (next allocation would use %#lx of %#lx)", dev->uiobufoff + size, UIO_SIZE);
+            free(mem);
+            return NULL;
+        }
+        mem->dma.buf = dev->uiobuf + dev->uiobufoff;
+        dev->uiobufoff += size;
     }
     pthread_mutex_lock(&dev->lock);
 
-    struct vfio_iommu_type1_dma_map map = {
-        .argsz = sizeof(map),
-        .flags = (VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE),
-        .size = (__u64)size,
-        .vaddr = (__u64)mem->dma.buf,
-#ifdef UNVME_IDENTITY_MAP_DMA
-        .iova = (__u64)mem->dma.buf & dev->iovamask,
-#else
-        .iova = dev->iovanext,
-#endif
-    };
-
-    if (ioctl(dev->contfd, VFIO_IOMMU_MAP_DMA, &map) < 0) {
-        FATAL("VFIO_IOMMU_MAP_DMA: %s", strerror(errno));
-    }
     mem->dma.size = size;
-    mem->dma.addr = map.iova;
+    mem->dma.addr = dev->iovanext;
     mem->dma.mem = mem;
     mem->dev = dev;
 
@@ -141,8 +130,8 @@ static vfio_mem_t* vfio_mem_alloc(vfio_device_t* dev, size_t size, void* pmb)
         dev->memlist->prev->next = mem;
         dev->memlist->prev = mem;
     }
-    dev->iovanext = map.iova + size;
-    DEBUG_FN("%x %#lx %#lx %#lx", dev->pci, map.iova, map.size, dev->iovanext);
+    dev->iovanext += size;
+    DEBUG_FN("%x %#lx %#lx %#lx %#lx", dev->pci, mem->dma.addr, size, dev->iovanext, dev->uiobufoff);
     pthread_mutex_unlock(&dev->lock);
 
     return mem;
@@ -157,35 +146,26 @@ int vfio_mem_free(vfio_mem_t* mem)
 {
     vfio_device_t* dev = mem->dev;
 
-    struct vfio_iommu_type1_dma_unmap unmap = {
-        .argsz = sizeof(unmap),
-        .size = (__u64)mem->dma.size,
-        .iova = mem->dma.addr,
-    };
-
-    // unmap and free dma memory
-    if (mem->dma.buf) {
-        if (ioctl(dev->contfd, VFIO_IOMMU_UNMAP_DMA, &unmap) < 0)
-            FATAL("VFIO_IOMMU_UNMAP_DMA: %s", strerror(errno));
-    }
-    if (mem->mmap) {
-        if (munmap(mem->dma.buf, mem->dma.size) < 0)
-            FATAL("munmap: %s", strerror(errno));
-    }
-
     // remove node from memory list
     pthread_mutex_lock(&dev->lock);
-    if (mem->next == dev->memlist) dev->iovanext -= mem->dma.size;
-    if (mem->next == mem) {
+    if (mem->next == dev->memlist) { // If removing last item in list
+        dev->iovanext -= mem->dma.size;
+        dev->uiobufoff -= mem->dma.size;
+    }
+    if (mem->next == mem) { // If removing only item in list
         dev->memlist = NULL;
         dev->iovanext = dev->iovabase;
-    } else {
+        dev->uiobufoff = 0;
+    } else { // If there are other items in list
         mem->next->prev = mem->prev;
         mem->prev->next = mem->next;
-        if (dev->memlist == mem) dev->memlist = mem->next;
-        dev->iovanext = dev->memlist->prev->dma.addr + dev->memlist->prev->dma.size;
+        if (dev->memlist == mem) { // If first item in list
+            dev->memlist = mem->next;
+        }
+        dev->iovanext = dev->memlist->prev->dma.addr + dev->memlist->prev->dma.size; // IOVA next is after last item in list
+        dev->uiobufoff = dev->iovanext - dev->iovabase; // UIO buffer offset is same as IOVA offset
     }
-    DEBUG_FN("%x %#lx %#lx %#lx", dev->pci, unmap.iova, unmap.size, dev->iovanext);
+    DEBUG_FN("%x %#lx %#lx %#lx %#lx", dev->pci, mem->dma.addr, mem->dma.size, dev->iovanext, dev->uiobufoff);
     pthread_mutex_unlock(&dev->lock);
 
     free(mem);
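The two hunks above replace per-allocation VFIO DMA mapping with a simple bump allocation over the one statically mapped UIO region: uiobufoff and iovanext advance in vfio_mem_alloc() and are only rewound when the most recently allocated region is freed, or reset when the list empties. A condensed sketch of that bookkeeping, with invented names and the base/size constants of this patch assumed:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    #define REGION_BASE 0x40000000ULL   /* assumed UIO_BASE */
    #define REGION_SIZE 0x40000000ULL   /* assumed UIO_SIZE (1 GiB) */

    /* Illustrative bump allocator over one fixed, pre-mapped region. */
    typedef struct {
        char*    buf;   /* CPU pointer to the mapped region (cf. dev->uiobuf)  */
        uint64_t off;   /* next free byte offset            (cf. dev->uiobufoff) */
    } bump_t;

    /* Returns the device address of the new block, or 0 when the region is full. */
    static uint64_t bump_alloc(bump_t* b, size_t size, void** out)
    {
        if (b->off + size > REGION_SIZE) return 0;   /* out of UIO memory space */
        *out = b->buf + b->off;
        uint64_t addr = REGION_BASE + b->off;        /* cf. dev->iovanext */
        b->off += size;
        return addr;
    }

    /* Space is only reclaimed when the most recent block is freed, matching
     * the iovanext/uiobufoff rollback in vfio_mem_free(). */
    static void bump_free_last(bump_t* b, size_t size)
    {
        if (size <= b->off) b->off -= size;
    }

    int main(void)
    {
        static char backing[1 << 20];                /* small stand-in mapping */
        bump_t b = { backing, 0 };
        void* p;
        uint64_t a1 = bump_alloc(&b, 4096, &p);      /* 0x40000000 */
        uint64_t a2 = bump_alloc(&b, 8192, &p);      /* 0x40001000 */
        bump_free_last(&b, 8192);                    /* offset back to 0x1000 */
        printf("%#llx %#llx %#llx\n", (unsigned long long)a1,
               (unsigned long long)a2, (unsigned long long)b.off);
        return 0;
    }

The trade-off mirrors the commit message: allocations that are not released in reverse order leave gaps in the region that are only reclaimed once the whole memory list empties.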
@@ -312,10 +292,9 @@ vfio_device_t* vfio_create(vfio_device_t* dev, int pci)
     if ((i = readlink(path, path, sizeof(path))) < 0)
         FATAL("No iommu_group associated with device %s", pciname);
     path[i] = 0;
-    sprintf(path, "/dev/vfio%s", strrchr(path, '/'));
+    sprintf(path, "/dev/vfio/noiommu-%s", &strrchr(path, '/')[1]);
 
     struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
-    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
 
     // allocate and initialize device context
@@ -323,7 +302,7 @@
     else dev->ext = 1;
     dev->pci = pci;
     dev->pagesize = sysconf(_SC_PAGESIZE);
-    dev->iovabase = VFIO_IOVA;
+    dev->iovabase = UIO_BASE;
     dev->iovanext = dev->iovabase;
 
     if (pthread_mutex_init(&dev->lock, 0)) return NULL;
@@ -334,7 +313,7 @@
 
     if (ioctl(dev->contfd, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
         FATAL("ioctl VFIO_GET_API_VERSION");
-    if (ioctl(dev->contfd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) == 0)
+    if (ioctl(dev->contfd, VFIO_CHECK_EXTENSION, VFIO_NOIOMMU_IOMMU) == 0)
         FATAL("ioctl VFIO_CHECK_EXTENSION");
 
     if ((dev->groupfd = open(path, O_RDWR)) < 0)
@@ -349,12 +328,9 @@ vfio_device_t* vfio_create(vfio_device_t* dev, int pci)
     if (ioctl(dev->groupfd, VFIO_GROUP_SET_CONTAINER, &dev->contfd) < 0)
         FATAL("ioctl VFIO_GROUP_SET_CONTAINER");
 
-    if (ioctl(dev->contfd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0)
+    if (ioctl(dev->contfd, VFIO_SET_IOMMU, VFIO_NOIOMMU_IOMMU) < 0)
         FATAL("ioctl VFIO_SET_IOMMU");
 
-    if (ioctl(dev->contfd, VFIO_IOMMU_GET_INFO, &iommu_info) < 0)
-        FATAL("ioctl VFIO_IOMMU_GET_INFO");
-
     dev->fd = ioctl(dev->groupfd, VFIO_GROUP_GET_DEVICE_FD, pciname);
     if (dev->fd < 0) FATAL("ioctl VFIO_GROUP_GET_DEVICE_FD");
 
@@ -415,37 +391,16 @@ vfio_device_t* vfio_create(vfio_device_t* dev, int pci)
         FATAL("VFIO_DEVICE_GET_IRQ_INFO MSIX count %d != %d", irq.count, dev->msixsize);
     }
 
-#ifdef UNVME_IDENTITY_MAP_DMA
-    // Set up mask to support identity IOVA map option
-    struct vfio_iommu_type1_dma_map map = {
-        .argsz = sizeof(map),
-        .flags = (VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE),
-        .iova = dev->iovabase,
-        .size = dev->pagesize,
-    };
-    struct vfio_iommu_type1_dma_unmap unmap = {
-        .argsz = sizeof(unmap),
-        .size = dev->pagesize,
-    };
+    // Open and map UIO device as memory buffer
+    dev->uiofd = open("/dev/uio0", O_RDWR | O_SYNC);
+    if (dev->uiofd == -1)
+        FATAL("unable to open /dev/uio0, %d", errno);
+    dev->uiobuf = mmap(NULL, UIO_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, dev->uiofd, 0);
+    if (dev->uiobuf == MAP_FAILED)
+        FATAL("unable to mmap /dev/uio0, %d", errno);
 
-    map.vaddr = (__u64)mmap(0, map.size, PROT_READ|PROT_WRITE,
-                            MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-    if ((void*)map.vaddr == MAP_FAILED)
-        FATAL("mmap: %s", strerror(errno));
-    while (map.iova) {
-        if (ioctl(dev->contfd, VFIO_IOMMU_MAP_DMA, &map) < 0) {
-            if (errno == EFAULT) break;
-            FATAL("VFIO_IOMMU_MAP_DMA: %s", strerror(errno));
-        }
-        unmap.iova = map.iova;
-        if (ioctl(dev->contfd, VFIO_IOMMU_UNMAP_DMA, &unmap) < 0)
-            FATAL("VFIO_IOMMU_MUNAP_DMA: %s", strerror(errno));
-        map.iova <<= 1;
-    }
-    dev->iovamask = map.iova - 1;
-    (void) munmap((void*)map.vaddr, map.size);
-    DEBUG_FN("iovamask=%#llx", dev->iovamask);
-#endif
+    if (mlock(dev->uiobuf, UIO_SIZE) == -1)
+        FATAL("unable to mlock, %d", errno);
 
     return (vfio_device_t*)dev;
 }
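vfio_create() above simply assumes that map 0 of /dev/uio0 is exactly the UIO_BASE/UIO_SIZE region. The standard UIO sysfs attributes expose each map's physical address and size, so a check along the following lines could catch a mismatched device tree or UIO configuration early. This is a sketch only; the /sys/class/uio/uio0/maps/map0/{addr,size} paths and their hex text format are assumptions about the UIO ABI, not something this patch touches:

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    /* Read one numeric attribute of UIO map 0 (files are expected to contain
     * hex text such as "0x40000000"). Returns 0 on success. */
    static int read_map0_attr(const char* attr, uint64_t* val)
    {
        char path[96];
        long long v;
        snprintf(path, sizeof(path), "/sys/class/uio/uio0/maps/map0/%s", attr);
        FILE* f = fopen(path, "r");
        if (!f) return -1;
        int ok = (fscanf(f, "%lli", &v) == 1);   /* %i accepts the 0x prefix */
        fclose(f);
        if (!ok) return -1;
        *val = (uint64_t)v;
        return 0;
    }

    int main(void)
    {
        uint64_t addr, size;
        if (read_map0_attr("addr", &addr) || read_map0_attr("size", &size))
            return 1;
        /* Expected values for this patch: UIO_BASE and UIO_SIZE. */
        printf("map0 addr=%#" PRIx64 " size=%#" PRIx64 " -> %s\n", addr, size,
               (addr == 0x40000000ULL && size == 0x40000000ULL) ? "OK" : "mismatch");
        return 0;
    }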
@@ -459,6 +414,11 @@ void vfio_delete(vfio_device_t* dev)
     if (!dev) return;
     DEBUG_FN("%x", dev->pci);
 
+    // Close and unmap UIO buffer
+    munlock(dev->uiobuf, UIO_SIZE);
+    munmap(dev->uiobuf, UIO_SIZE);
+    close(dev->uiofd);
+
     // free all memory associated with the device
     while (dev->memlist) vfio_mem_free(dev->memlist);
 
diff --git a/src/unvme_vfio.h b/src/unvme_vfio.h
index fd755e1..5e66010 100644
--- a/src/unvme_vfio.h
+++ b/src/unvme_vfio.h
@@ -74,6 +74,9 @@ typedef struct _vfio_device {
     __u64 iovamask;             ///< max IO virtual address mask
     pthread_mutex_t lock;       ///< multithreaded lock
     vfio_mem_t* memlist;        ///< memory allocated list
+    int uiofd;                  ///< file descriptor of UIO device
+    void* uiobuf;               ///< UIO buffer pointer
+    off_t uiobufoff;            ///< UIO buffer offset
 } vfio_device_t;
 
 // Export functions
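Since the container is now set up with VFIO_NOIOMMU_IOMMU and the group is opened through /dev/vfio/noiommu-<group> as handled in vfio_create() above, the host kernel must have VFIO's unsafe no-IOMMU mode turned on (for example via 'modprobe vfio enable_unsafe_noiommu_mode=1'). A small pre-flight check along these lines can make that failure mode clearer; the module parameter path is an assumption about the kernel, not something this patch adds:

    #include <stdio.h>

    /* Path where the vfio module exposes its no-IOMMU switch (assumed;
     * present when the vfio module is loaded on kernels that support it). */
    #define NOIOMMU_PARAM "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"

    int main(void)
    {
        FILE* f = fopen(NOIOMMU_PARAM, "r");
        if (!f) {
            fprintf(stderr, "vfio module not loaded, or no-IOMMU mode unsupported\n");
            return 1;
        }
        int c = fgetc(f);
        fclose(f);
        if (c != 'Y') {
            fprintf(stderr, "VFIO no-IOMMU mode is disabled\n");
            return 1;
        }
        printf("VFIO no-IOMMU mode enabled; /dev/vfio/noiommu-* groups expected\n");
        return 0;
    }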