// SPDX-License-Identifier: GPL-2.0
/*
 * Paravirtualized DMA operations that offer DMA inspection between
 * the guest and the host.
 */
#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/dma-direct.h>
#include <linux/bitmap.h>
#include <linux/scatterlist.h>
#include <linux/pci.h>
#include <linux/dma-map-ops.h>
#include <linux/coiommu_dev.h>
#include <linux/coiommu.h>
#include <linux/iommu.h>
#include "coiommu.h"
#include "direct.h"

static struct coiommu *global_coiommu;

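/*
 * Index of the DTT entry covering @pfn in the page-table page at
 * @level.  The last level is indexed with the low COIOMMU_PT_LEVEL_STRIDE
 * bits of the pfn; each upper level consumes COIOMMU_UPPER_LEVEL_STRIDE
 * bits above that.
 */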
static inline unsigned int dtt_level_to_offset(unsigned long pfn,
					       unsigned int level)
{
	unsigned int offset;

	if (level == DTT_LAST_LEVEL)
		return pfn & COIOMMU_PT_LEVEL_MASK;

	offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;

	return (pfn >> offset) & COIOMMU_UPPER_LEVEL_MASK;
}

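/*
 * Allocate one zeroed DTT page-table page.  Pages are normally taken
 * from the pre-filled page cache; taking the last cached page kicks the
 * allocation worker so the next cache gets refilled.  If the cache is
 * already empty the page is allocated directly with GFP_ATOMIC, as this
 * may run in atomic mapping context.
 */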
static void *dtt_alloc_page(struct coiommu_dtt *dtt)
{
	struct dtt_page_cache *c;
	unsigned long flags;
	void *obj = NULL;

	spin_lock_irqsave(&dtt->alloc_lock, flags);
	c = &dtt->cache[dtt->cur_cache];
	if (!c->nobjs) {
		/*
		 * The cache is empty, get the page directly.
		 */
		obj = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_ACCOUNT);
		if (!obj)
			pr_err("%s: coiommu failed to alloc dtt page\n",
			       __func__);
	} else {
		obj = c->objects[--c->nobjs];
		if (!c->nobjs)
			/*
			 * Prepare the next cache by waking up the alloc work.
			 */
			kthread_queue_work(dtt->worker, &dtt->alloc_work);
	}

	spin_unlock_irqrestore(&dtt->alloc_lock, flags);
	return obj;
}

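/*
 * Walk the DTT from the root down to the leaf entry for @pfn.  When
 * @alloc is true, missing intermediate page-table pages are allocated
 * on the way down (races with other walkers are resolved with cmpxchg).
 * Returns the leaf entry, or NULL if the walk could not be completed.
 */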
static struct dtt_leaf_entry *pfn_to_dtt_pte(struct coiommu_dtt *dtt,
					     unsigned long pfn, bool alloc)
{
	struct dtt_parent_entry *parent_pte;
	unsigned int index;
	struct dtt_leaf_entry *leaf_pte;
	unsigned int level = dtt->level;
	void *pt = (void *)dtt->root;
	u64 pteval;

	while (level != DTT_LAST_LEVEL) {
		index = dtt_level_to_offset(pfn, level);
		parent_pte = (struct dtt_parent_entry *)pt + index;

		if (!parent_pte_present(parent_pte)) {
			if (!alloc)
				break;
			pt = dtt_alloc_page(dtt);
			if (!pt)
				break;
			pteval = parent_pte_value(pt);
			if (cmpxchg64(&parent_pte->val, 0ULL, pteval))
				/* Someone else set it, free this one */
				free_page((unsigned long)pt);
			else
				atomic_inc(&dtt->pages);
		}

		pt = phys_to_virt(parent_pte_addr(parent_pte));
		level--;
	}

	if (level > DTT_LAST_LEVEL) {
		pr_err("coiommu: DTT %s failed at level %d for pfn 0x%lx\n",
		       alloc ? "alloc" : "absent", level, pfn);
		return NULL;
	}

	index = dtt_level_to_offset(pfn, DTT_LAST_LEVEL);
	leaf_pte = (struct dtt_leaf_entry *)pt + index;

	return leaf_pte;
}

static bool is_page_pinned(struct coiommu_dtt *dtt, unsigned long pfn)
{
	struct dtt_leaf_entry *leaf_pte = pfn_to_dtt_pte(dtt, pfn, false);

	if (leaf_pte == NULL)
		return false;

	return coiommu_test_flag((1 << DTTE_PINNED_FLAG), &leaf_pte->dtte);
}

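/*
 * Drop one mapping reference from a leaf DTTE.  @clear_accessed is only
 * set on error-unwind paths; see the comment below for why racing with
 * a concurrent pin is harmless there.
 */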
static void unmark_pfn(struct dtt_leaf_entry *leaf_pte, bool clear_accessed)
{
	if (!(atomic_read(&leaf_pte->dtte) & DTTE_MAP_CNT_MASK)) {
		pr_err("%s: coiommu: map count already zero, leaf_pte 0x%llx\n",
		       __func__, (u64)leaf_pte);
		return;
	}

	if (!(atomic_dec_return(&leaf_pte->dtte) & DTTE_MAP_CNT_MASK)) {
		if (unlikely(clear_accessed))
			/*
			 * clear_accessed is only true in the error handling
			 * code path, e.g. when pinning a page failed and some
			 * operations need to be reversed, so this rarely
			 * happens.
			 * If this page is pinned successfully by another thread
			 * right before decreasing the map count here, then the
			 * access flag won't be cleared, which is expected.
			 * If this page is pinned successfully by another thread
			 * right after decreasing the map count here, then the
			 * access flag will still be cleared. This won't cause
			 * any issue but just messes up access tracking a
			 * little bit.
			 */
			coiommu_clear_flag((1 << DTTE_ACCESSED_FLAG),
					   &leaf_pte->dtte);
	}
}

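/*
 * Drop one mapping reference from each of the @nr_pages leaf entries
 * starting at @pfn.  Consecutive entries in the same last-level page
 * are walked directly instead of re-walking the DTT for every pfn.
 */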
static void unmark_pfns(struct coiommu_dtt *dtt, unsigned long pfn,
			unsigned long nr_pages, bool clear_accessed)
{
	struct dtt_leaf_entry *leaf_pte = NULL;
	unsigned long count = 0;
	unsigned int index = 0;

	for (; count < nr_pages; count++) {
		if (!leaf_pte || index > COIOMMU_PT_LEVEL_MASK) {
			leaf_pte = pfn_to_dtt_pte(dtt, pfn + count, false);
			if (leaf_pte == NULL) {
				pr_err("%s: coiommu: pfn 0x%lx pte is NULL\n",
				       __func__, pfn + count);
				/*
				 * For the entries in the same page table
				 * page, they should all be NULL, so we
				 * can just skip all of them.
				 */
				index = dtt_level_to_offset(pfn + count,
							    DTT_LAST_LEVEL);
				count += COIOMMU_PT_LEVEL_MASK - index;
				continue;
			}
			index = dtt_level_to_offset(pfn + count,
						    DTT_LAST_LEVEL);
		} else {
			leaf_pte += 1;
		}

		unmark_pfn(leaf_pte, clear_accessed);
		index++;
	}
}

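/*
 * Take one mapping reference on a leaf DTTE and set its accessed flag.
 * Fails if the map count would exceed dtt->max_map_count.  If @pinned
 * is non-NULL it reports whether the page is already pinned, so callers
 * know whether a pin request still has to be sent.
 */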
static int mark_pfn(struct coiommu_dtt *dtt,
		    struct dtt_leaf_entry *leaf_pte,
		    bool *pinned)
{
	unsigned long flags;
	unsigned int dtte;

	local_irq_save(flags);
	dtte = atomic_inc_return(&leaf_pte->dtte);
	if ((dtte & DTTE_MAP_CNT_MASK) > dtt->max_map_count) {
		pr_err("%s: coiommu: %d maps already done, leaf_pte 0x%llx\n",
		       __func__, (dtte & DTTE_MAP_CNT_MASK), (u64)leaf_pte);
		atomic_dec(&leaf_pte->dtte);
		local_irq_restore(flags);
		return -EINVAL;
	}
	local_irq_restore(flags);

	coiommu_set_flag((1 << DTTE_ACCESSED_FLAG), &leaf_pte->dtte);

	if (pinned)
		*pinned = !!coiommu_test_flag((1 << DTTE_PINNED_FLAG),
					      &leaf_pte->dtte);
	return 0;
}

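/*
 * Take one mapping reference on each of the @nr_pages leaf entries
 * starting at @pfn, allocating DTT pages as needed.  Pfns that are not
 * yet pinned are collected into @pin_info for a later batched pin
 * request.  On failure all references taken so far are dropped again.
 */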
static int mark_pfns(struct coiommu_dtt *dtt, unsigned long pfn,
		     unsigned long nr_pages, struct pin_pages_info *pin_info)
{
	struct dtt_leaf_entry *leaf_pte = NULL;
	unsigned long count = 0;
	unsigned int index = 0;
	bool pinned;
	int ret = 0;

	for (count = 0; count < nr_pages; count++) {
		if (!leaf_pte || index > COIOMMU_PT_LEVEL_MASK) {
			leaf_pte = pfn_to_dtt_pte(dtt, pfn + count, true);
			if (leaf_pte == NULL) {
				pr_err("%s: coiommu: pfn 0x%lx pte is NULL\n",
				       __func__, pfn + count);
				ret = -EINVAL;
				goto out;
			}
			index = dtt_level_to_offset(pfn + count, DTT_LAST_LEVEL);
		} else {
			leaf_pte += 1;
		}

		ret = mark_pfn(dtt, leaf_pte, &pinned);
		if (ret)
			goto out;

		if (!pinned) {
			pin_info->pfn[pin_info->nr_pages] = pfn + count;
			pin_info->nr_pages++;
		}

		index++;
	}

	return 0;
out:
	unmark_pfns(dtt, pfn, count, true);
	return ret;
}

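/* Number of pages covered by the buffer [phys_addr, phys_addr + size). */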
static inline unsigned long get_aligned_nrpages(phys_addr_t phys_addr,
						size_t size)
{
	return PAGE_ALIGN((phys_addr & (PAGE_SIZE - 1)) + size) >> PAGE_SHIFT;
}

static inline unsigned short get_pci_device_id(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	return PCI_DEVID(pdev->bus->number, pdev->devfn);
}

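/*
 * Unmap-side counterpart of pin_and_mark_dma_addr(): drop the mapping
 * reference for every page backing the DMA buffer.  The pages are not
 * unpinned here, only their map counts in the DTT are decremented.
 */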
static void unmark_dma_addr(struct device *dev, size_t size,
			    dma_addr_t dma_addr)
{
	struct coiommu_dtt *dtt = &global_coiommu->dtt;
	phys_addr_t phys_addr = dma_to_phys(dev, dma_addr);
	unsigned long pfn = phys_addr >> PAGE_SHIFT;
	unsigned long nr_pages = get_aligned_nrpages(phys_addr, size);

	if (unlikely(!dtt))
		return;

	read_lock(&dtt->lock);
	if (likely(dtt->root))
		unmark_pfns(dtt, pfn, nr_pages, false);
	read_unlock(&dtt->lock);
}

static void unmark_sg_pfns(struct coiommu_dtt *dtt,
			   struct scatterlist *sgl,
			   int nents, bool clear_accessed)
{
	struct scatterlist *sg;
	phys_addr_t phys_addr;
	unsigned long pfn;
	unsigned long nr_pages;
	int i;

	for_each_sg(sgl, sg, nents, i) {
		phys_addr = sg->dma_address;
		pfn = phys_addr >> PAGE_SHIFT;
		nr_pages = get_aligned_nrpages(phys_addr, sg->length);
		read_lock(&dtt->lock);
		if (unlikely(!dtt->root)) {
			read_unlock(&dtt->lock);
			return;
		}
		unmark_pfns(dtt, pfn, nr_pages, clear_accessed);
		read_unlock(&dtt->lock);
	}
}

static void unmark_sg(struct scatterlist *sgl,
		      int nents, bool clear_accessed)
{
	struct coiommu_dtt *dtt = &global_coiommu->dtt;

	if (likely(dtt))
		unmark_sg_pfns(dtt, sgl, nents, clear_accessed);
}

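/*
 * Ask the coIOMMU device to pin a single pfn (pin_page) or a batch of
 * pfns (pin_page_list) for the given endpoint, then verify through the
 * DTT that the pin actually took effect.
 */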
static int pin_page(struct coiommu_dtt *dtt, unsigned long pfn,
		    unsigned short bdf)
{
	struct coiommu *coiommu = dtt_to_coiommu(dtt);
	int ret;

	ret = coiommu->dev_ops->execute_request(coiommu->dev, pfn, bdf);
	if (ret)
		return ret;

	if (!is_page_pinned(dtt, pfn)) {
		pr_err("%s: coiommu pin pfn 0x%lx failed\n", __func__, pfn);
		return -EFAULT;
	}

	return 0;
}

static int pin_page_list(struct coiommu_dtt *dtt, struct pin_pages_info *pin_info)
{
	struct coiommu *coiommu = dtt_to_coiommu(dtt);
	int ret, count;

	ret = coiommu->dev_ops->execute_requests(coiommu->dev, pin_info);
	if (ret)
		return ret;

	for (count = 0; count < pin_info->nr_pages; count++) {
		if (!is_page_pinned(dtt, pin_info->pfn[count])) {
			pr_err("%s: coiommu pin pfn 0x%llx failed\n",
			       __func__, pin_info->pfn[count]);
			return -EFAULT;
		}
	}

	return 0;
}

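/*
 * Fast path for single-page mappings: take a mapping reference on the
 * pfn and, if it is not pinned yet, send a single pin request for it.
 * The reference is dropped again if the pin request fails.
 */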
static int pin_and_mark_pfn(struct device *dev, unsigned long pfn)
{
	struct dtt_leaf_entry *leaf_pte;
	unsigned short bdf = get_pci_device_id(dev);
	struct coiommu_dtt *dtt = &global_coiommu->dtt;
	int ret = 0;
	bool pinned;

	if (!dtt)
		return -ENODEV;

	read_lock(&dtt->lock);

	if (unlikely(!dtt->root)) {
		ret = -ENODEV;
		goto out;
	}

	leaf_pte = pfn_to_dtt_pte(dtt, pfn, true);
	if (leaf_pte == NULL) {
		pr_err("%s: coiommu: pfn 0x%lx pte is NULL\n", __func__, pfn);
		ret = -EINVAL;
		goto out;
	}

	ret = mark_pfn(dtt, leaf_pte, &pinned);
	if (ret)
		goto out;

	if (!pinned) {
		ret = pin_page(dtt, pfn, bdf);
		if (unlikely(ret))
			unmark_pfn(leaf_pte, true);
	}

out:
	read_unlock(&dtt->lock);
	return ret;
}

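/*
 * Mark a contiguous pfn range for DMA and pin whatever subset of it is
 * not pinned yet with one batched request.  Falls back to the
 * single-page path for one-page mappings to avoid allocating the
 * pin_pages_info buffer.
 */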
static int pin_and_mark_pfns(struct device *dev, unsigned long start_pfn,
			     unsigned long nr_pages)
{
	unsigned short bdf = get_pci_device_id(dev);
	struct coiommu_dtt *dtt = &global_coiommu->dtt;
	struct pin_pages_info *pin_info;
	int ret;

	if (nr_pages == 1)
		return pin_and_mark_pfn(dev, start_pfn);

	if (!dtt)
		return -ENODEV;

	pin_info = kzalloc(sizeof(struct pin_pages_info) +
			   nr_pages * sizeof(unsigned long),
			   GFP_ATOMIC);
	if (!pin_info)
		return -ENOMEM;

	read_lock(&dtt->lock);

	if (unlikely(!dtt->root)) {
		ret = -ENODEV;
		goto out;
	}

	ret = mark_pfns(dtt, start_pfn, nr_pages, pin_info);
	if (ret)
		goto out;

	if (pin_info->nr_pages > 0) {
		pin_info->bdf = bdf;
		ret = pin_page_list(dtt, pin_info);
		if (unlikely(ret))
			/*
			 * Note - In case of pin failures, all pfns required
			 * for this dma mapping shall fail, which means none
			 * of them will participate in the dma operations.
			 * Hence their map count shall be decremented.
			 */
			unmark_pfns(dtt, start_pfn, nr_pages, true);
	}

out:
	read_unlock(&dtt->lock);
	kfree(pin_info);
	return ret;
}

static int pin_and_mark_dma_addr(struct device *dev, size_t size,
				 dma_addr_t dma_addr)
{
	phys_addr_t phys_addr = dma_to_phys(dev, dma_addr);
	unsigned long nr_pages = get_aligned_nrpages(phys_addr, size);
	unsigned long pfn = phys_addr >> PAGE_SHIFT;
	int ret;

	ret = pin_and_mark_pfns(dev, pfn, nr_pages);
	if (unlikely(ret))
		dev_err(dev, "%s: coiommu failed to pin DMA buffer: %d\n",
			__func__, ret);

	return ret;
}

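/*
 * Scatterlist variant: mark every page of every already-mapped sg
 * entry, gather the not-yet-pinned pfns of the whole list into a single
 * pin_pages_info and pin them with one request.  On failure all map
 * counts taken for the list are rolled back.
 */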
static int pin_and_mark_sg_list(struct device *dev,
				struct scatterlist *sgl,
				int nents)
{
	unsigned short bdf = get_pci_device_id(dev);
	struct coiommu_dtt *dtt = &global_coiommu->dtt;
	struct scatterlist *sg;
	unsigned long nr_pages = 0;
	phys_addr_t phys_addr;
	unsigned long pfn;
	struct pin_pages_info *pin_info = NULL;
	int i, ret = 0;

	if (!dtt)
		return -ENODEV;

	for_each_sg(sgl, sg, nents, i) {
		phys_addr = sg->dma_address;
		nr_pages += get_aligned_nrpages(phys_addr, sg->length);
	}

	pin_info = kzalloc(sizeof(struct pin_pages_info) +
			   nr_pages * sizeof(unsigned long), GFP_ATOMIC);
	if (!pin_info)
		return -ENOMEM;

	read_lock(&dtt->lock);

	if (unlikely(!dtt->root)) {
		ret = -ENODEV;
		goto out;
	}

	for_each_sg(sgl, sg, nents, i) {
		phys_addr = sg->dma_address;
		pfn = phys_addr >> PAGE_SHIFT;
		nr_pages = get_aligned_nrpages(phys_addr, sg->length);

		ret = mark_pfns(dtt, pfn, nr_pages, pin_info);
		if (ret) {
			unmark_sg_pfns(dtt, sgl, i, true);
			goto out;
		}
	}

	if (pin_info->nr_pages > 0) {
		pin_info->bdf = bdf;
		ret = pin_page_list(dtt, pin_info);
		if (unlikely(ret))
			/*
			 * Note - In case of pin failures, all pfns required
			 * for this dma mapping shall fail, which means none
			 * of them will participate in the dma operations.
			 * Hence their map count shall be decremented.
			 */
			unmark_sg_pfns(dtt, sgl, nents, true);
	}

out:
	read_unlock(&dtt->lock);
	kfree(pin_info);
	return ret;
}

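/*
 * coIOMMU dma_map_ops: every callback below wraps the corresponding
 * dma-direct operation and then pins/marks (on map) or unmarks (on
 * unmap) the pages backing the DMA buffer in the DTT.  If pinning
 * fails, the dma-direct mapping is torn down again and the operation is
 * reported as failed to the caller.
 */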
static void *coiommu_alloc(struct device *dev, size_t size,
			   dma_addr_t *dma_addr, gfp_t gfp,
			   unsigned long attrs)
{
	void *cpu_addr = dma_direct_alloc(dev, size, dma_addr, gfp, attrs);

	if (!cpu_addr) {
		dev_err(dev, "%s: failed\n", __func__);
		return NULL;
	}

	if (pin_and_mark_dma_addr(dev, size, *dma_addr))
		goto out_free;

	return cpu_addr;

out_free:
	dma_direct_free(dev, size, cpu_addr, *dma_addr, attrs);
	return NULL;
}

static void coiommu_free(struct device *dev, size_t size, void *cpu_addr,
			 dma_addr_t dma_addr, unsigned long attrs)
{
	dma_direct_free(dev, size, cpu_addr, dma_addr, attrs);

	unmark_dma_addr(dev, size, dma_addr);
}

static struct page *coiommu_alloc_pages(struct device *dev, size_t size,
					dma_addr_t *dma_handle,
					enum dma_data_direction dir,
					gfp_t gfp)
{
	struct page *page = dma_direct_alloc_pages(dev, size, dma_handle,
						   dir, gfp);
	if (!page) {
		dev_err(dev, "%s: failed\n", __func__);
		return NULL;
	}

	if (pin_and_mark_dma_addr(dev, size, *dma_handle))
		goto out_free;

	return page;

out_free:
	dma_direct_free_pages(dev, size, page, *dma_handle, dir);
	return NULL;
}

static void coiommu_free_pages(struct device *dev, size_t size,
			       struct page *page, dma_addr_t dma_handle,
			       enum dma_data_direction dir)
{
	dma_direct_free_pages(dev, size, page, dma_handle, dir);

	unmark_dma_addr(dev, size, dma_handle);
}

static dma_addr_t coiommu_map_page(struct device *dev, struct page *page,
		unsigned long offset, size_t size, enum dma_data_direction dir,
		unsigned long attrs)
{
	dma_addr_t dma_addr = dma_direct_map_page(dev, page, offset,
						  size, dir, attrs);
	if (dma_addr == DMA_MAPPING_ERROR) {
		dev_err(dev, "%s: failed\n", __func__);
		return dma_addr;
	}

	if (pin_and_mark_dma_addr(dev, size, dma_addr))
		goto out_unmap;

	return dma_addr;

out_unmap:
	dma_direct_unmap_page(dev, dma_addr, size, dir,
			      attrs | DMA_ATTR_SKIP_CPU_SYNC);
	return DMA_MAPPING_ERROR;
}

static void coiommu_unmap_page(struct device *dev, dma_addr_t addr, size_t size,
			       enum dma_data_direction dir, unsigned long attrs)
{
	dma_direct_unmap_page(dev, addr, size, dir, attrs);

	unmark_dma_addr(dev, size, addr);
}

static int coiommu_map_sg(struct device *dev, struct scatterlist *sgl,
			  int nents, enum dma_data_direction dir,
			  unsigned long attrs)
{
	nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs);
	if (!nents) {
		dev_err(dev, "%s: failed\n", __func__);
		return 0;
	}

	if (pin_and_mark_sg_list(dev, sgl, nents))
		goto out_unmap;

	return nents;

out_unmap:
	dma_direct_unmap_sg(dev, sgl, nents, dir,
			    attrs | DMA_ATTR_SKIP_CPU_SYNC);
	return 0;
}

static void coiommu_unmap_sg(struct device *dev, struct scatterlist *sgl,
			     int nents, enum dma_data_direction dir,
			     unsigned long attrs)
{
	dma_direct_unmap_sg(dev, sgl, nents, dir, attrs);

	unmark_sg(sgl, nents, false);
}

static const struct dma_map_ops coiommu_ops = {
	.alloc = coiommu_alloc,
	.free = coiommu_free,
	.alloc_pages = coiommu_alloc_pages,
	.free_pages = coiommu_free_pages,
	.mmap = dma_direct_mmap,
	.get_sgtable = dma_direct_get_sgtable,
	.map_page = coiommu_map_page,
	.unmap_page = coiommu_unmap_page,
	.map_sg = coiommu_map_sg,
	.unmap_sg = coiommu_unmap_sg,
	.map_resource = dma_direct_map_resource,
	.sync_single_for_cpu = dma_direct_sync_single_for_cpu,
	.sync_single_for_device = dma_direct_sync_single_for_device,
	.sync_sg_for_cpu = dma_direct_sync_sg_for_cpu,
	.sync_sg_for_device = dma_direct_sync_sg_for_device,
	.dma_supported = dma_direct_supported,
	.get_required_mask = dma_direct_get_required_mask,
	.max_mapping_size = dma_direct_max_mapping_size,
};

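/*
 * Number of DTT levels needed so the table can index every pfn the
 * platform may have: the last level covers COIOMMU_PT_LEVEL_STRIDE bits
 * of the pfn, each additional level covers COIOMMU_UPPER_LEVEL_STRIDE
 * more bits.
 */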
static inline unsigned int get_dtt_level(void)
{
	unsigned int pfn_width;

	pfn_width = MAX_PHYSMEM_BITS - PAGE_SHIFT;

	if (pfn_width <= COIOMMU_PT_LEVEL_STRIDE)
		return 1;

	return DIV_ROUND_UP((pfn_width - COIOMMU_PT_LEVEL_STRIDE),
			    COIOMMU_UPPER_LEVEL_STRIDE) + 1;
}

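/*
 * Recursively free the DTT subtree rooted at @pt, which sits at @level.
 */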
static void dtt_free(void *pt, unsigned int level)
{
	struct dtt_parent_entry *pte;
	u64 phys;
	int i;

	/*
	 * The last level contains the DMA tracking entries which don't
	 * point to any physical memory, so nothing needs to be freed
	 * except the page itself.
	 */
	if (level == DTT_LAST_LEVEL)
		goto free;

	for (i = 0; i < 1 << COIOMMU_UPPER_LEVEL_STRIDE; i++) {
		pte = (struct dtt_parent_entry *)pt + i;
		if (!parent_pte_present(pte))
			continue;
		phys = parent_pte_addr(pte);
		dtt_free(phys_to_virt(phys), level - 1);
	}
free:
	free_page((unsigned long)pt);
}

static void dtt_root_free(struct coiommu_dtt *dtt)
{
	dtt_free((void *)dtt->root, dtt->level);
	dtt->root = NULL;
	dtt->level = 0;
}

static int populate_dtt_page_cache(struct dtt_page_cache *c,
				   int count, gfp_t gfp_mask)
{
	void *obj;

	while (c->nobjs < count) {
		obj = (void *)get_zeroed_page(gfp_mask);
		if (!obj)
			break;
		c->objects[c->nobjs++] = obj;
	}

	return c->nobjs;
}

static void dtt_page_cache_free(struct coiommu_dtt *dtt)
{
	struct dtt_page_cache *c;
	int i;

	for (i = 0; i < ARRAY_SIZE(dtt->cache); i++) {
		c = &dtt->cache[i];
		while (c->nobjs)
			free_page((unsigned long)c->objects[--c->nobjs]);
	}
}

static int dtt_page_cache_alloc(struct coiommu_dtt *dtt)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(dtt->cache); i++) {
		if (!populate_dtt_page_cache(&dtt->cache[i],
				COIOMMU_INFO_NR_OBJS, GFP_KERNEL_ACCOUNT)) {
			goto free;
		}
	}

	return 0;
free:
	dtt_page_cache_free(dtt);
	return -ENOMEM;
}

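/*
 * Kthread work implementing the double-buffered page cache: when the
 * current cache hands out its last page, dtt_alloc_page() queues this
 * work, which switches allocation to the other cache and refills the
 * exhausted one in process context.
 */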
static void alloc_dtt_pages(struct kthread_work *work)
{
	struct coiommu_dtt *dtt =
		container_of(work, struct coiommu_dtt, alloc_work);
	int prev_cache = dtt->cur_cache;
	unsigned long flags;
	int nobjs;

	spin_lock_irqsave(&dtt->alloc_lock, flags);
	dtt->cur_cache = !dtt->cur_cache;
	spin_unlock_irqrestore(&dtt->alloc_lock, flags);

	nobjs = populate_dtt_page_cache(&dtt->cache[prev_cache],
				COIOMMU_INFO_NR_OBJS, GFP_KERNEL_ACCOUNT);
	if (nobjs != COIOMMU_INFO_NR_OBJS)
		pr_warn("%s: coiommu: cache%d supposed to get %d pages but got %d\n",
			__func__, prev_cache, COIOMMU_INFO_NR_OBJS, nobjs);
}

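/*
 * Install the coIOMMU dma_map_ops on @dev if it is one of the endpoints
 * the coIOMMU was told to manage.  Returns -EPROBE_DEFER when the
 * endpoint probes before the coIOMMU device has registered its ops.
 */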
static int coiommu_setup_endpoint(struct device *dev)
{
	struct coiommu *coiommu = NULL;
	int i;

	if (!global_coiommu || !global_coiommu->endpoints)
		return 0;

	for (i = 0; i < global_coiommu->ep_count; i++) {
		if (get_pci_device_id(dev) == global_coiommu->endpoints[i]) {
			coiommu = global_coiommu;
			break;
		}
	}

	/*
	 * Device is not behind the coIOMMU, so there is nothing to set up.
	 */
	if (!coiommu)
		return 0;

	if (!coiommu->dev_ops) {
		dev_info(dev, "%s: probed earlier than coiommu, deferring\n", __func__);
		return -EPROBE_DEFER;
	}

	set_dma_ops(dev, &coiommu_ops);
	return 0;
}

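/*
 * DTT shrinker: dtt_shrink_count() reports how many DTT pages exist and
 * dtt_shrink_scan() walks the tree, freeing page-table pages that have
 * become all-zero, with the device's unpin processing parked (via the
 * park_unpin op) while the walk runs.
 */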
static unsigned long
dtt_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct coiommu_dtt *dtt = container_of(shrink, struct coiommu_dtt,
					       dtt_shrinker);

	return atomic_read(&dtt->pages);
}

static unsigned int dtt_shrink(struct coiommu_dtt *dtt,
			       struct dtt_parent_entry *parentpt,
			       void *pt, unsigned int level,
			       bool *pt_freed)
{
	unsigned int free_count = 0;
	struct dtt_parent_entry *pte;
	bool has_child = false;
	u64 phys;
	int i;

	if (level != DTT_LAST_LEVEL) {
		for (i = 0; i < 1 << COIOMMU_UPPER_LEVEL_STRIDE; i++) {
			bool child_freed = false;

			pte = (struct dtt_parent_entry *)pt + i;
			if (!parent_pte_present(pte))
				continue;
			phys = parent_pte_addr(pte);
			free_count += dtt_shrink(dtt, pte, phys_to_virt(phys),
						 level - 1, &child_freed);
			has_child |= !child_freed;
		}
	}

	if (!has_child && parentpt) {
		unsigned long flags;

		write_lock_irqsave(&dtt->lock, flags);
		if (!memcmp(pt, dtt->zero_page, PAGE_SIZE)) {
			free_page((unsigned long)pt);
			atomic_dec(&dtt->pages);
			parentpt->val = 0;
			if (pt_freed)
				*pt_freed = true;
			free_count += 1;
		}
		write_unlock_irqrestore(&dtt->lock, flags);
	}

	return free_count;
}

static unsigned long
dtt_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct coiommu *coiommu = container_of(shrink, struct coiommu,
					       dtt.dtt_shrinker);
	struct coiommu_dtt *dtt = &coiommu->dtt;
	unsigned int total = atomic_read(&dtt->pages);
	unsigned int free;

	coiommu->dev_ops->park_unpin(coiommu->dev, true);
	free = dtt_shrink(dtt, NULL, (void *)dtt->root, dtt->level, NULL);
	coiommu->dev_ops->park_unpin(coiommu->dev, false);

	if (free)
		pr_info("coiommu: DTT pages total %u free %u\n", total, free);

	return free ? free : SHRINK_STOP;
}

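/*
 * Allocate the DTT root and its supporting infrastructure (page caches
 * and the refill worker), and report the DTT root physical address and
 * level back through @dtt_addr and @dtt_level.
 */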
int coiommu_enable_dtt(u64 *dtt_addr, u64 *dtt_level)
{
	struct coiommu_dtt *dtt;
	int ret;

	if (!global_coiommu) {
		pr_err("%s: coiommu does not exist\n", __func__);
		return -EINVAL;
	}

	dtt = &global_coiommu->dtt;
	dtt->root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!dtt->root)
		return -ENOMEM;
	dtt->level = get_dtt_level();

	ret = dtt_page_cache_alloc(dtt);
	if (ret)
		goto free_root;
	dtt->cur_cache = 0;

	dtt->worker = kthread_create_worker(0, "coiommu_pagecache_alloc");
	if (IS_ERR(dtt->worker)) {
		ret = PTR_ERR(dtt->worker);
		goto free_page_cache;
	}
	kthread_init_work(&dtt->alloc_work, alloc_dtt_pages);

	atomic_set(&dtt->pages, 0);

	if (dtt_addr)
		*dtt_addr = (u64)__pa(dtt->root);
	if (dtt_level)
		*dtt_level = (u64)dtt->level;
	/*
	 * It is possible that the same guest physical page will be mapped
	 * at the same time by different CPUs, so the map_count can be
	 * increased concurrently by multiple CPU threads (see mark_pfn).
	 * To prevent the map_count from exceeding DTTE_MAP_CNT_MASK, set
	 * the max map_count to DTTE_MAP_CNT_MASK - num_possible_cpus().
	 */
	dtt->max_map_count = DTTE_MAP_CNT_MASK - num_possible_cpus();
	pr_info("%s: coiommu max map_count: 0x%x\n",
		__func__, dtt->max_map_count);

	return 0;

free_page_cache:
	dtt_page_cache_free(dtt);
free_root:
	free_page((unsigned long)dtt->root);
	dtt->root = NULL;
	pr_err("%s: failed with error %d\n", __func__, ret);
	return ret;
}

void coiommu_disable_dtt(void)
{
	struct coiommu_dtt *dtt;
	unsigned long flags;

	if (!global_coiommu)
		return;

	dtt = &global_coiommu->dtt;
	if (!dtt->root)
		return;

	write_lock_irqsave(&dtt->lock, flags);
	kthread_destroy_worker(dtt->worker);
	dtt_page_cache_free(dtt);
	dtt_root_free(dtt);
	write_unlock_irqrestore(&dtt->lock, flags);
}

int coiommu_setup_dev_ops(const struct coiommu_dev_ops *ops, void *dev)
{
	struct coiommu_dtt *dtt;

	if (!ops)
		return -EINVAL;

	if (!global_coiommu)
		return -ENODEV;

	/*
	 * If the dev ops have already been set up, the coiommu is already
	 * owned by a driver and we only got here because that driver was
	 * removed and probed again.  Re-binding cannot bring the coiommu
	 * back, because the removal already cleared the DTT which held the
	 * previous mapping and pinning state.
	 */
	if (global_coiommu->dev_ops)
		return -EBUSY;

	global_coiommu->dev_ops = ops;
	global_coiommu->dev = dev;

	if (!ops->park_unpin)
		return 0;

	dtt = &global_coiommu->dtt;

	dtt->zero_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!dtt->zero_page)
		return -ENOMEM;
	dtt->dtt_shrinker.count_objects = dtt_shrink_count;
	dtt->dtt_shrinker.scan_objects = dtt_shrink_scan;
	dtt->dtt_shrinker.seeks = DEFAULT_SEEKS;

	return register_shrinker(&dtt->dtt_shrinker);
}

int coiommu_configure(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	if (pdev->vendor == PCI_VENDOR_ID_COIOMMU &&
	    pdev->device == PCI_DEVICE_ID_COIOMMU)
		return 0;

	return coiommu_setup_endpoint(dev);
}

static void coiommu_set_endpoints(struct coiommu *coiommu,
				  unsigned short ep_count,
				  unsigned short *endpoints)
{
	if (!endpoints)
		return;

	coiommu->endpoints = kcalloc(ep_count,
				     sizeof(unsigned short), GFP_KERNEL);
	if (!coiommu->endpoints)
		return;

	memcpy(coiommu->endpoints, endpoints,
	       ep_count * sizeof(unsigned short));
	coiommu->ep_count = ep_count;
}

void coiommu_init(unsigned short ep_count, unsigned short *endpoints)
{
	/*
	 * If the coiommu has already been created, this is not the first
	 * init call; keep using the existing instance.
	 */
	if (global_coiommu) {
		pr_warn("%s: coiommu is already initialized\n", __func__);
		return;
	}

	global_coiommu = kzalloc(sizeof(struct coiommu), GFP_KERNEL);
	if (!global_coiommu)
		return;

	rwlock_init(&global_coiommu->dtt.lock);
	spin_lock_init(&global_coiommu->dtt.alloc_lock);
	coiommu_set_endpoints(global_coiommu, ep_count, endpoints);
}