mirror of https://github.com/Qortal/Brooklyn
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
490 lines
13 KiB
490 lines
13 KiB
// SPDX-License-Identifier: GPL-2.0-only |
|
/* |
|
 * VFIO PCI NVIDIA Witherspoon GPU support a.k.a. NVLink2. |
|
* |
|
* Copyright (C) 2018 IBM Corp. All rights reserved. |
|
* Author: Alexey Kardashevskiy <[email protected]> |
|
* |
|
* Register an on-GPU RAM region for cacheable access. |
|
* |
|
* Derived from original vfio_pci_igd.c: |
|
* Copyright (C) 2016 Red Hat, Inc. All rights reserved. |
|
* Author: Alex Williamson <[email protected]> |
|
*/ |
|
|
|
#include <linux/io.h> |
|
#include <linux/pci.h> |
|
#include <linux/uaccess.h> |
|
#include <linux/vfio.h> |
|
#include <linux/sched/mm.h> |
|
#include <linux/mmu_context.h> |
|
#include <asm/kvm_ppc.h> |
|
#include "vfio_pci_private.h" |
|
|
|
/*
 * CREATE_TRACE_POINTS is defined before pulling in trace.h so that the
 * tracepoints declared there are emitted from this file.
 */
#define CREATE_TRACE_POINTS
#include "trace.h"

/* Export the tracepoints used by the mmap paths below to GPL modules. */
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
|
|
|
struct vfio_pci_nvgpu_data { |
|
unsigned long gpu_hpa; /* GPU RAM physical address */ |
|
unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ |
|
unsigned long useraddr; /* GPU RAM userspace address */ |
|
unsigned long size; /* Size of the GPU RAM window (usually 128GB) */ |
|
struct mm_struct *mm; |
|
struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */ |
|
struct pci_dev *gpdev; |
|
struct notifier_block group_notifier; |
|
}; |
|
|
|
static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev, |
|
char __user *buf, size_t count, loff_t *ppos, bool iswrite) |
|
{ |
|
unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; |
|
struct vfio_pci_nvgpu_data *data = vdev->region[i].data; |
|
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; |
|
loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK; |
|
size_t sizealigned; |
|
void __iomem *ptr; |
|
|
|
if (pos >= vdev->region[i].size) |
|
return -EINVAL; |
|
|
|
count = min(count, (size_t)(vdev->region[i].size - pos)); |
|
|
|
/* |
|
* We map only a bit of GPU RAM for a short time instead of mapping it |
|
* for the guest lifetime as: |
|
* |
|
* 1) we do not know GPU RAM size, only aperture which is 4-8 times |
|
* bigger than actual RAM size (16/32GB RAM vs. 128GB aperture); |
|
* 2) mapping GPU RAM allows CPU to prefetch and if this happens |
|
* before NVLink bridge is reset (which fences GPU RAM), |
|
* hardware management interrupts (HMI) might happen, this |
|
* will freeze NVLink bridge. |
|
* |
|
* This is not fast path anyway. |
|
*/ |
|
sizealigned = ALIGN(posoff + count, PAGE_SIZE); |
|
ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned); |
|
if (!ptr) |
|
return -EFAULT; |
|
|
|
if (iswrite) { |
|
if (copy_from_user(ptr + posoff, buf, count)) |
|
count = -EFAULT; |
|
else |
|
*ppos += count; |
|
} else { |
|
if (copy_to_user(buf, ptr + posoff, count)) |
|
count = -EFAULT; |
|
else |
|
*ppos += count; |
|
} |
|
|
|
iounmap(ptr); |
|
|
|
return count; |
|
} |
|
|
|
static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev, |
|
struct vfio_pci_region *region) |
|
{ |
|
struct vfio_pci_nvgpu_data *data = region->data; |
|
long ret; |
|
|
|
/* If there were any mappings at all... */ |
|
if (data->mm) { |
|
if (data->mem) { |
|
ret = mm_iommu_put(data->mm, data->mem); |
|
WARN_ON(ret); |
|
} |
|
|
|
mmdrop(data->mm); |
|
} |
|
|
|
vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, |
|
&data->group_notifier); |
|
|
|
pnv_npu2_unmap_lpar_dev(data->gpdev); |
|
|
|
kfree(data); |
|
} |
|
|
|
static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf) |
|
{ |
|
vm_fault_t ret; |
|
struct vm_area_struct *vma = vmf->vma; |
|
struct vfio_pci_region *region = vma->vm_private_data; |
|
struct vfio_pci_nvgpu_data *data = region->data; |
|
unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
|
unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT; |
|
unsigned long vm_pgoff = vma->vm_pgoff & |
|
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); |
|
unsigned long pfn = nv2pg + vm_pgoff + vmf_off; |
|
|
|
ret = vmf_insert_pfn(vma, vmf->address, pfn); |
|
trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT, |
|
vmf->address, ret); |
|
|
|
return ret; |
|
} |
|
|
|
static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = { |
|
.fault = vfio_pci_nvgpu_mmap_fault, |
|
}; |
|
|
|
static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev, |
|
struct vfio_pci_region *region, struct vm_area_struct *vma) |
|
{ |
|
int ret; |
|
struct vfio_pci_nvgpu_data *data = region->data; |
|
|
|
if (data->useraddr) |
|
return -EPERM; |
|
|
|
if (vma->vm_end - vma->vm_start > data->size) |
|
return -EINVAL; |
|
|
|
vma->vm_private_data = region; |
|
vma->vm_flags |= VM_PFNMAP; |
|
vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops; |
|
|
|
/* |
|
* Calling mm_iommu_newdev() here once as the region is not |
|
* registered yet and therefore right initialization will happen now. |
|
* Other places will use mm_iommu_find() which returns |
|
* registered @mem and does not go gup(). |
|
*/ |
|
data->useraddr = vma->vm_start; |
|
data->mm = current->mm; |
|
|
|
mmgrab(data->mm); |
|
ret = (int) mm_iommu_newdev(data->mm, data->useraddr, |
|
vma_pages(vma), data->gpu_hpa, &data->mem); |
|
|
|
trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr, |
|
vma->vm_end - vma->vm_start, ret); |
|
|
|
return ret; |
|
} |
|
|
|
static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev, |
|
struct vfio_pci_region *region, struct vfio_info_cap *caps) |
|
{ |
|
struct vfio_pci_nvgpu_data *data = region->data; |
|
struct vfio_region_info_cap_nvlink2_ssatgt cap = { |
|
.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, |
|
.header.version = 1, |
|
.tgt = data->gpu_tgt |
|
}; |
|
|
|
return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); |
|
} |
|
|
|
static const struct vfio_pci_regops vfio_pci_nvgpu_regops = { |
|
.rw = vfio_pci_nvgpu_rw, |
|
.release = vfio_pci_nvgpu_release, |
|
.mmap = vfio_pci_nvgpu_mmap, |
|
.add_capability = vfio_pci_nvgpu_add_capability, |
|
}; |
|
|
|
static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb, |
|
unsigned long action, void *opaque) |
|
{ |
|
struct kvm *kvm = opaque; |
|
struct vfio_pci_nvgpu_data *data = container_of(nb, |
|
struct vfio_pci_nvgpu_data, |
|
group_notifier); |
|
|
|
if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm && |
|
pnv_npu2_map_lpar_dev(data->gpdev, |
|
kvm->arch.lpid, MSR_DR | MSR_PR)) |
|
return NOTIFY_BAD; |
|
|
|
return NOTIFY_OK; |
|
} |
|
|
|
int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) |
|
{ |
|
int ret; |
|
u64 reg[2]; |
|
u64 tgt = 0; |
|
struct device_node *npu_node, *mem_node; |
|
struct pci_dev *npu_dev; |
|
struct vfio_pci_nvgpu_data *data; |
|
uint32_t mem_phandle = 0; |
|
unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM; |
|
|
|
/* |
|
* PCI config space does not tell us about NVLink presense but |
|
* platform does, use this. |
|
*/ |
|
npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0); |
|
if (!npu_dev) |
|
return -ENODEV; |
|
|
|
npu_node = pci_device_to_OF_node(npu_dev); |
|
if (!npu_node) |
|
return -EINVAL; |
|
|
|
if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) |
|
return -ENODEV; |
|
|
|
mem_node = of_find_node_by_phandle(mem_phandle); |
|
if (!mem_node) |
|
return -EINVAL; |
|
|
|
if (of_property_read_variable_u64_array(mem_node, "reg", reg, |
|
ARRAY_SIZE(reg), ARRAY_SIZE(reg)) != |
|
ARRAY_SIZE(reg)) |
|
return -EINVAL; |
|
|
|
if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { |
|
dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); |
|
return -EFAULT; |
|
} |
|
|
|
data = kzalloc(sizeof(*data), GFP_KERNEL); |
|
if (!data) |
|
return -ENOMEM; |
|
|
|
data->gpu_hpa = reg[0]; |
|
data->gpu_tgt = tgt; |
|
data->size = reg[1]; |
|
|
|
dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa, |
|
data->gpu_hpa + data->size - 1); |
|
|
|
data->gpdev = vdev->pdev; |
|
data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier; |
|
|
|
ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, |
|
&events, &data->group_notifier); |
|
if (ret) |
|
goto free_exit; |
|
|
|
/* |
|
* We have just set KVM, we do not need the listener anymore. |
|
* Also, keeping it registered means that if more than one GPU is |
|
* assigned, we will get several similar notifiers notifying about |
|
* the same device again which does not help with anything. |
|
*/ |
|
vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, |
|
&data->group_notifier); |
|
|
|
ret = vfio_pci_register_dev_region(vdev, |
|
PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, |
|
VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, |
|
&vfio_pci_nvgpu_regops, |
|
data->size, |
|
VFIO_REGION_INFO_FLAG_READ | |
|
VFIO_REGION_INFO_FLAG_WRITE | |
|
VFIO_REGION_INFO_FLAG_MMAP, |
|
data); |
|
if (ret) |
|
goto free_exit; |
|
|
|
return 0; |
|
free_exit: |
|
kfree(data); |
|
|
|
return ret; |
|
} |
|
|
|
/* |
|
* IBM NPU2 bridge |
|
*/ |
|
/*
 * Private state attached to the IBM NPU2 ATSD device region.
 *
 * @base:       ATSD register virtual address, for emulated access
 * @mmio_atsd:  ATSD physical address; zero when no ATSD register is
 *              available
 * @gpu_tgt:    TGT address of the corresponding GPU RAM
 * @link_speed: the link speed from the DT's ibm,nvlink-speed
 */
struct vfio_pci_npu2_data {
	void *base;
	unsigned long mmio_atsd;
	unsigned long gpu_tgt;
	unsigned int link_speed;
};
|
|
|
static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev, |
|
char __user *buf, size_t count, loff_t *ppos, bool iswrite) |
|
{ |
|
unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; |
|
struct vfio_pci_npu2_data *data = vdev->region[i].data; |
|
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; |
|
|
|
if (pos >= vdev->region[i].size) |
|
return -EINVAL; |
|
|
|
count = min(count, (size_t)(vdev->region[i].size - pos)); |
|
|
|
if (iswrite) { |
|
if (copy_from_user(data->base + pos, buf, count)) |
|
return -EFAULT; |
|
} else { |
|
if (copy_to_user(buf, data->base + pos, count)) |
|
return -EFAULT; |
|
} |
|
*ppos += count; |
|
|
|
return count; |
|
} |
|
|
|
static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev, |
|
struct vfio_pci_region *region, struct vm_area_struct *vma) |
|
{ |
|
int ret; |
|
struct vfio_pci_npu2_data *data = region->data; |
|
unsigned long req_len = vma->vm_end - vma->vm_start; |
|
|
|
if (req_len != PAGE_SIZE) |
|
return -EINVAL; |
|
|
|
vma->vm_flags |= VM_PFNMAP; |
|
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
|
|
|
ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT, |
|
req_len, vma->vm_page_prot); |
|
trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start, |
|
vma->vm_end - vma->vm_start, ret); |
|
|
|
return ret; |
|
} |
|
|
|
static void vfio_pci_npu2_release(struct vfio_pci_device *vdev, |
|
struct vfio_pci_region *region) |
|
{ |
|
struct vfio_pci_npu2_data *data = region->data; |
|
|
|
memunmap(data->base); |
|
kfree(data); |
|
} |
|
|
|
static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev, |
|
struct vfio_pci_region *region, struct vfio_info_cap *caps) |
|
{ |
|
struct vfio_pci_npu2_data *data = region->data; |
|
struct vfio_region_info_cap_nvlink2_ssatgt captgt = { |
|
.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, |
|
.header.version = 1, |
|
.tgt = data->gpu_tgt |
|
}; |
|
struct vfio_region_info_cap_nvlink2_lnkspd capspd = { |
|
.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, |
|
.header.version = 1, |
|
.link_speed = data->link_speed |
|
}; |
|
int ret; |
|
|
|
ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt)); |
|
if (ret) |
|
return ret; |
|
|
|
return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd)); |
|
} |
|
|
|
static const struct vfio_pci_regops vfio_pci_npu2_regops = { |
|
.rw = vfio_pci_npu2_rw, |
|
.mmap = vfio_pci_npu2_mmap, |
|
.release = vfio_pci_npu2_release, |
|
.add_capability = vfio_pci_npu2_add_capability, |
|
}; |
|
|
|
int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) |
|
{ |
|
int ret; |
|
struct vfio_pci_npu2_data *data; |
|
struct device_node *nvlink_dn; |
|
u32 nvlink_index = 0, mem_phandle = 0; |
|
struct pci_dev *npdev = vdev->pdev; |
|
struct device_node *npu_node = pci_device_to_OF_node(npdev); |
|
struct pci_controller *hose = pci_bus_to_host(npdev->bus); |
|
u64 mmio_atsd = 0; |
|
u64 tgt = 0; |
|
u32 link_speed = 0xff; |
|
|
|
/* |
|
* PCI config space does not tell us about NVLink presense but |
|
* platform does, use this. |
|
*/ |
|
if (!pnv_pci_get_gpu_dev(vdev->pdev)) |
|
return -ENODEV; |
|
|
|
if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) |
|
return -ENODEV; |
|
|
|
/* |
|
* NPU2 normally has 8 ATSD registers (for concurrency) and 6 links |
|
* so we can allocate one register per link, using nvlink index as |
|
* a key. |
|
* There is always at least one ATSD register so as long as at least |
|
* NVLink bridge #0 is passed to the guest, ATSD will be available. |
|
*/ |
|
nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); |
|
if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", |
|
&nvlink_index))) |
|
return -ENODEV; |
|
|
|
if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index, |
|
&mmio_atsd)) { |
|
if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0, |
|
&mmio_atsd)) { |
|
dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); |
|
mmio_atsd = 0; |
|
} else { |
|
dev_warn(&vdev->pdev->dev, |
|
"Using fallback ibm,mmio-atsd[0] for ATSD.\n"); |
|
} |
|
} |
|
|
|
if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { |
|
dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); |
|
return -EFAULT; |
|
} |
|
|
|
if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) { |
|
dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n"); |
|
return -EFAULT; |
|
} |
|
|
|
data = kzalloc(sizeof(*data), GFP_KERNEL); |
|
if (!data) |
|
return -ENOMEM; |
|
|
|
data->mmio_atsd = mmio_atsd; |
|
data->gpu_tgt = tgt; |
|
data->link_speed = link_speed; |
|
if (data->mmio_atsd) { |
|
data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT); |
|
if (!data->base) { |
|
ret = -ENOMEM; |
|
goto free_exit; |
|
} |
|
} |
|
|
|
/* |
|
* We want to expose the capability even if this specific NVLink |
|
* did not get its own ATSD register because capabilities |
|
* belong to VFIO regions and normally there will be ATSD register |
|
* assigned to the NVLink bridge. |
|
*/ |
|
ret = vfio_pci_register_dev_region(vdev, |
|
PCI_VENDOR_ID_IBM | |
|
VFIO_REGION_TYPE_PCI_VENDOR_TYPE, |
|
VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, |
|
&vfio_pci_npu2_regops, |
|
data->mmio_atsd ? PAGE_SIZE : 0, |
|
VFIO_REGION_INFO_FLAG_READ | |
|
VFIO_REGION_INFO_FLAG_WRITE | |
|
VFIO_REGION_INFO_FLAG_MMAP, |
|
data); |
|
if (ret) |
|
goto free_exit; |
|
|
|
return 0; |
|
|
|
free_exit: |
|
if (data->base) |
|
memunmap(data->base); |
|
kfree(data); |
|
|
|
return ret; |
|
}
|
|
|