// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (C) 2018-2020 Intel Corporation.
 * Copyright (C) 2020 Red Hat, Inc.
 *
 * Author: Tiwei Bie <[email protected]>
 *         Jason Wang <[email protected]>
 *
 * Thanks Michael S. Tsirkin for the valuable comments and
 * suggestions. And thanks to Cunming Liang and Zhihong Wang for all
 * their support.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/uuid.h>
#include <linux/vdpa.h>
#include <linux/nospec.h>
#include <linux/vhost.h>

#include "vhost.h"

enum {
	VHOST_VDPA_BACKEND_FEATURES =
		(1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) |
		(1ULL << VHOST_BACKEND_F_IOTLB_BATCH),
};

#define VHOST_VDPA_DEV_MAX (1U << MINORBITS)

struct vhost_vdpa {
	struct vhost_dev vdev;
	struct iommu_domain *domain;
	struct vhost_virtqueue *vqs;
	struct completion completion;
	struct vdpa_device *vdpa;
	struct device dev;
	struct cdev cdev;
	atomic_t opened;
	int nvqs;
	int virtio_id;
	int minor;
	struct eventfd_ctx *config_ctx;
	int in_batch;
	struct vdpa_iova_range range;
};

static DEFINE_IDA(vhost_vdpa_ida);

static dev_t vhost_vdpa_major;
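
/* Relay a guest kick, queued on the vhost worker, to the vDPA device. */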
static void handle_vq_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
	const struct vdpa_config_ops *ops = v->vdpa->config;

	ops->kick_vq(v->vdpa, vq - v->vqs);
}

static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
{
	struct vhost_virtqueue *vq = private;
	struct eventfd_ctx *call_ctx = vq->call_ctx.ctx;

	if (call_ctx)
		eventfd_signal(call_ctx, 1);

	return IRQ_HANDLED;
}

static irqreturn_t vhost_vdpa_config_cb(void *private)
{
	struct vhost_vdpa *v = private;
	struct eventfd_ctx *config_ctx = v->config_ctx;

	if (config_ctx)
		eventfd_signal(config_ctx, 1);

	return IRQ_HANDLED;
}
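
/*
 * Register the virtqueue's call eventfd as an irq bypass producer so the
 * device interrupt can signal userspace without bouncing through vhost.
 */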
static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
{
	struct vhost_virtqueue *vq = &v->vqs[qid];
	const struct vdpa_config_ops *ops = v->vdpa->config;
	struct vdpa_device *vdpa = v->vdpa;
	int ret, irq;

	if (!ops->get_vq_irq)
		return;

	irq = ops->get_vq_irq(vdpa, qid);
	irq_bypass_unregister_producer(&vq->call_ctx.producer);
	if (!vq->call_ctx.ctx || irq < 0)
		return;

	vq->call_ctx.producer.token = vq->call_ctx.ctx;
	vq->call_ctx.producer.irq = irq;
	ret = irq_bypass_register_producer(&vq->call_ctx.producer);
	if (unlikely(ret))
		dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n",
			 qid, vq->call_ctx.producer.token, ret);
}

static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
{
	struct vhost_virtqueue *vq = &v->vqs[qid];

	irq_bypass_unregister_producer(&vq->call_ctx.producer);
}

static int vhost_vdpa_reset(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;

	v->in_batch = 0;

	return vdpa_reset(vdpa);
}

static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u32 device_id;

	device_id = ops->get_device_id(vdpa);

	if (copy_to_user(argp, &device_id, sizeof(device_id)))
		return -EFAULT;

	return 0;
}

static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u8 status;

	status = ops->get_status(vdpa);

	if (copy_to_user(statusp, &status, sizeof(status)))
		return -EFAULT;

	return 0;
}

static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u8 status, status_old;
	int ret, nvqs = v->nvqs;
	u16 i;

	if (copy_from_user(&status, statusp, sizeof(status)))
		return -EFAULT;

	status_old = ops->get_status(vdpa);

	/*
	 * Userspace shouldn't remove status bits unless resetting the
	 * status to 0.
	 */
	if (status != 0 && (status_old & ~status) != 0)
		return -EINVAL;

	if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK))
		for (i = 0; i < nvqs; i++)
			vhost_vdpa_unsetup_vq_irq(v, i);

	if (status == 0) {
		ret = ops->reset(vdpa);
		if (ret)
			return ret;
	} else
		ops->set_status(vdpa, status);

	if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK))
		for (i = 0; i < nvqs; i++)
			vhost_vdpa_setup_vq_irq(v, i);

	return 0;
}
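
/* Reject config accesses that fall outside the device's config space. */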
static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
				      struct vhost_vdpa_config *c)
{
	struct vdpa_device *vdpa = v->vdpa;
	long size = vdpa->config->get_config_size(vdpa);

	if (c->len == 0)
		return -EINVAL;

	if (c->len > size - c->off)
		return -E2BIG;

	return 0;
}

static long vhost_vdpa_get_config(struct vhost_vdpa *v,
				  struct vhost_vdpa_config __user *c)
{
	struct vdpa_device *vdpa = v->vdpa;
	struct vhost_vdpa_config config;
	unsigned long size = offsetof(struct vhost_vdpa_config, buf);
	u8 *buf;

	if (copy_from_user(&config, c, size))
		return -EFAULT;
	if (vhost_vdpa_config_validate(v, &config))
		return -EINVAL;
	buf = kvzalloc(config.len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	vdpa_get_config(vdpa, config.off, buf, config.len);

	if (copy_to_user(c->buf, buf, config.len)) {
		kvfree(buf);
		return -EFAULT;
	}

	kvfree(buf);
	return 0;
}

static long vhost_vdpa_set_config(struct vhost_vdpa *v,
				  struct vhost_vdpa_config __user *c)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vhost_vdpa_config config;
	unsigned long size = offsetof(struct vhost_vdpa_config, buf);
	u8 *buf;

	if (copy_from_user(&config, c, size))
		return -EFAULT;
	if (vhost_vdpa_config_validate(v, &config))
		return -EINVAL;

	buf = vmemdup_user(c->buf, config.len);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	ops->set_config(vdpa, config.off, buf, config.len);

	kvfree(buf);
	return 0;
}

static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u64 features;

	features = ops->get_features(vdpa);

	if (copy_to_user(featurep, &features, sizeof(features)))
		return -EFAULT;

	return 0;
}

static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u64 features;

	/*
	 * It's not allowed to change the features after they have
	 * been negotiated.
	 */
	if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
		return -EBUSY;

	if (copy_from_user(&features, featurep, sizeof(features)))
		return -EFAULT;

	if (vdpa_set_features(vdpa, features))
		return -EINVAL;

	return 0;
}

static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	u16 num;

	num = ops->get_vq_num_max(vdpa);

	if (copy_to_user(argp, &num, sizeof(num)))
		return -EFAULT;

	return 0;
}

static void vhost_vdpa_config_put(struct vhost_vdpa *v)
{
	if (v->config_ctx) {
		eventfd_ctx_put(v->config_ctx);
		v->config_ctx = NULL;
	}
}
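
/*
 * Install (or tear down, via VHOST_FILE_UNBIND) the eventfd that the
 * device signals on config space changes.
 */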
static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp)
{
	struct vdpa_callback cb;
	int fd;
	struct eventfd_ctx *ctx;

	cb.callback = vhost_vdpa_config_cb;
	cb.private = v;
	if (copy_from_user(&fd, argp, sizeof(fd)))
		return -EFAULT;

	ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
	swap(ctx, v->config_ctx);

	if (!IS_ERR_OR_NULL(ctx))
		eventfd_ctx_put(ctx);

	if (IS_ERR(v->config_ctx)) {
		long ret = PTR_ERR(v->config_ctx);

		v->config_ctx = NULL;
		return ret;
	}

	v->vdpa->config->set_config_cb(v->vdpa, &cb);

	return 0;
}

static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp)
{
	struct vhost_vdpa_iova_range range = {
		.first = v->range.first,
		.last = v->range.last,
	};

	if (copy_to_user(argp, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}
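
/*
 * Per-virtqueue ioctls: the vring index is validated and hardened against
 * speculation, then vq state is shuttled between the vhost core and the
 * vDPA device as needed by each command.
 */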
static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
				   void __user *argp)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vdpa_vq_state vq_state;
	struct vdpa_callback cb;
	struct vhost_virtqueue *vq;
	struct vhost_vring_state s;
	u32 idx;
	long r;

	r = get_user(idx, (u32 __user *)argp);
	if (r < 0)
		return r;

	if (idx >= v->nvqs)
		return -ENOBUFS;

	idx = array_index_nospec(idx, v->nvqs);
	vq = &v->vqs[idx];

	switch (cmd) {
	case VHOST_VDPA_SET_VRING_ENABLE:
		if (copy_from_user(&s, argp, sizeof(s)))
			return -EFAULT;
		ops->set_vq_ready(vdpa, idx, s.num);
		return 0;
	case VHOST_GET_VRING_BASE:
		r = ops->get_vq_state(v->vdpa, idx, &vq_state);
		if (r)
			return r;

		vq->last_avail_idx = vq_state.split.avail_index;
		break;
	}

	r = vhost_vring_ioctl(&v->vdev, cmd, argp);
	if (r)
		return r;

	switch (cmd) {
	case VHOST_SET_VRING_ADDR:
		if (ops->set_vq_address(vdpa, idx,
					(u64)(uintptr_t)vq->desc,
					(u64)(uintptr_t)vq->avail,
					(u64)(uintptr_t)vq->used))
			r = -EINVAL;
		break;

	case VHOST_SET_VRING_BASE:
		vq_state.split.avail_index = vq->last_avail_idx;
		if (ops->set_vq_state(vdpa, idx, &vq_state))
			r = -EINVAL;
		break;

	case VHOST_SET_VRING_CALL:
		if (vq->call_ctx.ctx) {
			cb.callback = vhost_vdpa_virtqueue_cb;
			cb.private = vq;
		} else {
			cb.callback = NULL;
			cb.private = NULL;
		}
		ops->set_vq_cb(vdpa, idx, &cb);
		vhost_vdpa_setup_vq_irq(v, idx);
		break;

	case VHOST_SET_VRING_NUM:
		ops->set_vq_num(vdpa, idx, vq->num);
		break;
	}

	return r;
}

static long vhost_vdpa_unlocked_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vhost_vdpa *v = filep->private_data;
	struct vhost_dev *d = &v->vdev;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	u64 features;
	long r = 0;

	if (cmd == VHOST_SET_BACKEND_FEATURES) {
		if (copy_from_user(&features, featurep, sizeof(features)))
			return -EFAULT;
		if (features & ~VHOST_VDPA_BACKEND_FEATURES)
			return -EOPNOTSUPP;
		vhost_set_backend_features(&v->vdev, features);
		return 0;
	}

	mutex_lock(&d->mutex);

	switch (cmd) {
	case VHOST_VDPA_GET_DEVICE_ID:
		r = vhost_vdpa_get_device_id(v, argp);
		break;
	case VHOST_VDPA_GET_STATUS:
		r = vhost_vdpa_get_status(v, argp);
		break;
	case VHOST_VDPA_SET_STATUS:
		r = vhost_vdpa_set_status(v, argp);
		break;
	case VHOST_VDPA_GET_CONFIG:
		r = vhost_vdpa_get_config(v, argp);
		break;
	case VHOST_VDPA_SET_CONFIG:
		r = vhost_vdpa_set_config(v, argp);
		break;
	case VHOST_GET_FEATURES:
		r = vhost_vdpa_get_features(v, argp);
		break;
	case VHOST_SET_FEATURES:
		r = vhost_vdpa_set_features(v, argp);
		break;
	case VHOST_VDPA_GET_VRING_NUM:
		r = vhost_vdpa_get_vring_num(v, argp);
		break;
	case VHOST_SET_LOG_BASE:
	case VHOST_SET_LOG_FD:
		r = -ENOIOCTLCMD;
		break;
	case VHOST_VDPA_SET_CONFIG_CALL:
		r = vhost_vdpa_set_config_call(v, argp);
		break;
	case VHOST_GET_BACKEND_FEATURES:
		features = VHOST_VDPA_BACKEND_FEATURES;
		if (copy_to_user(featurep, &features, sizeof(features)))
			r = -EFAULT;
		break;
	case VHOST_VDPA_GET_IOVA_RANGE:
		r = vhost_vdpa_get_iova_range(v, argp);
		break;
	default:
		r = vhost_dev_ioctl(&v->vdev, cmd, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vdpa_vring_ioctl(v, cmd, argp);
		break;
	}

	mutex_unlock(&d->mutex);
	return r;
}
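
/*
 * Drop IOTLB entries in [start, last]: dirty and unpin every page backing
 * each mapping, and subtract the pages from the mm's pinned_vm accounting.
 */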
static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
{
	struct vhost_dev *dev = &v->vdev;
	struct vhost_iotlb *iotlb = dev->iotlb;
	struct vhost_iotlb_map *map;
	struct page *page;
	unsigned long pfn, pinned;

	while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
		pinned = PFN_DOWN(map->size);
		for (pfn = PFN_DOWN(map->addr);
		     pinned > 0; pfn++, pinned--) {
			page = pfn_to_page(pfn);
			if (map->perm & VHOST_ACCESS_WO)
				set_page_dirty_lock(page);
			unpin_user_page(page);
		}
		atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
		vhost_iotlb_map_free(iotlb, map);
	}
}

static void vhost_vdpa_va_unmap(struct vhost_vdpa *v, u64 start, u64 last)
{
	struct vhost_dev *dev = &v->vdev;
	struct vhost_iotlb *iotlb = dev->iotlb;
	struct vhost_iotlb_map *map;
	struct vdpa_map_file *map_file;

	while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
		map_file = (struct vdpa_map_file *)map->opaque;
		fput(map_file->file);
		kfree(map_file);
		vhost_iotlb_map_free(iotlb, map);
	}
}

static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
{
	struct vdpa_device *vdpa = v->vdpa;

	if (vdpa->use_va)
		return vhost_vdpa_va_unmap(v, start, last);

	return vhost_vdpa_pa_unmap(v, start, last);
}

static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)
{
	struct vhost_dev *dev = &v->vdev;

	vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1);
	kfree(dev->iotlb);
	dev->iotlb = NULL;
}

static int perm_to_iommu_flags(u32 perm)
{
	int flags = 0;

	switch (perm) {
	case VHOST_ACCESS_WO:
		flags |= IOMMU_WRITE;
		break;
	case VHOST_ACCESS_RO:
		flags |= IOMMU_READ;
		break;
	case VHOST_ACCESS_RW:
		flags |= (IOMMU_WRITE | IOMMU_READ);
		break;
	default:
		WARN(1, "invalid vhost IOTLB permission\n");
		break;
	}

	return flags | IOMMU_CACHE;
}
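
/*
 * Record the mapping in the vhost IOTLB, then push it to whichever
 * translation the device provides: its own dma_map(), a full set_map()
 * (deferred while batching), or the platform IOMMU domain.
 */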
static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
			  u64 size, u64 pa, u32 perm, void *opaque)
{
	struct vhost_dev *dev = &v->vdev;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	int r = 0;

	r = vhost_iotlb_add_range_ctx(dev->iotlb, iova, iova + size - 1,
				      pa, perm, opaque);
	if (r)
		return r;

	if (ops->dma_map) {
		r = ops->dma_map(vdpa, iova, size, pa, perm, opaque);
	} else if (ops->set_map) {
		if (!v->in_batch)
			r = ops->set_map(vdpa, dev->iotlb);
	} else {
		r = iommu_map(v->domain, iova, pa, size,
			      perm_to_iommu_flags(perm));
	}
	if (r) {
		vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
		return r;
	}

	if (!vdpa->use_va)
		atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);

	return 0;
}

static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
{
	struct vhost_dev *dev = &v->vdev;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1);

	if (ops->dma_map) {
		ops->dma_unmap(vdpa, iova, size);
	} else if (ops->set_map) {
		if (!v->in_batch)
			ops->set_map(vdpa, dev->iotlb);
	} else {
		iommu_unmap(v->domain, iova, size);
	}
}
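
/*
 * VA mode: walk the VMAs covering [uaddr, uaddr + size) and map each
 * file-backed, shared piece with a reference to its backing file; VMAs
 * that don't qualify are skipped rather than failing the whole request.
 */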
static int vhost_vdpa_va_map(struct vhost_vdpa *v,
			     u64 iova, u64 size, u64 uaddr, u32 perm)
{
	struct vhost_dev *dev = &v->vdev;
	u64 offset, map_size, map_iova = iova;
	struct vdpa_map_file *map_file;
	struct vm_area_struct *vma;
	int ret = 0;

	mmap_read_lock(dev->mm);

	while (size) {
		vma = find_vma(dev->mm, uaddr);
		if (!vma) {
			ret = -EINVAL;
			break;
		}
		map_size = min(size, vma->vm_end - uaddr);
		if (!(vma->vm_file && (vma->vm_flags & VM_SHARED) &&
			!(vma->vm_flags & (VM_IO | VM_PFNMAP))))
			goto next;

		map_file = kzalloc(sizeof(*map_file), GFP_KERNEL);
		if (!map_file) {
			ret = -ENOMEM;
			break;
		}
		offset = (vma->vm_pgoff << PAGE_SHIFT) + uaddr - vma->vm_start;
		map_file->offset = offset;
		map_file->file = get_file(vma->vm_file);
		ret = vhost_vdpa_map(v, map_iova, map_size, uaddr,
				     perm, map_file);
		if (ret) {
			fput(map_file->file);
			kfree(map_file);
			break;
		}
next:
		size -= map_size;
		uaddr += map_size;
		map_iova += map_size;
	}
	if (ret)
		vhost_vdpa_unmap(v, iova, map_iova - iova);

	mmap_read_unlock(dev->mm);

	return ret;
}
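
/*
 * PA mode: pin the user pages with pin_user_pages() (a page-sized batch
 * at a time), coalesce physically contiguous runs of pfns, and map each
 * run with a single vhost_vdpa_map() call.
 */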
static int vhost_vdpa_pa_map(struct vhost_vdpa *v,
			     u64 iova, u64 size, u64 uaddr, u32 perm)
{
	struct vhost_dev *dev = &v->vdev;
	struct page **page_list;
	unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
	unsigned int gup_flags = FOLL_LONGTERM;
	unsigned long npages, cur_base, map_pfn, last_pfn = 0;
	unsigned long lock_limit, sz2pin, nchunks, i;
	u64 start = iova;
	long pinned;
	int ret = 0;

	/* Limit the use of memory for bookkeeping */
	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	if (perm & VHOST_ACCESS_WO)
		gup_flags |= FOLL_WRITE;

	npages = PFN_UP(size + (iova & ~PAGE_MASK));
	if (!npages) {
		ret = -EINVAL;
		goto free;
	}

	mmap_read_lock(dev->mm);

	lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
	if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
		ret = -ENOMEM;
		goto unlock;
	}

	cur_base = uaddr & PAGE_MASK;
	iova &= PAGE_MASK;
	nchunks = 0;

	while (npages) {
		sz2pin = min_t(unsigned long, npages, list_size);
		pinned = pin_user_pages(cur_base, sz2pin,
					gup_flags, page_list, NULL);
		if (sz2pin != pinned) {
			if (pinned < 0) {
				ret = pinned;
			} else {
				unpin_user_pages(page_list, pinned);
				ret = -ENOMEM;
			}
			goto out;
		}
		nchunks++;

		if (!last_pfn)
			map_pfn = page_to_pfn(page_list[0]);

		for (i = 0; i < pinned; i++) {
			unsigned long this_pfn = page_to_pfn(page_list[i]);
			u64 csize;

			if (last_pfn && (this_pfn != last_pfn + 1)) {
				/* Map the preceding contiguous chunk */
				csize = PFN_PHYS(last_pfn - map_pfn + 1);
				ret = vhost_vdpa_map(v, iova, csize,
						     PFN_PHYS(map_pfn),
						     perm, NULL);
				if (ret) {
					/*
					 * Unpin the pages that are left unmapped
					 * from this point on in the current
					 * page_list. The remaining outstanding
					 * ones which may stride across several
					 * chunks will be covered in the common
					 * error path subsequently.
					 */
					unpin_user_pages(&page_list[i],
							 pinned - i);
					goto out;
				}

				map_pfn = this_pfn;
				iova += csize;
				nchunks = 0;
			}

			last_pfn = this_pfn;
		}

		cur_base += PFN_PHYS(pinned);
		npages -= pinned;
	}

	/* Map the remaining chunk */
	ret = vhost_vdpa_map(v, iova, PFN_PHYS(last_pfn - map_pfn + 1),
			     PFN_PHYS(map_pfn), perm, NULL);
out:
	if (ret) {
		if (nchunks) {
			unsigned long pfn;

			/*
			 * Unpin the outstanding pages which are yet to be
			 * mapped but haven't due to vdpa_map() or
			 * pin_user_pages() failure.
			 *
			 * Mapped pages are accounted in vdpa_map(), hence
			 * the corresponding unpinning will be handled by
			 * vdpa_unmap().
			 */
			WARN_ON(!last_pfn);
			for (pfn = map_pfn; pfn <= last_pfn; pfn++)
				unpin_user_page(pfn_to_page(pfn));
		}
		vhost_vdpa_unmap(v, start, size);
	}
unlock:
	mmap_read_unlock(dev->mm);
free:
	free_page((unsigned long)page_list);
	return ret;
}
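
/*
 * Validate an IOTLB update against the device's usable IOVA range and
 * reject ranges that overlap an existing mapping, then dispatch to the
 * VA or PA mapping path.
 */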
static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
					   struct vhost_iotlb_msg *msg)
{
	struct vhost_dev *dev = &v->vdev;
	struct vdpa_device *vdpa = v->vdpa;
	struct vhost_iotlb *iotlb = dev->iotlb;

	if (msg->iova < v->range.first || !msg->size ||
	    msg->iova > U64_MAX - msg->size + 1 ||
	    msg->iova + msg->size - 1 > v->range.last)
		return -EINVAL;

	if (vhost_iotlb_itree_first(iotlb, msg->iova,
				    msg->iova + msg->size - 1))
		return -EEXIST;

	if (vdpa->use_va)
		return vhost_vdpa_va_map(v, msg->iova, msg->size,
					 msg->uaddr, msg->perm);

	return vhost_vdpa_pa_map(v, msg->iova, msg->size, msg->uaddr,
				 msg->perm);
}

static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev,
					struct vhost_iotlb_msg *msg)
{
	struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev);
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	int r = 0;

	mutex_lock(&dev->mutex);

	r = vhost_dev_check_owner(dev);
	if (r)
		goto unlock;

	switch (msg->type) {
	case VHOST_IOTLB_UPDATE:
		r = vhost_vdpa_process_iotlb_update(v, msg);
		break;
	case VHOST_IOTLB_INVALIDATE:
		vhost_vdpa_unmap(v, msg->iova, msg->size);
		break;
	case VHOST_IOTLB_BATCH_BEGIN:
		v->in_batch = true;
		break;
	case VHOST_IOTLB_BATCH_END:
		if (v->in_batch && ops->set_map)
			ops->set_map(vdpa, dev->iotlb);
		v->in_batch = false;
		break;
	default:
		r = -EINVAL;
		break;
	}
unlock:
	mutex_unlock(&dev->mutex);

	return r;
}

static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
					 struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vhost_vdpa *v = file->private_data;
	struct vhost_dev *dev = &v->vdev;

	return vhost_chr_write_iter(dev, from);
}
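
/*
 * Allocate and attach a platform IOMMU domain, but only when the device
 * cannot translate DMA itself (no set_map()/dma_map() ops).
 */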
static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct device *dma_dev = vdpa_get_dma_dev(vdpa);
	struct bus_type *bus;
	int ret;

	/* Device wants to do DMA by itself */
	if (ops->set_map || ops->dma_map)
		return 0;

	bus = dma_dev->bus;
	if (!bus)
		return -EFAULT;

	if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
		return -ENOTSUPP;

	v->domain = iommu_domain_alloc(bus);
	if (!v->domain)
		return -EIO;

	ret = iommu_attach_device(v->domain, dma_dev);
	if (ret)
		goto err_attach;

	return 0;

err_attach:
	iommu_domain_free(v->domain);
	return ret;
}

static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
{
	struct vdpa_device *vdpa = v->vdpa;
	struct device *dma_dev = vdpa_get_dma_dev(vdpa);

	if (v->domain) {
		iommu_detach_device(v->domain, dma_dev);
		iommu_domain_free(v->domain);
	}

	v->domain = NULL;
}

static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
{
	struct vdpa_iova_range *range = &v->range;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;

	if (ops->get_iova_range) {
		*range = ops->get_iova_range(vdpa);
	} else if (v->domain && v->domain->geometry.force_aperture) {
		range->first = v->domain->geometry.aperture_start;
		range->last = v->domain->geometry.aperture_end;
	} else {
		range->first = 0;
		range->last = ULLONG_MAX;
	}
}
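
/*
 * Open is exclusive: only one userspace instance may drive the device at
 * a time, enforced with the atomic 'opened' flag.
 */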
static int vhost_vdpa_open(struct inode *inode, struct file *filep)
{
	struct vhost_vdpa *v;
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	int nvqs, i, r, opened;

	v = container_of(inode->i_cdev, struct vhost_vdpa, cdev);

	opened = atomic_cmpxchg(&v->opened, 0, 1);
	if (opened)
		return -EBUSY;

	nvqs = v->nvqs;
	r = vhost_vdpa_reset(v);
	if (r)
		goto err;

	vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		r = -ENOMEM;
		goto err;
	}

	dev = &v->vdev;
	for (i = 0; i < nvqs; i++) {
		vqs[i] = &v->vqs[i];
		vqs[i]->handle_kick = handle_vq_kick;
	}
	vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
		       vhost_vdpa_process_iotlb_msg);

	dev->iotlb = vhost_iotlb_alloc(0, 0);
	if (!dev->iotlb) {
		r = -ENOMEM;
		goto err_init_iotlb;
	}

	r = vhost_vdpa_alloc_domain(v);
	if (r)
		goto err_init_iotlb;

	vhost_vdpa_set_iova_range(v);

	filep->private_data = v;

	return 0;

err_init_iotlb:
	vhost_dev_cleanup(&v->vdev);
	kfree(vqs);
err:
	atomic_dec(&v->opened);
	return r;
}

static void vhost_vdpa_clean_irq(struct vhost_vdpa *v)
{
	int i;

	for (i = 0; i < v->nvqs; i++)
		vhost_vdpa_unsetup_vq_irq(v, i);
}

static int vhost_vdpa_release(struct inode *inode, struct file *filep)
{
	struct vhost_vdpa *v = filep->private_data;
	struct vhost_dev *d = &v->vdev;

	mutex_lock(&d->mutex);
	filep->private_data = NULL;
	vhost_vdpa_reset(v);
	vhost_dev_stop(&v->vdev);
	vhost_vdpa_iotlb_free(v);
	vhost_vdpa_free_domain(v);
	vhost_vdpa_config_put(v);
	vhost_vdpa_clean_irq(v);
	vhost_dev_cleanup(&v->vdev);
	kfree(v->vdev.vqs);
	mutex_unlock(&d->mutex);

	atomic_dec(&v->opened);
	complete(&v->completion);

	return 0;
}

#ifdef CONFIG_MMU
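/*
 * On first access, remap the doorbell page of the vq selected via
 * vm_pgoff directly into userspace so kicks become plain MMIO writes.
 */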
static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf)
{
	struct vhost_vdpa *v = vmf->vma->vm_file->private_data;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vdpa_notification_area notify;
	struct vm_area_struct *vma = vmf->vma;
	u16 index = vma->vm_pgoff;

	notify = ops->get_vq_notification(vdpa, index);

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	if (remap_pfn_range(vma, vmf->address & PAGE_MASK,
			    PFN_DOWN(notify.addr), PAGE_SIZE,
			    vma->vm_page_prot))
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

static const struct vm_operations_struct vhost_vdpa_vm_ops = {
	.fault = vhost_vdpa_fault,
};

static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct vhost_vdpa *v = vma->vm_file->private_data;
	struct vdpa_device *vdpa = v->vdpa;
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vdpa_notification_area notify;
	unsigned long index = vma->vm_pgoff;

	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (vma->vm_flags & VM_READ)
		return -EINVAL;
	if (index > 65535)
		return -EINVAL;
	if (!ops->get_vq_notification)
		return -ENOTSUPP;

	/* To be safe and easily modelled by userspace, we only
	 * support the doorbell which sits on the page boundary and
	 * does not share the page with other registers.
	 */
	notify = ops->get_vq_notification(vdpa, index);
	if (notify.addr & (PAGE_SIZE - 1))
		return -EINVAL;
	if (vma->vm_end - vma->vm_start != notify.size)
		return -ENOTSUPP;

	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &vhost_vdpa_vm_ops;
	return 0;
}
#endif /* CONFIG_MMU */

static const struct file_operations vhost_vdpa_fops = {
	.owner		= THIS_MODULE,
	.open		= vhost_vdpa_open,
	.release	= vhost_vdpa_release,
	.write_iter	= vhost_vdpa_chr_write_iter,
	.unlocked_ioctl	= vhost_vdpa_unlocked_ioctl,
#ifdef CONFIG_MMU
	.mmap		= vhost_vdpa_mmap,
#endif /* CONFIG_MMU */
	.compat_ioctl	= compat_ptr_ioctl,
};

static void vhost_vdpa_release_dev(struct device *device)
{
	struct vhost_vdpa *v =
	       container_of(device, struct vhost_vdpa, dev);

	ida_simple_remove(&vhost_vdpa_ida, v->minor);
	kfree(v->vqs);
	kfree(v);
}
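
/*
 * Bus probe: allocate the vhost_vdpa instance, grab a minor, and expose
 * a /dev/vhost-vdpa-<minor> character device for this vDPA device.
 */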
static int vhost_vdpa_probe(struct vdpa_device *vdpa)
{
	const struct vdpa_config_ops *ops = vdpa->config;
	struct vhost_vdpa *v;
	int minor;
	int r;

	v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!v)
		return -ENOMEM;

	minor = ida_simple_get(&vhost_vdpa_ida, 0,
			       VHOST_VDPA_DEV_MAX, GFP_KERNEL);
	if (minor < 0) {
		kfree(v);
		return minor;
	}

	atomic_set(&v->opened, 0);
	v->minor = minor;
	v->vdpa = vdpa;
	v->nvqs = vdpa->nvqs;
	v->virtio_id = ops->get_device_id(vdpa);

	device_initialize(&v->dev);
	v->dev.release = vhost_vdpa_release_dev;
	v->dev.parent = &vdpa->dev;
	v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor);
	v->vqs = kmalloc_array(v->nvqs, sizeof(struct vhost_virtqueue),
			       GFP_KERNEL);
	if (!v->vqs) {
		r = -ENOMEM;
		goto err;
	}

	r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
	if (r)
		goto err;

	cdev_init(&v->cdev, &vhost_vdpa_fops);
	v->cdev.owner = THIS_MODULE;

	r = cdev_device_add(&v->cdev, &v->dev);
	if (r)
		goto err;

	init_completion(&v->completion);
	vdpa_set_drvdata(vdpa, v);

	return 0;

err:
	put_device(&v->dev);
	return r;
}
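
/*
 * After the cdev is gone, claim the 'opened' slot ourselves; if an
 * existing user still holds it, wait for its release() to complete
 * before dropping the last device reference.
 */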
static void vhost_vdpa_remove(struct vdpa_device *vdpa)
{
	struct vhost_vdpa *v = vdpa_get_drvdata(vdpa);
	int opened;

	cdev_device_del(&v->cdev, &v->dev);

	do {
		opened = atomic_cmpxchg(&v->opened, 0, 1);
		if (!opened)
			break;
		wait_for_completion(&v->completion);
	} while (1);

	put_device(&v->dev);
}

static struct vdpa_driver vhost_vdpa_driver = {
	.driver = {
		.name	= "vhost_vdpa",
	},
	.probe	= vhost_vdpa_probe,
	.remove	= vhost_vdpa_remove,
};

static int __init vhost_vdpa_init(void)
{
	int r;

	r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX,
				"vhost-vdpa");
	if (r)
		goto err_alloc_chrdev;

	r = vdpa_register_driver(&vhost_vdpa_driver);
	if (r)
		goto err_vdpa_register_driver;

	return 0;

err_vdpa_register_driver:
	unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
err_alloc_chrdev:
	return r;
}
module_init(vhost_vdpa_init);

static void __exit vhost_vdpa_exit(void)
{
	vdpa_unregister_driver(&vhost_vdpa_driver);
	unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
}
module_exit(vhost_vdpa_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Intel Corporation");
MODULE_DESCRIPTION("vDPA-based vhost backend for virtio");