// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
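/*
 * Usage sketch (illustrative only): the device node path below is an
 * assumption and depends on how the character devices registered by this
 * module are created on a given system; struct hmm_dmirror_cmd and the
 * HMM_DMIRROR_* commands come from test_hmm_uapi.h. The snippet only shows
 * the ioctl flow that the command handlers in this file implement.
 *
 *	struct hmm_dmirror_cmd cmd = { 0 };
 *	unsigned char perm[1];
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);	// path is an assumption
 *
 *	cmd.addr = (uintptr_t)buffer;	// page-aligned address in this process
 *	cmd.ptr = (uintptr_t)perm;	// user buffer the driver reads/fills
 *	cmd.npages = 1;
 *	ioctl(fd, HMM_DMIRROR_SNAPSHOT, &cmd);	// or READ/WRITE/MIGRATE
 *	// on return the driver reports cmd.cpages and cmd.faults
 */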
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES 2
#define DMIRROR_RANGE_FAULT_TIMEOUT 1000
#define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE 16

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;

struct dmirror_bounce {
	void *ptr;
	unsigned long size;
	unsigned long addr;
	unsigned long cpages;
};

#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier notifier;
	struct dmirror *dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device *mdevice;
	struct xarray pt;
	struct mmu_interval_notifier notifier;
	struct mutex mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap pagemap;
	struct dmirror_device *mdevice;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev cdevice;
	struct hmm_devmem *devmem;

	unsigned int devmem_capacity;
	unsigned int devmem_count;
	struct dmirror_chunk **devmem_chunks;
	struct mutex devmem_lock;	/* protects the above */

	unsigned long calloc;
	unsigned long cfree;
	struct page *free_pages;
	spinlock_t lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}

static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->migrate_pgmap_owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};

static int dmirror_range_fault(struct dmirror *dmirror,
			       struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}

static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return false;

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
				      "hmm_dmirror");
	if (IS_ERR(res))
		goto err_devmem;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.range.start = res->start;
	devmem->pagemap.range.end = res->end;
	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_release;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return false;
}

static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	get_page(dpage);
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}

static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage)) |
			    MIGRATE_PFN_LOCKED;
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}

static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
							src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}

static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}

static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}

static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}

static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open = dmirror_fops_open,
	.release = dmirror_fops_release,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek = default_llseek,
	.owner = THIS_MODULE,
};

static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = page->zone_device_data;
	struct dmirror_device *mdevice;

	if (rpage)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);

	spin_lock(&mdevice->lock);
	mdevice->cfree++;
	page->zone_device_data = mdevice->free_pages;
	mdevice->free_pages = page;
	spin_unlock(&mdevice->lock);
}

static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;
		spage = spage->zone_device_data;

		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;

		lock_page(dpage);
		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}

static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args;
	unsigned long src_pfns;
	unsigned long dst_pfns;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.pgmap_owner = dmirror->mdevice;
	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
	return 0;
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free = dmirror_devmem_free,
	.migrate_to_ram = dmirror_devmem_fault,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	ret = cdev_add(&mdevice->cdevice, dev, 1);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE private struct pages */
	dmirror_allocate_chunk(mdevice, NULL);

	return 0;
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	unsigned int i;

	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			memunmap_pages(&devmem->pagemap);
			release_mem_region(devmem->pagemap.range.start,
					   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		kfree(mdevice->devmem_chunks);
	}

	cdev_del(&mdevice->cdevice);
}

static int __init hmm_dmirror_init(void)
{
	int ret;
	int id;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	for (id = 0; id < DMIRROR_NDEVICES; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	for (id = 0; id < DMIRROR_NDEVICES; id++)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_LICENSE("GPL");