mirror of https://github.com/Qortal/Brooklyn
// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <[email protected]> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/gfp.h>
#include <rdma/ib_verbs.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/resource.h>

#include "siw.h"
#include "siw_mem.h"

/*
 * STag lookup is based on its index part only (24 bits).
 * The code avoids the special STag of zero and tries to randomize
 * STag values between 1 and SIW_STAG_MAX_INDEX.
 */
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
{
	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
	u32 id, next;

	get_random_bytes(&next, 4);
	next &= 0x00ffffff;

	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
			    GFP_KERNEL) < 0)
		return -ENOMEM;

	/* Set the STag index part */
	m->stag = id << 8;

	siw_dbg_mem(m, "new MEM object\n");

	return 0;
}
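
/*
 * Illustrative sketch (not part of the driver itself): the 24-bit
 * xarray id lands in bits 8..31 of the STag, leaving the low byte
 * free for an 8-bit consumer key. Lookups strip that byte before
 * indexing:
 *
 *	u32 stag = m->stag;	e.g. 0x00012300 for id 0x000123
 *	struct siw_mem *obj = siw_mem_id2obj(sdev, stag >> 8);
 *
 * siw_check_sge() below still compares the full 32-bit value against
 * sge->lkey, so a stale key byte is caught even though the index
 * alone resolves the object.
 */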

/*
 * siw_mem_id2obj()
 *
 * Resolves memory from an STag given by its index part. May be called
 * from:
 * o process context before sending out of an sgl, or
 * o softirq context when resolving target memory
 */
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
{
	struct siw_mem *mem;

	rcu_read_lock();
	mem = xa_load(&sdev->mem_xa, stag_index);
	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
		rcu_read_unlock();
		return mem;
	}
	rcu_read_unlock();

	return NULL;
}
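
/*
 * Usage sketch (illustrative only): a successful lookup returns with
 * the refcount elevated via kref_get_unless_zero(), so each call must
 * be paired with siw_mem_put() once the object is no longer needed:
 *
 *	struct siw_mem *obj = siw_mem_id2obj(sdev, stag >> 8);
 *
 *	if (obj) {
 *		... use obj; it cannot be freed underneath us ...
 *		siw_mem_put(obj);
 *	}
 */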

static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
			   bool dirty)
{
	unpin_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
}

void siw_umem_release(struct siw_umem *umem, bool dirty)
{
	struct mm_struct *mm_s = umem->owning_mm;
	int i, num_pages = umem->num_pages;

	for (i = 0; num_pages; i++) {
		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);

		siw_free_plist(&umem->page_chunk[i], to_free,
			       umem->writable && dirty);
		kfree(umem->page_chunk[i].plist);
		num_pages -= to_free;
	}
	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);

	mmdrop(mm_s);
	kfree(umem->page_chunk);
	kfree(umem);
}

int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
		   u64 start, u64 len, int rights)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
	u32 id, next;

	if (!mem)
		return -ENOMEM;

	mem->mem_obj = mem_obj;
	mem->stag_valid = 0;
	mem->sdev = sdev;
	mem->va = start;
	mem->len = len;
	mem->pd = pd;
	mem->perms = rights & IWARP_ACCESS_MASK;
	kref_init(&mem->ref);

	get_random_bytes(&next, 4);
	next &= 0x00ffffff;

	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
			    GFP_KERNEL) < 0) {
		kfree(mem);
		return -ENOMEM;
	}

	mr->mem = mem;
	/* Set the STag index part */
	mem->stag = id << 8;
	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;

	return 0;
}

void siw_mr_drop_mem(struct siw_mr *mr)
{
	struct siw_mem *mem = mr->mem, *found;

	mem->stag_valid = 0;

	/* make STag invalid visible asap */
	smp_mb();

	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
	WARN_ON(found != mem);
	siw_mem_put(mem);
}
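
/*
 * Lifecycle sketch (illustrative only): the initial reference is taken
 * by kref_init() in siw_mr_add_mem(). Dropping the MR first hides the
 * STag from new lookups, then drops that initial reference; the object
 * is freed once all transient lookup references are gone as well:
 *
 *	siw_mr_add_mem(mr, ...);	refcount 1, STag published
 *	siw_mem_id2obj(sdev, idx);	refcount 2 (data path lookup)
 *	siw_mr_drop_mem(mr);		STag unpublished, refcount 1
 *	siw_mem_put(mem);		refcount 0 -> siw_free_mem()
 */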

void siw_free_mem(struct kref *ref)
{
	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);

	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");

	if (!mem->is_mw && mem->mem_obj) {
		if (!mem->is_pbl)
			siw_umem_release(mem->umem, true);
		else
			kfree(mem->pbl);
	}
	kfree(mem);
}

/*
 * siw_check_mem()
 *
 * Check protection domain, STag state, access permissions and
 * address range for memory object.
 *
 * @pd: Protection Domain memory should belong to
 * @mem: memory to be checked
 * @addr: starting addr of mem
 * @perms: requested access permissions
 * @len: len of memory interval to be checked
 *
 */
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
		  enum ib_access_flags perms, int len)
{
	if (!mem->stag_valid) {
		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
		return -E_STAG_INVALID;
	}
	if (mem->pd != pd) {
		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
		return -E_PD_MISMATCH;
	}
	/*
	 * check access permissions
	 */
	if ((mem->perms & perms) < perms) {
		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
			   mem->perms, perms);
		return -E_ACCESS_PERM;
	}
	/*
	 * Check if access falls into valid memory interval.
	 */
	if (addr < mem->va || addr + len > mem->va + mem->len) {
		siw_dbg_pd(pd, "MEM interval len %d\n", len);
		siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
			   (void *)(uintptr_t)addr,
			   (void *)(uintptr_t)(addr + len));
		siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
			   (void *)(uintptr_t)mem->va,
			   (void *)(uintptr_t)(mem->va + mem->len),
			   mem->stag);

		return -E_BASE_BOUNDS;
	}
	return E_ACCESS_OK;
}
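
/*
 * Worked example (illustrative only): with mem->va = 0x1000 and
 * mem->len = 0x100, a request for addr = 0x10f0, len = 0x20 fails the
 * interval check since 0x10f0 + 0x20 = 0x1110 exceeds mem->va +
 * mem->len = 0x1100, yielding -E_BASE_BOUNDS. The same addr with
 * len = 0x10 ends exactly at 0x1100 and passes.
 */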

/*
 * siw_check_sge()
 *
 * Check SGE for access rights in given interval
 *
 * @pd: Protection Domain memory should belong to
 * @sge: SGE to be checked
 * @mem: location of memory reference within array
 * @perms: requested access permissions
 * @off: starting offset in SGE
 * @len: len of memory interval to be checked
 *
 * NOTE: Function references SGE's memory object (mem->obj)
 * if not yet done. New reference is kept if check went ok and
 * released if check failed. If mem->obj is already valid, no new
 * lookup is being done and mem is not released if the check fails.
 */
int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
		  enum ib_access_flags perms, u32 off, int len)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *new = NULL;
	int rv = E_ACCESS_OK;

	if (len + off > sge->length) {
		rv = -E_BASE_BOUNDS;
		goto fail;
	}
	if (*mem == NULL) {
		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
		if (unlikely(!new)) {
			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
			rv = -E_STAG_INVALID;
			goto fail;
		}
		*mem = new;
	}
	/* Check if user re-registered with different STag key */
	if (unlikely((*mem)->stag != sge->lkey)) {
		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
		rv = -E_STAG_INVALID;
		goto fail;
	}
	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
	if (unlikely(rv))
		goto fail;

	return 0;

fail:
	if (new) {
		*mem = NULL;
		siw_mem_put(new);
	}
	return rv;
}
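
/*
 * Usage sketch (illustrative only, names assumed): callers typically
 * pass a slot of the WQE's memory array so the STag is resolved once
 * and reused for later fragments of the same SGE; references taken
 * here are released in bulk by siw_unref_mem_sgl() from
 * siw_wqe_put_mem():
 *
 *	rv = siw_check_sge(pd, &sge[i], &wqe->mem[i], perms, off, len);
 *	if (rv < 0)
 *		goto error;	(a failed slot is already reset to NULL)
 */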

void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
{
	switch (op) {
	case SIW_OP_SEND:
	case SIW_OP_WRITE:
	case SIW_OP_SEND_WITH_IMM:
	case SIW_OP_SEND_REMOTE_INV:
	case SIW_OP_READ:
	case SIW_OP_READ_LOCAL_INV:
		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
		break;

	case SIW_OP_RECEIVE:
		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
		break;

	case SIW_OP_READ_RESPONSE:
		siw_unref_mem_sgl(wqe->mem, 1);
		break;

	default:
		/*
		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
		 * do not hold memory references
		 */
		break;
	}
}

int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
	int rv = 0;

	if (unlikely(!mem)) {
		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
		return -EINVAL;
	}
	if (unlikely(mem->pd != pd)) {
		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
		rv = -EACCES;
		goto out;
	}
	/*
	 * Per RDMA verbs definition, an STag may already be in invalid
	 * state if invalidation is requested. So no state check here.
	 */
	mem->stag_valid = 0;

	siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
out:
	siw_mem_put(mem);
	return rv;
}

/*
 * Gets physical address backed by PBL element. Address is referenced
 * by linear byte offset into list of variably sized PB elements.
 * Optionally, provides remaining len within current element, and
 * current PBL index for later resume at same element.
 */
dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
{
	int i = idx ? *idx : 0;

	while (i < pbl->num_buf) {
		struct siw_pble *pble = &pbl->pbe[i];

		if (pble->pbl_off + pble->size > off) {
			u64 pble_off = off - pble->pbl_off;

			if (len)
				*len = pble->size - pble_off;
			if (idx)
				*idx = i;

			return pble->addr + pble_off;
		}
		i++;
	}
	if (len)
		*len = 0;
	return 0;
}
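
/*
 * Worked example (illustrative only): take a PBL with two elements,
 * pbe[0] = { .addr = A, .size = 0x1000, .pbl_off = 0 } and
 * pbe[1] = { .addr = B, .size = 0x2000, .pbl_off = 0x1000 }. A query
 * at off = 0x1800 skips pbe[0] and resolves inside pbe[1]: the
 * function returns B + 0x800, sets *len = 0x1800 (bytes left in
 * pbe[1]) and *idx = 1, so the next lookup can resume at the same
 * element instead of rescanning from the start.
 */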

struct siw_pbl *siw_pbl_alloc(u32 num_buf)
{
	struct siw_pbl *pbl;

	if (num_buf == 0)
		return ERR_PTR(-EINVAL);

	pbl = kzalloc(struct_size(pbl, pbe, num_buf), GFP_KERNEL);
	if (!pbl)
		return ERR_PTR(-ENOMEM);

	pbl->max_buf = num_buf;

	return pbl;
}

struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
{
	struct siw_umem *umem;
	struct mm_struct *mm_s;
	u64 first_page_va;
	unsigned long mlock_limit;
	unsigned int foll_flags = FOLL_WRITE;
	int num_pages, num_chunks, i, rv = 0;

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	if (!len)
		return ERR_PTR(-EINVAL);

	first_page_va = start & PAGE_MASK;
	num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	mm_s = current->mm;
	umem->owning_mm = mm_s;
	umem->writable = writable;

	mmgrab(mm_s);

	if (!writable)
		foll_flags |= FOLL_FORCE;

	mmap_read_lock(mm_s);

	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	umem->fp_addr = first_page_va;

	umem->page_chunk =
		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
	if (!umem->page_chunk) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	for (i = 0; num_pages; i++) {
		int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);

		umem->page_chunk[i].plist =
			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
		if (!umem->page_chunk[i].plist) {
			rv = -ENOMEM;
			goto out_sem_up;
		}
		got = 0;
		while (nents) {
			struct page **plist = &umem->page_chunk[i].plist[got];

			rv = pin_user_pages(first_page_va, nents,
					    foll_flags | FOLL_LONGTERM,
					    plist, NULL);
			if (rv < 0)
				goto out_sem_up;

			umem->num_pages += rv;
			atomic64_add(rv, &mm_s->pinned_vm);
			first_page_va += rv * PAGE_SIZE;
			nents -= rv;
			got += rv;
		}
		num_pages -= got;
	}
out_sem_up:
	mmap_read_unlock(mm_s);

	if (rv > 0)
		return umem;

	siw_umem_release(umem, false);

	return ERR_PTR(rv);
}
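
/*
 * Sizing sketch (illustrative only, assuming 4 KiB pages and
 * CHUNK_SHIFT = 9, i.e. PAGES_PER_CHUNK = 512, as set in the driver
 * headers): registering 10 MiB pins 2560 pages, so num_chunks =
 * (2560 >> 9) + 1 = 6 chunk descriptors are allocated, of which the
 * pinning loop populates five plists of 512 page pointers each; the
 * extra descriptor covers lengths that are not chunk-aligned.
 */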