forked from Qortal/Brooklyn
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
532 lines
11 KiB
532 lines
11 KiB
/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ |
|
/* |
|
* aoedev.c |
|
* AoE device utility functions; maintains device list. |
|
*/ |
|
|
|
#include <linux/hdreg.h> |
|
#include <linux/blk-mq.h> |
|
#include <linux/netdevice.h> |
|
#include <linux/delay.h> |
|
#include <linux/slab.h> |
|
#include <linux/bitmap.h> |
|
#include <linux/kdev_t.h> |
|
#include <linux/moduleparam.h> |
|
#include <linux/string.h> |
|
#include "aoe.h" |
|
|
|
static void freetgt(struct aoedev *d, struct aoetgt *t); |
|
static void skbpoolfree(struct aoedev *d); |
|
|
|
static int aoe_dyndevs = 1; |
|
module_param(aoe_dyndevs, int, 0644); |
|
MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices."); |
|
|
|
static struct aoedev *devlist; |
|
static DEFINE_SPINLOCK(devlist_lock); |
|
|
|
/* Because some systems will have one, many, or no |
|
* - partitions, |
|
* - slots per shelf, |
|
* - or shelves, |
|
* we need some flexibility in the way the minor numbers |
|
* are allocated. So they are dynamic. |
|
*/ |
|
#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS) |
|
|
|
static DEFINE_SPINLOCK(used_minors_lock); |
|
static DECLARE_BITMAP(used_minors, N_DEVS); |
|
|
|
static int |
|
minor_get_dyn(ulong *sysminor) |
|
{ |
|
ulong flags; |
|
ulong n; |
|
int error = 0; |
|
|
|
spin_lock_irqsave(&used_minors_lock, flags); |
|
n = find_first_zero_bit(used_minors, N_DEVS); |
|
if (n < N_DEVS) |
|
set_bit(n, used_minors); |
|
else |
|
error = -1; |
|
spin_unlock_irqrestore(&used_minors_lock, flags); |
|
|
|
*sysminor = n * AOE_PARTITIONS; |
|
return error; |
|
} |
|
|
|
static int |
|
minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin) |
|
{ |
|
ulong flags; |
|
ulong n; |
|
int error = 0; |
|
enum { |
|
/* for backwards compatibility when !aoe_dyndevs, |
|
* a static number of supported slots per shelf */ |
|
NPERSHELF = 16, |
|
}; |
|
|
|
if (aoemin >= NPERSHELF) { |
|
pr_err("aoe: %s %d slots per shelf\n", |
|
"static minor device numbers support only", |
|
NPERSHELF); |
|
error = -1; |
|
goto out; |
|
} |
|
|
|
n = aoemaj * NPERSHELF + aoemin; |
|
if (n >= N_DEVS) { |
|
pr_err("aoe: %s with e%ld.%d\n", |
|
"cannot use static minor device numbers", |
|
aoemaj, aoemin); |
|
error = -1; |
|
goto out; |
|
} |
|
|
|
spin_lock_irqsave(&used_minors_lock, flags); |
|
if (test_bit(n, used_minors)) { |
|
pr_err("aoe: %s %lu\n", |
|
"existing device already has static minor number", |
|
n); |
|
error = -1; |
|
} else |
|
set_bit(n, used_minors); |
|
spin_unlock_irqrestore(&used_minors_lock, flags); |
|
*sysminor = n * AOE_PARTITIONS; |
|
out: |
|
return error; |
|
} |
|
|
|
static int |
|
minor_get(ulong *sysminor, ulong aoemaj, int aoemin) |
|
{ |
|
if (aoe_dyndevs) |
|
return minor_get_dyn(sysminor); |
|
else |
|
return minor_get_static(sysminor, aoemaj, aoemin); |
|
} |
|
|
|
static void |
|
minor_free(ulong minor) |
|
{ |
|
ulong flags; |
|
|
|
minor /= AOE_PARTITIONS; |
|
BUG_ON(minor >= N_DEVS); |
|
|
|
spin_lock_irqsave(&used_minors_lock, flags); |
|
BUG_ON(!test_bit(minor, used_minors)); |
|
clear_bit(minor, used_minors); |
|
spin_unlock_irqrestore(&used_minors_lock, flags); |
|
} |
|
|
|
/* |
|
* Users who grab a pointer to the device with aoedev_by_aoeaddr |
|
* automatically get a reference count and must be responsible |
|
* for performing a aoedev_put. With the addition of async |
|
* kthread processing I'm no longer confident that we can |
|
* guarantee consistency in the face of device flushes. |
|
* |
|
* For the time being, we only bother to add extra references for |
|
* frames sitting on the iocq. When the kthreads finish processing |
|
* these frames, they will aoedev_put the device. |
|
*/ |
|
|
|
void |
|
aoedev_put(struct aoedev *d) |
|
{ |
|
ulong flags; |
|
|
|
spin_lock_irqsave(&devlist_lock, flags); |
|
d->ref--; |
|
spin_unlock_irqrestore(&devlist_lock, flags); |
|
} |
|
|
|
static void |
|
dummy_timer(struct timer_list *t) |
|
{ |
|
struct aoedev *d; |
|
|
|
d = from_timer(d, t, timer); |
|
if (d->flags & DEVFL_TKILL) |
|
return; |
|
d->timer.expires = jiffies + HZ; |
|
add_timer(&d->timer); |
|
} |
|
|
|
static void |
|
aoe_failip(struct aoedev *d) |
|
{ |
|
struct request *rq; |
|
struct aoe_req *req; |
|
struct bio *bio; |
|
|
|
aoe_failbuf(d, d->ip.buf); |
|
rq = d->ip.rq; |
|
if (rq == NULL) |
|
return; |
|
|
|
req = blk_mq_rq_to_pdu(rq); |
|
while ((bio = d->ip.nxbio)) { |
|
bio->bi_status = BLK_STS_IOERR; |
|
d->ip.nxbio = bio->bi_next; |
|
req->nr_bios--; |
|
} |
|
|
|
if (!req->nr_bios) |
|
aoe_end_request(d, rq, 0); |
|
} |
|
|
|
static void |
|
downdev_frame(struct list_head *pos) |
|
{ |
|
struct frame *f; |
|
|
|
f = list_entry(pos, struct frame, head); |
|
list_del(pos); |
|
if (f->buf) { |
|
f->buf->nframesout--; |
|
aoe_failbuf(f->t->d, f->buf); |
|
} |
|
aoe_freetframe(f); |
|
} |
|
|
|
void |
|
aoedev_downdev(struct aoedev *d) |
|
{ |
|
struct aoetgt *t, **tt, **te; |
|
struct list_head *head, *pos, *nx; |
|
int i; |
|
|
|
d->flags &= ~DEVFL_UP; |
|
|
|
/* clean out active and to-be-retransmitted buffers */ |
|
for (i = 0; i < NFACTIVE; i++) { |
|
head = &d->factive[i]; |
|
list_for_each_safe(pos, nx, head) |
|
downdev_frame(pos); |
|
} |
|
head = &d->rexmitq; |
|
list_for_each_safe(pos, nx, head) |
|
downdev_frame(pos); |
|
|
|
/* reset window dressings */ |
|
tt = d->targets; |
|
te = tt + d->ntargets; |
|
for (; tt < te && (t = *tt); tt++) { |
|
aoecmd_wreset(t); |
|
t->nout = 0; |
|
} |
|
|
|
/* clean out the in-process request (if any) */ |
|
aoe_failip(d); |
|
|
|
/* fast fail all pending I/O */ |
|
if (d->blkq) { |
|
/* UP is cleared, freeze+quiesce to insure all are errored */ |
|
blk_mq_freeze_queue(d->blkq); |
|
blk_mq_quiesce_queue(d->blkq); |
|
blk_mq_unquiesce_queue(d->blkq); |
|
blk_mq_unfreeze_queue(d->blkq); |
|
} |
|
|
|
if (d->gd) |
|
set_capacity(d->gd, 0); |
|
} |
|
|
|
/* return whether the user asked for this particular |
|
* device to be flushed |
|
*/ |
|
static int |
|
user_req(char *s, size_t slen, struct aoedev *d) |
|
{ |
|
const char *p; |
|
size_t lim; |
|
|
|
if (!d->gd) |
|
return 0; |
|
p = kbasename(d->gd->disk_name); |
|
lim = sizeof(d->gd->disk_name); |
|
lim -= p - d->gd->disk_name; |
|
if (slen < lim) |
|
lim = slen; |
|
|
|
return !strncmp(s, p, lim); |
|
} |
|
|
|
static void |
|
freedev(struct aoedev *d) |
|
{ |
|
struct aoetgt **t, **e; |
|
int freeing = 0; |
|
unsigned long flags; |
|
|
|
spin_lock_irqsave(&d->lock, flags); |
|
if (d->flags & DEVFL_TKILL |
|
&& !(d->flags & DEVFL_FREEING)) { |
|
d->flags |= DEVFL_FREEING; |
|
freeing = 1; |
|
} |
|
spin_unlock_irqrestore(&d->lock, flags); |
|
if (!freeing) |
|
return; |
|
|
|
del_timer_sync(&d->timer); |
|
if (d->gd) { |
|
aoedisk_rm_debugfs(d); |
|
del_gendisk(d->gd); |
|
put_disk(d->gd); |
|
blk_mq_free_tag_set(&d->tag_set); |
|
blk_cleanup_queue(d->blkq); |
|
} |
|
t = d->targets; |
|
e = t + d->ntargets; |
|
for (; t < e && *t; t++) |
|
freetgt(d, *t); |
|
|
|
mempool_destroy(d->bufpool); |
|
skbpoolfree(d); |
|
minor_free(d->sysminor); |
|
|
|
spin_lock_irqsave(&d->lock, flags); |
|
d->flags |= DEVFL_FREED; |
|
spin_unlock_irqrestore(&d->lock, flags); |
|
} |
|
|
|
enum flush_parms { |
|
NOT_EXITING = 0, |
|
EXITING = 1, |
|
}; |
|
|
|
static int |
|
flush(const char __user *str, size_t cnt, int exiting) |
|
{ |
|
ulong flags; |
|
struct aoedev *d, **dd; |
|
char buf[16]; |
|
int all = 0; |
|
int specified = 0; /* flush a specific device */ |
|
unsigned int skipflags; |
|
|
|
skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL; |
|
|
|
if (!exiting && cnt >= 3) { |
|
if (cnt > sizeof buf) |
|
cnt = sizeof buf; |
|
if (copy_from_user(buf, str, cnt)) |
|
return -EFAULT; |
|
all = !strncmp(buf, "all", 3); |
|
if (!all) |
|
specified = 1; |
|
} |
|
|
|
flush_scheduled_work(); |
|
/* pass one: do aoedev_downdev, which might sleep */ |
|
restart1: |
|
spin_lock_irqsave(&devlist_lock, flags); |
|
for (d = devlist; d; d = d->next) { |
|
spin_lock(&d->lock); |
|
if (d->flags & DEVFL_TKILL) |
|
goto cont; |
|
|
|
if (exiting) { |
|
/* unconditionally take each device down */ |
|
} else if (specified) { |
|
if (!user_req(buf, cnt, d)) |
|
goto cont; |
|
} else if ((!all && (d->flags & DEVFL_UP)) |
|
|| d->flags & skipflags |
|
|| d->nopen |
|
|| d->ref) |
|
goto cont; |
|
|
|
spin_unlock(&d->lock); |
|
spin_unlock_irqrestore(&devlist_lock, flags); |
|
aoedev_downdev(d); |
|
d->flags |= DEVFL_TKILL; |
|
goto restart1; |
|
cont: |
|
spin_unlock(&d->lock); |
|
} |
|
spin_unlock_irqrestore(&devlist_lock, flags); |
|
|
|
/* pass two: call freedev, which might sleep, |
|
* for aoedevs marked with DEVFL_TKILL |
|
*/ |
|
restart2: |
|
spin_lock_irqsave(&devlist_lock, flags); |
|
for (d = devlist; d; d = d->next) { |
|
spin_lock(&d->lock); |
|
if (d->flags & DEVFL_TKILL |
|
&& !(d->flags & DEVFL_FREEING)) { |
|
spin_unlock(&d->lock); |
|
spin_unlock_irqrestore(&devlist_lock, flags); |
|
freedev(d); |
|
goto restart2; |
|
} |
|
spin_unlock(&d->lock); |
|
} |
|
|
|
/* pass three: remove aoedevs marked with DEVFL_FREED */ |
|
for (dd = &devlist, d = *dd; d; d = *dd) { |
|
struct aoedev *doomed = NULL; |
|
|
|
spin_lock(&d->lock); |
|
if (d->flags & DEVFL_FREED) { |
|
*dd = d->next; |
|
doomed = d; |
|
} else { |
|
dd = &d->next; |
|
} |
|
spin_unlock(&d->lock); |
|
if (doomed) |
|
kfree(doomed->targets); |
|
kfree(doomed); |
|
} |
|
spin_unlock_irqrestore(&devlist_lock, flags); |
|
|
|
return 0; |
|
} |
|
|
|
int |
|
aoedev_flush(const char __user *str, size_t cnt) |
|
{ |
|
return flush(str, cnt, NOT_EXITING); |
|
} |
|
|
|
/* This has been confirmed to occur once with Tms=3*1000 due to the |
|
* driver changing link and not processing its transmit ring. The |
|
* problem is hard enough to solve by returning an error that I'm |
|
* still punting on "solving" this. |
|
*/ |
|
static void |
|
skbfree(struct sk_buff *skb) |
|
{ |
|
enum { Sms = 250, Tms = 30 * 1000}; |
|
int i = Tms / Sms; |
|
|
|
if (skb == NULL) |
|
return; |
|
while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0) |
|
msleep(Sms); |
|
if (i < 0) { |
|
printk(KERN_ERR |
|
"aoe: %s holds ref: %s\n", |
|
skb->dev ? skb->dev->name : "netif", |
|
"cannot free skb -- memory leaked."); |
|
return; |
|
} |
|
skb->truesize -= skb->data_len; |
|
skb_shinfo(skb)->nr_frags = skb->data_len = 0; |
|
skb_trim(skb, 0); |
|
dev_kfree_skb(skb); |
|
} |
|
|
|
static void |
|
skbpoolfree(struct aoedev *d) |
|
{ |
|
struct sk_buff *skb, *tmp; |
|
|
|
skb_queue_walk_safe(&d->skbpool, skb, tmp) |
|
skbfree(skb); |
|
|
|
__skb_queue_head_init(&d->skbpool); |
|
} |
|
|
|
/* find it or allocate it */ |
|
struct aoedev * |
|
aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) |
|
{ |
|
struct aoedev *d; |
|
int i; |
|
ulong flags; |
|
ulong sysminor = 0; |
|
|
|
spin_lock_irqsave(&devlist_lock, flags); |
|
|
|
for (d=devlist; d; d=d->next) |
|
if (d->aoemajor == maj && d->aoeminor == min) { |
|
spin_lock(&d->lock); |
|
if (d->flags & DEVFL_TKILL) { |
|
spin_unlock(&d->lock); |
|
d = NULL; |
|
goto out; |
|
} |
|
d->ref++; |
|
spin_unlock(&d->lock); |
|
break; |
|
} |
|
if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0) |
|
goto out; |
|
d = kcalloc(1, sizeof *d, GFP_ATOMIC); |
|
if (!d) |
|
goto out; |
|
d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC); |
|
if (!d->targets) { |
|
kfree(d); |
|
d = NULL; |
|
goto out; |
|
} |
|
d->ntargets = NTARGETS; |
|
INIT_WORK(&d->work, aoecmd_sleepwork); |
|
spin_lock_init(&d->lock); |
|
INIT_LIST_HEAD(&d->rq_list); |
|
skb_queue_head_init(&d->skbpool); |
|
timer_setup(&d->timer, dummy_timer, 0); |
|
d->timer.expires = jiffies + HZ; |
|
add_timer(&d->timer); |
|
d->bufpool = NULL; /* defer to aoeblk_gdalloc */ |
|
d->tgt = d->targets; |
|
d->ref = 1; |
|
for (i = 0; i < NFACTIVE; i++) |
|
INIT_LIST_HEAD(&d->factive[i]); |
|
INIT_LIST_HEAD(&d->rexmitq); |
|
d->sysminor = sysminor; |
|
d->aoemajor = maj; |
|
d->aoeminor = min; |
|
d->rttavg = RTTAVG_INIT; |
|
d->rttdev = RTTDEV_INIT; |
|
d->next = devlist; |
|
devlist = d; |
|
out: |
|
spin_unlock_irqrestore(&devlist_lock, flags); |
|
return d; |
|
} |
|
|
|
static void |
|
freetgt(struct aoedev *d, struct aoetgt *t) |
|
{ |
|
struct frame *f; |
|
struct list_head *pos, *nx, *head; |
|
struct aoeif *ifp; |
|
|
|
for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) { |
|
if (!ifp->nd) |
|
break; |
|
dev_put(ifp->nd); |
|
} |
|
|
|
head = &t->ffree; |
|
list_for_each_safe(pos, nx, head) { |
|
list_del(pos); |
|
f = list_entry(pos, struct frame, head); |
|
skbfree(f->skb); |
|
kfree(f); |
|
} |
|
kfree(t); |
|
} |
|
|
|
void |
|
aoedev_exit(void) |
|
{ |
|
flush_scheduled_work(); |
|
flush(NULL, 0, EXITING); |
|
} |
|
|
|
int __init |
|
aoedev_init(void) |
|
{ |
|
return 0; |
|
}
|
|
|