|
@@ -1,4 +1,4 @@
|
|
|
-/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
|
|
|
+/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
|
|
|
/*
|
|
|
* aoedev.c
|
|
|
* AoE device utility functions; maintains device list.
|
|
@@ -9,6 +9,9 @@
|
|
|
#include <linux/netdevice.h>
|
|
|
#include <linux/delay.h>
|
|
|
#include <linux/slab.h>
|
|
|
+#include <linux/bitmap.h>
|
|
|
+#include <linux/kdev_t.h>
|
|
|
+#include <linux/moduleparam.h>
|
|
|
#include "aoe.h"
|
|
|
|
|
|
static void dummy_timer(ulong);
|
|
@@ -16,23 +19,121 @@ static void aoedev_freedev(struct aoedev *);
|
|
|
static void freetgt(struct aoedev *d, struct aoetgt *t);
|
|
|
static void skbpoolfree(struct aoedev *d);
|
|
|
|
|
|
+static int aoe_dyndevs = 1;
|
|
|
+module_param(aoe_dyndevs, int, 0644);
|
|
|
+MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");
|
|
|
+
|
|
|
static struct aoedev *devlist;
|
|
|
static DEFINE_SPINLOCK(devlist_lock);
|
|
|
|
|
|
-struct aoedev *
|
|
|
-aoedev_by_aoeaddr(int maj, int min)
|
|
|
+/* Because some systems will have one, many, or no
|
|
|
+ * - partitions,
|
|
|
+ * - slots per shelf,
|
|
|
+ * - or shelves,
|
|
|
+ * we need some flexibility in the way the minor numbers
|
|
|
+ * are allocated. So they are dynamic.
|
|
|
+ */
|
|
|
+#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
|
|
|
+
|
|
|
+static DEFINE_SPINLOCK(used_minors_lock);
|
|
|
+static DECLARE_BITMAP(used_minors, N_DEVS);
|
|
|
+
|
|
|
+static int
|
|
|
+minor_get_dyn(ulong *sysminor)
|
|
|
{
|
|
|
- struct aoedev *d;
|
|
|
ulong flags;
|
|
|
+ ulong n;
|
|
|
+ int error = 0;
|
|
|
+
|
|
|
+ spin_lock_irqsave(&used_minors_lock, flags);
|
|
|
+ n = find_first_zero_bit(used_minors, N_DEVS);
|
|
|
+ if (n < N_DEVS)
|
|
|
+ set_bit(n, used_minors);
|
|
|
+ else
|
|
|
+ error = -1;
|
|
|
+ spin_unlock_irqrestore(&used_minors_lock, flags);
|
|
|
+
|
|
|
+ *sysminor = n * AOE_PARTITIONS;
|
|
|
+ return error;
|
|
|
+}
|
|
|
|
|
|
- spin_lock_irqsave(&devlist_lock, flags);
|
|
|
+static int
|
|
|
+minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
|
|
|
+{
|
|
|
+ ulong flags;
|
|
|
+ ulong n;
|
|
|
+ int error = 0;
|
|
|
+ enum {
|
|
|
+ /* for backwards compatibility when !aoe_dyndevs,
|
|
|
+ * a static number of supported slots per shelf */
|
|
|
+ NPERSHELF = 16,
|
|
|
+ };
|
|
|
+
|
|
|
+ n = aoemaj * NPERSHELF + aoemin;
|
|
|
+ if (aoemin >= NPERSHELF || n >= N_DEVS) {
|
|
|
+ pr_err("aoe: %s with e%ld.%d\n",
|
|
|
+ "cannot use static minor device numbers",
|
|
|
+ aoemaj, aoemin);
|
|
|
+ error = -1;
|
|
|
+ } else {
|
|
|
+ spin_lock_irqsave(&used_minors_lock, flags);
|
|
|
+ if (test_bit(n, used_minors)) {
|
|
|
+ pr_err("aoe: %s %lu\n",
|
|
|
+ "existing device already has static minor number",
|
|
|
+ n);
|
|
|
+ error = -1;
|
|
|
+ } else
|
|
|
+ set_bit(n, used_minors);
|
|
|
+ spin_unlock_irqrestore(&used_minors_lock, flags);
|
|
|
+ }
|
|
|
|
|
|
- for (d=devlist; d; d=d->next)
|
|
|
- if (d->aoemajor == maj && d->aoeminor == min)
|
|
|
- break;
|
|
|
+ *sysminor = n;
|
|
|
+ return error;
|
|
|
+}
|
|
|
+
|
|
|
+static int
|
|
|
+minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
|
|
|
+{
|
|
|
+ if (aoe_dyndevs)
|
|
|
+ return minor_get_dyn(sysminor);
|
|
|
+ else
|
|
|
+ return minor_get_static(sysminor, aoemaj, aoemin);
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+minor_free(ulong minor)
|
|
|
+{
|
|
|
+ ulong flags;
|
|
|
+
|
|
|
+ minor /= AOE_PARTITIONS;
|
|
|
+ BUG_ON(minor >= N_DEVS);
|
|
|
+
|
|
|
+ spin_lock_irqsave(&used_minors_lock, flags);
|
|
|
+ BUG_ON(!test_bit(minor, used_minors));
|
|
|
+ clear_bit(minor, used_minors);
|
|
|
+ spin_unlock_irqrestore(&used_minors_lock, flags);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr
|
|
|
+ * automatically get a reference count and must be responsible
|
|
|
+ * for performing a aoedev_put. With the addition of async
|
|
|
+ * kthread processing I'm no longer confident that we can
|
|
|
+ * guarantee consistency in the face of device flushes.
|
|
|
+ *
|
|
|
+ * For the time being, we only bother to add extra references for
|
|
|
+ * frames sitting on the iocq. When the kthreads finish processing
|
|
|
+ * these frames, they will aoedev_put the device.
|
|
|
+ */
|
|
|
+
|
|
|
+void
|
|
|
+aoedev_put(struct aoedev *d)
|
|
|
+{
|
|
|
+ ulong flags;
|
|
|
|
|
|
+ spin_lock_irqsave(&devlist_lock, flags);
|
|
|
+ d->ref--;
|
|
|
spin_unlock_irqrestore(&devlist_lock, flags);
|
|
|
- return d;
|
|
|
}
|
|
|
|
|
|
static void
|
|
@@ -47,54 +148,74 @@ dummy_timer(ulong vp)
|
|
|
add_timer(&d->timer);
|
|
|
}
|
|
|
|
|
|
+static void
|
|
|
+aoe_failip(struct aoedev *d)
|
|
|
+{
|
|
|
+ struct request *rq;
|
|
|
+ struct bio *bio;
|
|
|
+ unsigned long n;
|
|
|
+
|
|
|
+ aoe_failbuf(d, d->ip.buf);
|
|
|
+
|
|
|
+ rq = d->ip.rq;
|
|
|
+ if (rq == NULL)
|
|
|
+ return;
|
|
|
+ while ((bio = d->ip.nxbio)) {
|
|
|
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
|
|
|
+ d->ip.nxbio = bio->bi_next;
|
|
|
+ n = (unsigned long) rq->special;
|
|
|
+ rq->special = (void *) --n;
|
|
|
+ }
|
|
|
+ if ((unsigned long) rq->special == 0)
|
|
|
+ aoe_end_request(d, rq, 0);
|
|
|
+}
|
|
|
+
|
|
|
void
|
|
|
aoedev_downdev(struct aoedev *d)
|
|
|
{
|
|
|
- struct aoetgt **t, **te;
|
|
|
- struct frame *f, *e;
|
|
|
- struct buf *buf;
|
|
|
- struct bio *bio;
|
|
|
+ struct aoetgt *t, **tt, **te;
|
|
|
+ struct frame *f;
|
|
|
+ struct list_head *head, *pos, *nx;
|
|
|
+ struct request *rq;
|
|
|
+ int i;
|
|
|
|
|
|
- t = d->targets;
|
|
|
- te = t + NTARGETS;
|
|
|
- for (; t < te && *t; t++) {
|
|
|
- f = (*t)->frames;
|
|
|
- e = f + (*t)->nframes;
|
|
|
- for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) {
|
|
|
- if (f->tag == FREETAG || f->buf == NULL)
|
|
|
- continue;
|
|
|
- buf = f->buf;
|
|
|
- bio = buf->bio;
|
|
|
- if (--buf->nframesout == 0
|
|
|
- && buf != d->inprocess) {
|
|
|
- mempool_free(buf, d->bufpool);
|
|
|
- bio_endio(bio, -EIO);
|
|
|
+ d->flags &= ~DEVFL_UP;
|
|
|
+
|
|
|
+ /* clean out active buffers */
|
|
|
+ for (i = 0; i < NFACTIVE; i++) {
|
|
|
+ head = &d->factive[i];
|
|
|
+ list_for_each_safe(pos, nx, head) {
|
|
|
+ f = list_entry(pos, struct frame, head);
|
|
|
+ list_del(pos);
|
|
|
+ if (f->buf) {
|
|
|
+ f->buf->nframesout--;
|
|
|
+ aoe_failbuf(d, f->buf);
|
|
|
}
|
|
|
+ aoe_freetframe(f);
|
|
|
}
|
|
|
- (*t)->maxout = (*t)->nframes;
|
|
|
- (*t)->nout = 0;
|
|
|
}
|
|
|
- buf = d->inprocess;
|
|
|
- if (buf) {
|
|
|
- bio = buf->bio;
|
|
|
- mempool_free(buf, d->bufpool);
|
|
|
- bio_endio(bio, -EIO);
|
|
|
+ /* reset window dressings */
|
|
|
+ tt = d->targets;
|
|
|
+ te = tt + NTARGETS;
|
|
|
+ for (; tt < te && (t = *tt); tt++) {
|
|
|
+ t->maxout = t->nframes;
|
|
|
+ t->nout = 0;
|
|
|
}
|
|
|
- d->inprocess = NULL;
|
|
|
+
|
|
|
+ /* clean out the in-process request (if any) */
|
|
|
+ aoe_failip(d);
|
|
|
d->htgt = NULL;
|
|
|
|
|
|
- while (!list_empty(&d->bufq)) {
|
|
|
- buf = container_of(d->bufq.next, struct buf, bufs);
|
|
|
- list_del(d->bufq.next);
|
|
|
- bio = buf->bio;
|
|
|
- mempool_free(buf, d->bufpool);
|
|
|
- bio_endio(bio, -EIO);
|
|
|
+ /* fast fail all pending I/O */
|
|
|
+ if (d->blkq) {
|
|
|
+ while ((rq = blk_peek_request(d->blkq))) {
|
|
|
+ blk_start_request(rq);
|
|
|
+ aoe_end_request(d, rq, 1);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
if (d->gd)
|
|
|
set_capacity(d->gd, 0);
|
|
|
-
|
|
|
- d->flags &= ~DEVFL_UP;
|
|
|
}
|
|
|
|
|
|
static void
|
|
@@ -107,6 +228,7 @@ aoedev_freedev(struct aoedev *d)
|
|
|
aoedisk_rm_sysfs(d);
|
|
|
del_gendisk(d->gd);
|
|
|
put_disk(d->gd);
|
|
|
+ blk_cleanup_queue(d->blkq);
|
|
|
}
|
|
|
t = d->targets;
|
|
|
e = t + NTARGETS;
|
|
@@ -115,7 +237,7 @@ aoedev_freedev(struct aoedev *d)
|
|
|
if (d->bufpool)
|
|
|
mempool_destroy(d->bufpool);
|
|
|
skbpoolfree(d);
|
|
|
- blk_cleanup_queue(d->blkq);
|
|
|
+ minor_free(d->sysminor);
|
|
|
kfree(d);
|
|
|
}
|
|
|
|
|
@@ -142,7 +264,8 @@ aoedev_flush(const char __user *str, size_t cnt)
|
|
|
spin_lock(&d->lock);
|
|
|
if ((!all && (d->flags & DEVFL_UP))
|
|
|
|| (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
|
|
|
- || d->nopen) {
|
|
|
+ || d->nopen
|
|
|
+ || d->ref) {
|
|
|
spin_unlock(&d->lock);
|
|
|
dd = &d->next;
|
|
|
continue;
|
|
@@ -163,12 +286,15 @@ aoedev_flush(const char __user *str, size_t cnt)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-/* I'm not really sure that this is a realistic problem, but if the
|
|
|
-network driver goes gonzo let's just leak memory after complaining. */
|
|
|
+/* This has been confirmed to occur once with Tms=3*1000 due to the
|
|
|
+ * driver changing link and not processing its transmit ring. The
|
|
|
+ * problem is hard enough to solve by returning an error that I'm
|
|
|
+ * still punting on "solving" this.
|
|
|
+ */
|
|
|
static void
|
|
|
skbfree(struct sk_buff *skb)
|
|
|
{
|
|
|
- enum { Sms = 100, Tms = 3*1000};
|
|
|
+ enum { Sms = 250, Tms = 30 * 1000};
|
|
|
int i = Tms / Sms;
|
|
|
|
|
|
if (skb == NULL)
|
|
@@ -182,6 +308,7 @@ skbfree(struct sk_buff *skb)
|
|
|
"cannot free skb -- memory leaked.");
|
|
|
return;
|
|
|
}
|
|
|
+ skb->truesize -= skb->data_len;
|
|
|
skb_shinfo(skb)->nr_frags = skb->data_len = 0;
|
|
|
skb_trim(skb, 0);
|
|
|
dev_kfree_skb(skb);
|
|
@@ -198,26 +325,29 @@ skbpoolfree(struct aoedev *d)
|
|
|
__skb_queue_head_init(&d->skbpool);
|
|
|
}
|
|
|
|
|
|
-/* find it or malloc it */
|
|
|
+/* find it or allocate it */
|
|
|
struct aoedev *
|
|
|
-aoedev_by_sysminor_m(ulong sysminor)
|
|
|
+aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
|
|
|
{
|
|
|
struct aoedev *d;
|
|
|
+ int i;
|
|
|
ulong flags;
|
|
|
+ ulong sysminor;
|
|
|
|
|
|
spin_lock_irqsave(&devlist_lock, flags);
|
|
|
|
|
|
for (d=devlist; d; d=d->next)
|
|
|
- if (d->sysminor == sysminor)
|
|
|
+ if (d->aoemajor == maj && d->aoeminor == min) {
|
|
|
+ d->ref++;
|
|
|
break;
|
|
|
- if (d)
|
|
|
+ }
|
|
|
+ if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
|
|
|
goto out;
|
|
|
d = kcalloc(1, sizeof *d, GFP_ATOMIC);
|
|
|
if (!d)
|
|
|
goto out;
|
|
|
INIT_WORK(&d->work, aoecmd_sleepwork);
|
|
|
spin_lock_init(&d->lock);
|
|
|
- skb_queue_head_init(&d->sendq);
|
|
|
skb_queue_head_init(&d->skbpool);
|
|
|
init_timer(&d->timer);
|
|
|
d->timer.data = (ulong) d;
|
|
@@ -226,10 +356,12 @@ aoedev_by_sysminor_m(ulong sysminor)
|
|
|
add_timer(&d->timer);
|
|
|
d->bufpool = NULL; /* defer to aoeblk_gdalloc */
|
|
|
d->tgt = d->targets;
|
|
|
- INIT_LIST_HEAD(&d->bufq);
|
|
|
+ d->ref = 1;
|
|
|
+ for (i = 0; i < NFACTIVE; i++)
|
|
|
+ INIT_LIST_HEAD(&d->factive[i]);
|
|
|
d->sysminor = sysminor;
|
|
|
- d->aoemajor = AOEMAJOR(sysminor);
|
|
|
- d->aoeminor = AOEMINOR(sysminor);
|
|
|
+ d->aoemajor = maj;
|
|
|
+ d->aoeminor = min;
|
|
|
d->mintimer = MINTIMER;
|
|
|
d->next = devlist;
|
|
|
devlist = d;
|
|
@@ -241,13 +373,23 @@ aoedev_by_sysminor_m(ulong sysminor)
|
|
|
static void
|
|
|
freetgt(struct aoedev *d, struct aoetgt *t)
|
|
|
{
|
|
|
- struct frame *f, *e;
|
|
|
+ struct frame *f;
|
|
|
+ struct list_head *pos, *nx, *head;
|
|
|
+ struct aoeif *ifp;
|
|
|
|
|
|
- f = t->frames;
|
|
|
- e = f + t->nframes;
|
|
|
- for (; f < e; f++)
|
|
|
+ for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
|
|
|
+ if (!ifp->nd)
|
|
|
+ break;
|
|
|
+ dev_put(ifp->nd);
|
|
|
+ }
|
|
|
+
|
|
|
+ head = &t->ffree;
|
|
|
+ list_for_each_safe(pos, nx, head) {
|
|
|
+ list_del(pos);
|
|
|
+ f = list_entry(pos, struct frame, head);
|
|
|
skbfree(f->skb);
|
|
|
- kfree(t->frames);
|
|
|
+ kfree(f);
|
|
|
+ }
|
|
|
kfree(t);
|
|
|
}
|
|
|
|
|
@@ -257,6 +399,7 @@ aoedev_exit(void)
|
|
|
struct aoedev *d;
|
|
|
ulong flags;
|
|
|
|
|
|
+ aoe_flush_iocq();
|
|
|
while ((d = devlist)) {
|
|
|
devlist = d->next;
|
|
|
|