nvme.c

/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/version.h>

#define NVME_Q_DEPTH		1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define NVME_MINORS		64

static int nvme_major;
module_param(nvme_major, int, 0);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	u32 __iomem *dbs;
	struct pci_dev *pci_dev;
	int instance;
	int queue_count;
	u32 ctrl_config;
	struct msix_entry *entry;
	struct nvme_bar __iomem *bar;
	struct list_head namespaces;
};

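/*
 * queues[0] is the admin queue; queues[1] is the single I/O queue created by
 * nvme_setup_io_queues().  dbs points to the doorbell registers, which start
 * 4096 bytes into the controller's BAR (see nvme_configure_admin_queue()).
 */
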
/*
 * An NVM Express namespace is equivalent to a SCSI LUN
 */
struct nvme_ns {
	struct list_head list;
	struct nvme_dev *dev;
	struct request_queue *queue;
	struct gendisk *disk;
	int ns_id;
	int lba_shift;
};

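/*
 * lba_shift is the log2 of the namespace's LBA data size, taken from the
 * Identify Namespace LBA format (id->lbaf[id->flbas & 0xf].ds); shifting by
 * (lba_shift - 9) converts between LBAs and 512-byte sectors.
 */
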
/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_cycle;
	unsigned long cmdid_data[];
};

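/*
 * cmdid_data is laid out as a bitmap of q_depth bits (BITS_TO_LONGS(q_depth)
 * longs) marking which command IDs are in use, followed by q_depth unsigned
 * longs holding the per-command context word; see alloc_cmdid(), free_cmdid()
 * and the "extra" allocation in nvme_alloc_queue().
 */
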
/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
}

/**
 * alloc_cmdid - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The ID of the handler to call
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler)
{
	int depth = nvmeq->q_depth;
	unsigned long data = (unsigned long)ctx | handler;
	int cmdid;

	BUG_ON((unsigned long)ctx & 3);

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
								int handler)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}

/* If you need more than four handlers, you'll need to change how
 * alloc_cmdid and nvme_process_cq work
 */
enum {
	sync_completion_id = 0,
	bio_completion_id,
};

static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid)
{
	unsigned long data;

	data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)];
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return data;
}

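/*
 * get_nvmeq() currently always returns the single I/O queue and put_nvmeq()
 * is a no-op; the pair exists so per-CPU queue selection can be slotted in
 * later (see the XXX in nvme_setup_io_queues()).
 */
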
static struct nvme_queue *get_nvmeq(struct nvme_ns *ns)
{
	return ns->dev->queues[1];
}

static void put_nvmeq(struct nvme_queue *nvmeq)
{
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;

	/* XXX: Need to check tail isn't going to overrun head */
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	/* The SQ tail doorbell is written with the new tail pointer */
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
	return 0;
}

struct nvme_req_info {
	struct bio *bio;
	int nents;
	struct scatterlist sg[0];
};

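/*
 * One nvme_req_info is allocated per bio; it keeps the bio pointer and its
 * scatterlist so that bio_completion() can unmap the DMA mappings and end
 * the bio when the command completes.
 */
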
/* XXX: use a mempool */
static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp)
{
	return kmalloc(sizeof(struct nvme_req_info) +
			sizeof(struct scatterlist) * nseg, gfp);
}

static void free_info(struct nvme_req_info *info)
{
	kfree(info);
}

static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_req_info *info = ctx;
	struct bio *bio = info->bio;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents,
		bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	free_info(info);
	bio_endio(bio, status ? -EIO : 0);
}

static int nvme_map_bio(struct device *dev, struct nvme_req_info *info,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec;
	struct scatterlist *sg = info->sg;
	int i, nsegs = 0;

	sg_init_table(sg, psegs);
	bio_for_each_segment(bvec, bio, i) {
		sg_set_page(&sg[nsegs], bvec->bv_page, bvec->bv_len,
							bvec->bv_offset);
		/* XXX: handle non-mergable here */
		nsegs++;
	}
	info->nents = nsegs;

	return dma_map_sg(dev, info->sg, info->nents, dma_dir);
}

static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_rw_command *cmnd;
	struct nvme_req_info *info;
	enum dma_data_direction dma_dir;
	int cmdid;
	u16 control;
	u32 dsmgmt;
	unsigned long flags;
	int psegs = bio_phys_segments(ns->queue, bio);

	info = alloc_info(psegs, GFP_NOIO);
	if (!info)
		goto congestion;
	info->bio = bio;

	cmdid = alloc_cmdid(nvmeq, info, bio_completion_id);
	if (unlikely(cmdid < 0))
		goto free_info;

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	spin_lock_irqsave(&nvmeq->q_lock, flags);
	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail].rw;

	if (bio_data_dir(bio)) {
		cmnd->opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs);

	cmnd->flags = 1;
	cmnd->command_id = cmdid;
	cmnd->nsid = cpu_to_le32(ns->ns_id);
	cmnd->prp1 = cpu_to_le64(sg_phys(info->sg));
	/* XXX: Support more than one PRP */
	cmnd->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
	cmnd->length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1);
	cmnd->control = cpu_to_le16(control);
	cmnd->dsmgmt = cpu_to_le32(dsmgmt);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	/* Ring the doorbell with the new tail pointer */
	writel(nvmeq->sq_tail, nvmeq->q_db);

	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;

 free_info:
	free_info(info);
 congestion:
	return -EBUSY;
}

/*
 * NB: return value of non-zero would mean that we were a stacking driver.
 * make_request must always succeed.
 */
static int nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns);

	if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
		blk_set_queue_congested(q, rw_is_sync(bio->bi_rw));
		bio_list_add(&nvmeq->sq_cong, bio);
	}
	put_nvmeq(nvmeq);

	return 0;
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}

typedef void (*completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

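/*
 * A completion entry is new only if the phase bit (bit 0 of the status
 * field) matches cq_cycle; the expected phase flips each time the head
 * wraps around the end of the queue.
 */
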
static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, cycle;
	static const completion_fn completions[4] = {
		[sync_completion_id] = sync_completion,
		[bio_completion_id]  = bio_completion,
	};

	head = nvmeq->cq_head;
	cycle = nvmeq->cq_cycle;

	for (;;) {
		unsigned long data;
		void *ptr;
		unsigned char handler;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != cycle)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			cycle = !cycle;
		}

		data = free_cmdid(nvmeq, cqe.command_id);
		handler = data & 3;
		ptr = (void *)(data & ~3UL);
		completions[handler](nvmeq, ptr, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && cycle == nvmeq->cq_cycle)
		return IRQ_NONE;

	writel(head, nvmeq->q_db + 1);
	nvmeq->cq_head = head;
	nvmeq->cq_cycle = cycle;

	return IRQ_HANDLED;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	return nvme_process_cq(data);
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd,
								u32 *result)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_UNINTERRUPTIBLE);
	nvme_submit_cmd(q, cmd);
	schedule();

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void nvme_free_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];

	free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue */
	if (qid) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

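/*
 * Each queue pair owns a pair of doorbell registers: dbs[qid * 2] is the
 * submission queue tail doorbell and dbs[qid * 2 + 1] is the completion
 * queue head doorbell, which is why nvme_process_cq() writes q_db + 1.
 */
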
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long);
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_cycle = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid * 2];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;

	return nvmeq;

 free_cqdma:
	/* q_depth is not yet set here, so size the free by the argument */
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
					int qid, int cq_size, int vector)
{
	int result;
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
	if (!nvmeq)
		return NULL;

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		goto free_nvmeq;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = request_irq(dev->entry[vector].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, "nvme", nvmeq);
	if (result < 0)
		goto release_sq;

	return nvmeq;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
 free_nvmeq:
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
	return NULL;
}

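/*
 * The admin queue is not created with Create CQ/SQ commands; its size and
 * base addresses are written to the AQA, ASQ and ACQ registers before the
 * controller is enabled, after which the driver polls CSTS.RDY.
 */
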
static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
	if (!nvmeq)
		return -ENOMEM;

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
	}

	result = request_irq(dev->entry[0].vector, nvme_irq,
			IRQF_DISABLED | IRQF_SHARED, "nvme admin", nvmeq);
	dev->queues[0] = nvmeq;
	return result;
}

static int nvme_identify(struct nvme_ns *ns, void __user *addr, int cns)
{
	struct nvme_dev *dev = ns->dev;
	int status;
	struct nvme_command c;
	void *page;
	dma_addr_t dma_addr;

	page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
								GFP_KERNEL);

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		status = -EIO;
	else if (copy_to_user(addr, page, 4096))
		status = -EFAULT;

	dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr);

	return status;
}

static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr)
{
	struct nvme_dev *dev = ns->dev;
	int status;
	struct nvme_command c;
	void *page;
	dma_addr_t dma_addr;

	page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
								GFP_KERNEL);

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(ns->ns_id);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	status = nvme_submit_admin_cmd(dev, &c, NULL);

	/* XXX: Assuming first range for now */
	if (status)
		status = -EIO;
	else if (copy_to_user(addr, page, 64))
		status = -EFAULT;

	dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr);

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_IDENTIFY_NS:
		return nvme_identify(ns, (void __user *)arg, 0);
	case NVME_IOCTL_IDENTIFY_CTRL:
		return nvme_identify(ns, (void __user *)arg, 1);
	case NVME_IOCTL_GET_RANGE_TYPE:
		return nvme_get_range_type(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
};

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES |
				QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD;
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = index;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * index;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

static void nvme_ns_free(struct nvme_ns *ns)
{
	put_disk(ns->disk);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}

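/*
 * The Number of Queues feature uses zero-based counts, which is why
 * set_queue_count() packs (sq_count - 1) into the low 16 bits and
 * (cq_count - 1) into the high 16 bits of dword11; the result is encoded
 * the same way, so the usable queue count is min(sq, cq) + 1.
 */
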
static int set_queue_count(struct nvme_dev *dev, int sq_count, int cq_count)
{
	int status;
	u32 result;
	struct nvme_command c;
	u32 q_count = (sq_count - 1) | ((cq_count - 1) << 16);

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES);
	c.features.dword11 = cpu_to_le32(q_count);

	status = nvme_submit_admin_cmd(dev, &c, &result);
	if (status)
		return -EIO;
	return min(result & 0xffff, result >> 16) + 1;
}

/* XXX: Create per-CPU queues */
static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
{
	int this_cpu;

	set_queue_count(dev, 1, 1);

	this_cpu = get_cpu();
	dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu);
	put_cpu();
	if (!dev->queues[1])
		return -ENOMEM;
	dev->queue_count++;

	return 0;
}

static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_free_queue(dev, i);
}

static int __devinit nvme_dev_add(struct nvme_dev *dev)
{
	int res, nn, i;
	struct nvme_ns *ns, *next;
	void *id;
	dma_addr_t dma_addr;
	struct nvme_command cid, crt;

	res = nvme_setup_io_queues(dev);
	if (res)
		return res;

	/* XXX: Switch to a SG list once prp2 works */
	id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
								GFP_KERNEL);

	memset(&cid, 0, sizeof(cid));
	cid.identify.opcode = nvme_admin_identify;
	cid.identify.nsid = 0;
	cid.identify.prp1 = cpu_to_le64(dma_addr);
	cid.identify.cns = cpu_to_le32(1);

	res = nvme_submit_admin_cmd(dev, &cid, NULL);
	if (res) {
		res = -EIO;
		goto out_free;
	}

	nn = le32_to_cpup(&((struct nvme_id_ctrl *)id)->nn);

	cid.identify.cns = 0;
	memset(&crt, 0, sizeof(crt));
	crt.features.opcode = nvme_admin_get_features;
	crt.features.prp1 = cpu_to_le64(dma_addr + 4096);
	crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	for (i = 0; i < nn; i++) {
		cid.identify.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &cid, NULL);
		if (res)
			continue;

		if (((struct nvme_id_ns *)id)->ncap == 0)
			continue;

		crt.features.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &crt, NULL);
		if (res)
			continue;

		ns = nvme_alloc_ns(dev, i, id, id + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);

	dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
	return 0;

 out_free:
	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		nvme_ns_free(ns);
	}

	dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
	return res;
}

static int nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	/* TODO: wait all I/O finished or cancel them */

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}

	nvme_free_queues(dev);

	return 0;
}

/* XXX: Use an ida or something to let remove / add work correctly */
static void nvme_set_instance(struct nvme_dev *dev)
{
	static int instance;
	dev->instance = instance++;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
}

static int __devinit nvme_probe(struct pci_dev *pdev,
						const struct pci_device_id *id)
{
	int result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(2, sizeof(void *), GFP_KERNEL);
	if (!dev->queues)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64));
	nvme_set_instance(dev);

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar) {
		result = -ENOMEM;
		goto disable;
	}

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;
	dev->queue_count++;

	result = nvme_dev_add(dev);
	if (result)
		goto delete;
	return 0;

 delete:
	nvme_free_queues(dev);
 unmap:
	iounmap(dev->bar);
 disable:
	pci_disable_msix(pdev);
	nvme_release_instance(dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void __devexit nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_remove(dev);
	pci_disable_msix(pdev);
	iounmap(dev->bar);
	nvme_release_instance(dev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL

static struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

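/*
 * The device table below matches on the NVM Express PCI class code rather
 * than on specific vendor/device IDs, so any controller reporting class
 * 0x010802 binds to this driver.
 */
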
/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= __devexit_p(nvme_remove),
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	nvme_major = register_blkdev(nvme_major, "nvme");
	if (nvme_major <= 0)
		return -EBUSY;

	result = pci_register_driver(&nvme_driver);
	if (!result)
		return 0;

	unregister_blkdev(nvme_major, "nvme");
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.1");
module_init(nvme_init);
module_exit(nvme_exit);