objio_osd.c

/*
 *  pNFS Objects layout implementation over open-osd initiator library
 *
 *  Copyright (C) 2009 Panasas Inc. [year of first publication]
 *  All rights reserved.
 *
 *  Benny Halevy <bhalevy@panasas.com>
 *  Boaz Harrosh <bharrosh@panasas.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2
 *  See the file COPYING included with this distribution for more details.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. Neither the name of the Panasas company nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/module.h>
#include <scsi/osd_initiator.h>

#include "objlayout.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS_LD

#define _LLU(x) ((unsigned long long)x)

enum { BIO_MAX_PAGES_KMALLOC =
        (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
};
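/*
 * An objio_dev_ent pairs a pnfs deviceid cache node with the open-osd
 * device handle (struct osd_dev) it resolves to, so the same OSD can be
 * shared by every layout segment that references its device id.
 */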
struct objio_dev_ent {
        struct nfs4_deviceid_node id_node;
        struct osd_dev *od;
};

static void
objio_free_deviceid_node(struct nfs4_deviceid_node *d)
{
        struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);

        dprintk("%s: free od=%p\n", __func__, de->od);
        osduld_put_device(de->od);
        kfree(de);
}

static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
        const struct nfs4_deviceid *d_id)
{
        struct nfs4_deviceid_node *d;
        struct objio_dev_ent *de;

        d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
        if (!d)
                return NULL;

        de = container_of(d, struct objio_dev_ent, id_node);
        return de;
}
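/*
 * Insert a freshly looked-up device into the deviceid cache.  If another
 * thread won the race and inserted an entry for the same device id first,
 * our duplicate is freed and the winner's entry is returned, with an extra
 * reference taken either way.
 */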
static struct objio_dev_ent *
_dev_list_add(const struct nfs_server *nfss,
        const struct nfs4_deviceid *d_id, struct osd_dev *od,
        gfp_t gfp_flags)
{
        struct nfs4_deviceid_node *d;
        struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
        struct objio_dev_ent *n;

        if (!de) {
                dprintk("%s: -ENOMEM od=%p\n", __func__, od);
                return NULL;
        }

        dprintk("%s: Adding od=%p\n", __func__, od);
        nfs4_init_deviceid_node(&de->id_node,
                                nfss->pnfs_curr_ld,
                                nfss->nfs_client,
                                d_id);
        de->od = od;

        d = nfs4_insert_deviceid_node(&de->id_node);
        n = container_of(d, struct objio_dev_ent, id_node);
        if (n != de) {
                dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
                objio_free_deviceid_node(&de->id_node);
                de = n;
        }

        atomic_inc(&de->id_node.ref);
        return de;
}

struct caps_buffers {
        u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
        u8 creds[OSD_CAP_LEN];
};
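/*
 * Per-layout-segment state: embeds the generic pnfs_layout_segment and
 * caches the decoded RAID-0 geometry (stripe_unit, group_width/depth/count,
 * mirrors_p1 == mirror_cnt + 1) together with the component credentials and
 * a variable-length array of resolved device entries.
 */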
struct objio_segment {
        struct pnfs_layout_segment lseg;

        struct pnfs_osd_object_cred *comps;

        unsigned mirrors_p1;
        unsigned stripe_unit;
        unsigned group_width;   /* Data stripe_units without integrity comps */
        u64 group_depth;
        unsigned group_count;

        unsigned comps_index;
        unsigned num_comps;
        /* variable length */
        struct objio_dev_ent *ods[];
};

static inline struct objio_segment *
OBJIO_LSEG(struct pnfs_layout_segment *lseg)
{
        return container_of(lseg, struct objio_segment, lseg);
}
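/*
 * Per-I/O state: wraps the generic objlayout_io_state and carries one
 * _objio_per_comp slot (bio, osd_request, offset, length) for every device
 * that actually participates in this I/O.  The kref counts in-flight OSD
 * requests; ->done() runs when the last one completes.
 */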
struct objio_state;
typedef ssize_t (*objio_done_fn)(struct objio_state *ios);

struct objio_state {
        /* Generic layer */
        struct objlayout_io_state ol_state;

        struct objio_segment *layout;

        struct kref kref;
        objio_done_fn done;
        void *private;

        unsigned long length;
        unsigned numdevs; /* Actually used devs in this IO */
        /* A per-device variable array of size numdevs */
        struct _objio_per_comp {
                struct bio *bio;
                struct osd_request *or;
                unsigned long length;
                u64 offset;
                unsigned dev;
        } per_dev[];
};
/* Send and wait for a get_device_info of devices in the layout,
   then look them up with the osd_initiator library */
static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
                                struct objio_segment *objio_seg, unsigned comp,
                                gfp_t gfp_flags)
{
        struct pnfs_osd_deviceaddr *deviceaddr;
        struct nfs4_deviceid *d_id;
        struct objio_dev_ent *ode;
        struct osd_dev *od;
        struct osd_dev_info odi;
        int err;

        d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;

        ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
        if (ode)
                return ode;

        err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
        if (unlikely(err)) {
                dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
                        __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
                return ERR_PTR(err);
        }

        odi.systemid_len = deviceaddr->oda_systemid.len;
        if (odi.systemid_len > sizeof(odi.systemid)) {
                err = -EINVAL;
                goto out;
        } else if (odi.systemid_len)
                memcpy(odi.systemid, deviceaddr->oda_systemid.data,
                       odi.systemid_len);
        odi.osdname_len = deviceaddr->oda_osdname.len;
        odi.osdname = (u8 *)deviceaddr->oda_osdname.data;

        if (!odi.osdname_len && !odi.systemid_len) {
                dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
                        __func__);
                err = -ENODEV;
                goto out;
        }

        od = osduld_info_lookup(&odi);
        if (unlikely(IS_ERR(od))) {
                err = PTR_ERR(od);
                dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
                goto out;
        }

        ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
                            gfp_flags);

out:
        dprintk("%s: return=%d\n", __func__, err);
        objlayout_put_deviceinfo(deviceaddr);
        return err ? ERR_PTR(err) : ode;
}
static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
        struct objio_segment *objio_seg,
        gfp_t gfp_flags)
{
        unsigned i;
        int err;

        /* lookup all devices */
        for (i = 0; i < objio_seg->num_comps; i++) {
                struct objio_dev_ent *ode;

                ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
                if (unlikely(IS_ERR(ode))) {
                        err = PTR_ERR(ode);
                        goto out;
                }
                objio_seg->ods[i] = ode;
        }
        err = 0;

out:
        dprintk("%s: return=%d\n", __func__, err);
        return err;
}
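/*
 * Sanity-check the decoded data map.  Only RAID-0 layouts are accepted for
 * now, num_comps must be a whole multiple of (mirror_cnt + 1), the total
 * stripe length (stripe_unit * group_width) must fit in 32 bits, and the
 * stripe unit must be page aligned.  For example, num_comps=4 with
 * mirror_cnt=1 yields a group_width of 2 when odm_group_width is unset.
 */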
static int _verify_data_map(struct pnfs_osd_layout *layout)
{
        struct pnfs_osd_data_map *data_map = &layout->olo_map;
        u64 stripe_length;
        u32 group_width;

        /* FIXME: Only raid0 for now. if not go through MDS */
        if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
                printk(KERN_ERR "Only RAID_0 for now\n");
                return -ENOTSUPP;
        }
        if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
                printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
                       data_map->odm_num_comps, data_map->odm_mirror_cnt);
                return -EINVAL;
        }

        if (data_map->odm_group_width)
                group_width = data_map->odm_group_width;
        else
                group_width = data_map->odm_num_comps /
                                        (data_map->odm_mirror_cnt + 1);

        stripe_length = (u64)data_map->odm_stripe_unit * group_width;
        if (stripe_length >= (1ULL << 32)) {
                printk(KERN_ERR "Total Stripe length(0x%llx)"
                       " >= 32bit is not supported\n", _LLU(stripe_length));
                return -ENOTSUPP;
        }

        if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
                printk(KERN_ERR "Stripe Unit(0x%llx)"
                       " must be a multiple of PAGE_SIZE(0x%lx)\n",
                       _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
                return -ENOTSUPP;
        }

        return 0;
}
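/*
 * Copy one decoded component credential, redirecting its capability-key and
 * capability pointers into the caps_buffers storage allocated together with
 * the objio_segment, so the credentials outlive the xdr decode buffer.
 */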
static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
                             struct pnfs_osd_object_cred *src_comp,
                             struct caps_buffers *caps_p)
{
        WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
        WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));

        *cur_comp = *src_comp;

        memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
               sizeof(caps_p->caps_key));
        cur_comp->oc_cap_key.cred = caps_p->caps_key;

        memcpy(caps_p->creds, src_comp->oc_cap.cred,
               sizeof(caps_p->creds));
        cur_comp->oc_cap.cred = caps_p->creds;
}
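/*
 * Decode a layout segment.  Everything is carved out of a single kzalloc:
 *
 *   [struct objio_segment][ods[num_comps]][comps[num_comps]][caps_buffers[num_comps]]
 *
 * ->comps points just past the ods[] array, and each component's credentials
 * are copied into the trailing caps_buffers region.
 */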
int objio_alloc_lseg(struct pnfs_layout_segment **outp,
        struct pnfs_layout_hdr *pnfslay,
        struct pnfs_layout_range *range,
        struct xdr_stream *xdr,
        gfp_t gfp_flags)
{
        struct objio_segment *objio_seg;
        struct pnfs_osd_xdr_decode_layout_iter iter;
        struct pnfs_osd_layout layout;
        struct pnfs_osd_object_cred *cur_comp, src_comp;
        struct caps_buffers *caps_p;
        int err;

        err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
        if (unlikely(err))
                return err;

        err = _verify_data_map(&layout);
        if (unlikely(err))
                return err;

        objio_seg = kzalloc(sizeof(*objio_seg) +
                            sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
                            sizeof(*objio_seg->comps) * layout.olo_num_comps +
                            sizeof(struct caps_buffers) * layout.olo_num_comps,
                            gfp_flags);
        if (!objio_seg)
                return -ENOMEM;

        objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
        cur_comp = objio_seg->comps;
        caps_p = (void *)(cur_comp + layout.olo_num_comps);
        while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
                copy_single_comp(cur_comp++, &src_comp, caps_p++);
        if (unlikely(err))
                goto err;

        objio_seg->num_comps = layout.olo_num_comps;
        objio_seg->comps_index = layout.olo_comps_index;
        err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
        if (err)
                goto err;

        objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
        objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
        if (layout.olo_map.odm_group_width) {
                objio_seg->group_width = layout.olo_map.odm_group_width;
                objio_seg->group_depth = layout.olo_map.odm_group_depth;
                objio_seg->group_count = layout.olo_map.odm_num_comps /
                                                objio_seg->mirrors_p1 /
                                                objio_seg->group_width;
        } else {
                objio_seg->group_width = layout.olo_map.odm_num_comps /
                                                objio_seg->mirrors_p1;
                objio_seg->group_depth = -1;
                objio_seg->group_count = 1;
        }

        *outp = &objio_seg->lseg;
        return 0;

err:
        kfree(objio_seg);
        dprintk("%s: Error: return %d\n", __func__, err);
        *outp = NULL;
        return err;
}
void objio_free_lseg(struct pnfs_layout_segment *lseg)
{
        int i;
        struct objio_segment *objio_seg = OBJIO_LSEG(lseg);

        for (i = 0; i < objio_seg->num_comps; i++) {
                if (!objio_seg->ods[i])
                        break;
                nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
        }
        kfree(objio_seg);
}
int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
                         struct objlayout_io_state **outp,
                         gfp_t gfp_flags)
{
        struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
        struct objio_state *ios;
        const unsigned first_size = sizeof(*ios) +
                                objio_seg->num_comps * sizeof(ios->per_dev[0]);
        const unsigned sec_size = objio_seg->num_comps *
                                        sizeof(ios->ol_state.ioerrs[0]);

        ios = kzalloc(first_size + sec_size, gfp_flags);
        if (unlikely(!ios))
                return -ENOMEM;

        ios->layout = objio_seg;
        ios->ol_state.ioerrs = ((void *)ios) + first_size;
        ios->ol_state.num_comps = objio_seg->num_comps;

        *outp = &ios->ol_state;
        return 0;
}

void objio_free_io_state(struct objlayout_io_state *ol_state)
{
        struct objio_state *ios = container_of(ol_state, struct objio_state,
                                               ol_state);

        kfree(ios);
}
enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
{
        switch (oep) {
        case OSD_ERR_PRI_NO_ERROR:
                return (enum pnfs_osd_errno)0;

        case OSD_ERR_PRI_CLEAR_PAGES:
                BUG_ON(1);
                return 0;

        case OSD_ERR_PRI_RESOURCE:
                return PNFS_OSD_ERR_RESOURCE;
        case OSD_ERR_PRI_BAD_CRED:
                return PNFS_OSD_ERR_BAD_CRED;
        case OSD_ERR_PRI_NO_ACCESS:
                return PNFS_OSD_ERR_NO_ACCESS;
        case OSD_ERR_PRI_UNREACHABLE:
                return PNFS_OSD_ERR_UNREACHABLE;
        case OSD_ERR_PRI_NOT_FOUND:
                return PNFS_OSD_ERR_NOT_FOUND;
        case OSD_ERR_PRI_NO_SPACE:
                return PNFS_OSD_ERR_NO_SPACE;
        default:
                WARN_ON(1);
                /* fallthrough */
        case OSD_ERR_PRI_EIO:
                return PNFS_OSD_ERR_EIO;
        }
}
static void _clear_bio(struct bio *bio)
{
        struct bio_vec *bv;
        unsigned i;

        __bio_for_each_segment(bv, bio, i, 0) {
                unsigned this_count = bv->bv_len;

                if (likely(PAGE_SIZE == this_count))
                        clear_highpage(bv->bv_page);
                else
                        zero_user(bv->bv_page, bv->bv_offset, this_count);
        }
}
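/*
 * Walk the per-device requests and decode their OSD sense data.  A read
 * that started past the end of the object is not treated as an error: the
 * bio is zero-filled and the request is considered recovered.  Real errors
 * are reported to the generic objlayout code, and the highest-priority
 * error code is returned.
 */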
static int _io_check(struct objio_state *ios, bool is_write)
{
        enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
        int lin_ret = 0;
        int i;

        for (i = 0; i < ios->numdevs; i++) {
                struct osd_sense_info osi;
                struct osd_request *or = ios->per_dev[i].or;
                unsigned dev;
                int ret;

                if (!or)
                        continue;

                ret = osd_req_decode_sense(or, &osi);
                if (likely(!ret))
                        continue;

                if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
                        /* start read offset was past the end of file */
                        BUG_ON(is_write);
                        _clear_bio(ios->per_dev[i].bio);
                        dprintk("%s: start read offset passed end of file "
                                "offset=0x%llx, length=0x%lx\n", __func__,
                                _LLU(ios->per_dev[i].offset),
                                ios->per_dev[i].length);

                        continue; /* we recovered */
                }
                dev = ios->per_dev[i].dev;
                objlayout_io_set_result(&ios->ol_state, dev,
                                        &ios->layout->comps[dev].oc_object_id,
                                        osd_pri_2_pnfs_err(osi.osd_err_pri),
                                        ios->per_dev[i].offset,
                                        ios->per_dev[i].length,
                                        is_write);

                if (osi.osd_err_pri >= oep) {
                        oep = osi.osd_err_pri;
                        lin_ret = ret;
                }
        }

        return lin_ret;
}
/*
 * Common IO state helpers.
 */
static void _io_free(struct objio_state *ios)
{
        unsigned i;

        for (i = 0; i < ios->numdevs; i++) {
                struct _objio_per_comp *per_dev = &ios->per_dev[i];

                if (per_dev->or) {
                        osd_end_request(per_dev->or);
                        per_dev->or = NULL;
                }

                if (per_dev->bio) {
                        bio_put(per_dev->bio);
                        per_dev->bio = NULL;
                }
        }
}

struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
{
        unsigned min_dev = ios->layout->comps_index;
        unsigned max_dev = min_dev + ios->layout->num_comps;

        BUG_ON(dev < min_dev || max_dev <= dev);
        return ios->layout->ods[dev - min_dev]->od;
}
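/*
 * RAID-0 striping geometry used below (L is the file offset):
 *
 *   U = stripe_unit * group_width       bytes in one stripe across a group
 *   T = U * group_depth                 bytes stored in one group
 *   S = T * group_count                 bytes in one full cycle of groups
 *   M = L / S,  G = (L - M*S) / T,  H = L - M*S - G*T,  N = H / U
 *
 * Worked example (illustrative numbers, not from any particular layout):
 * stripe_unit = 64K, group_width = 4, group_depth = 16, group_count = 2,
 * mirrors_p1 = 1 and L = 5M give U = 256K, T = 4M, S = 8M, hence M = 0,
 * G = 1, H = 1M, N = 4, unit_off = 0.  The I/O therefore starts on device
 * (H - N*U)/stripe_unit + G*group_width = 4, at object offset
 * N*stripe_unit = 256K, with group_length = T - H = 3M left in that group.
 */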
struct _striping_info {
        u64 obj_offset;
        u64 group_length;
        unsigned dev;
        unsigned unit_off;
};

static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
                              struct _striping_info *si)
{
        u32 stripe_unit = ios->layout->stripe_unit;
        u32 group_width = ios->layout->group_width;
        u64 group_depth = ios->layout->group_depth;
        u32 U = stripe_unit * group_width;

        u64 T = U * group_depth;
        u64 S = T * ios->layout->group_count;
        u64 M = div64_u64(file_offset, S);

        /*
          G = (L - (M * S)) / T
          H = (L - (M * S)) % T
        */
        u64 LmodU = file_offset - M * S;
        u32 G = div64_u64(LmodU, T);
        u64 H = LmodU - G * T;

        u32 N = div_u64(H, U);

        div_u64_rem(file_offset, stripe_unit, &si->unit_off);
        si->obj_offset = si->unit_off + (N * stripe_unit) +
                                  (M * group_depth * stripe_unit);

        /* "H - (N * U)" is just "H % U" so it's bound to u32 */
        si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
        si->dev *= ios->layout->mirrors_p1;

        si->group_length = T - H;
}
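/*
 * Append up to cur_len bytes of the request's page list to per_dev->bio,
 * allocating the bio on first use.  The bio is sized by estimating how many
 * of the I/O's pages will land on each stripe device, capped at
 * BIO_MAX_PAGES_KMALLOC.
 */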
static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
                unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
                gfp_t gfp_flags)
{
        unsigned pg = *cur_pg;
        struct request_queue *q =
                        osd_request_queue(_io_od(ios, per_dev->dev));

        per_dev->length += cur_len;

        if (per_dev->bio == NULL) {
                unsigned stripes = ios->layout->num_comps /
                                        ios->layout->mirrors_p1;
                unsigned pages_in_stripe = stripes *
                                        (ios->layout->stripe_unit / PAGE_SIZE);
                unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
                                        stripes;

                if (BIO_MAX_PAGES_KMALLOC < bio_size)
                        bio_size = BIO_MAX_PAGES_KMALLOC;

                per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
                if (unlikely(!per_dev->bio)) {
                        dprintk("Failed to allocate BIO size=%u\n", bio_size);
                        return -ENOMEM;
                }
        }

        while (cur_len > 0) {
                unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
                unsigned added_len;

                BUG_ON(ios->ol_state.nr_pages <= pg);
                cur_len -= pglen;

                added_len = bio_add_pc_page(q, per_dev->bio,
                                        ios->ol_state.pages[pg], pglen, pgbase);
                if (unlikely(pglen != added_len))
                        return -ENOMEM;
                pgbase = 0;
                ++pg;
        }
        BUG_ON(cur_len);

        *cur_pg = pg;
        return 0;
}
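/*
 * Lay out one group's worth of the request across its devices.  The first
 * time a device is touched its object offset is derived from the stripe
 * info: devices before si->dev begin at the next stripe unit, si->dev
 * itself begins at si->obj_offset, and devices after it begin at the start
 * of the current stripe unit.
 */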
static int _prepare_one_group(struct objio_state *ios, u64 length,
                              struct _striping_info *si, unsigned *last_pg,
                              gfp_t gfp_flags)
{
        unsigned stripe_unit = ios->layout->stripe_unit;
        unsigned mirrors_p1 = ios->layout->mirrors_p1;
        unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
        unsigned dev = si->dev;
        unsigned first_dev = dev - (dev % devs_in_group);
        unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
        unsigned cur_pg = *last_pg;
        int ret = 0;

        while (length) {
                struct _objio_per_comp *per_dev = &ios->per_dev[dev];
                unsigned cur_len, page_off = 0;

                if (!per_dev->length) {
                        per_dev->dev = dev;
                        if (dev < si->dev) {
                                per_dev->offset = si->obj_offset + stripe_unit -
                                                  si->unit_off;
                                cur_len = stripe_unit;
                        } else if (dev == si->dev) {
                                per_dev->offset = si->obj_offset;
                                cur_len = stripe_unit - si->unit_off;
                                page_off = si->unit_off & ~PAGE_MASK;
                                BUG_ON(page_off &&
                                      (page_off != ios->ol_state.pgbase));
                        } else { /* dev > si->dev */
                                per_dev->offset = si->obj_offset - si->unit_off;
                                cur_len = stripe_unit;
                        }

                        if (max_comp < dev)
                                max_comp = dev;
                } else {
                        cur_len = stripe_unit;
                }
                if (cur_len >= length)
                        cur_len = length;

                ret = _add_stripe_unit(ios, &cur_pg, page_off, per_dev,
                                       cur_len, gfp_flags);
                if (unlikely(ret))
                        goto out;

                dev += mirrors_p1;
                dev = (dev % devs_in_group) + first_dev;

                length -= cur_len;
                ios->length += cur_len;
        }
out:
        ios->numdevs = max_comp + mirrors_p1;
        *last_pg = cur_pg;
        return ret;
}
static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
{
        u64 length = ios->ol_state.count;
        u64 offset = ios->ol_state.offset;
        struct _striping_info si;
        unsigned last_pg = 0;
        int ret = 0;

        while (length) {
                _calc_stripe_info(ios, offset, &si);

                if (length < si.group_length)
                        si.group_length = length;

                ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
                if (unlikely(ret))
                        goto out;

                offset += si.group_length;
                length -= si.group_length;
        }

out:
        if (!ios->length)
                return ret;

        return 0;
}
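/*
 * Completion machinery shared by the read and write paths.  ios->kref is
 * initialized to one for the submitter and incremented once per queued OSD
 * request; when the last reference drops, ios->done() runs.  For synchronous
 * I/O, done is temporarily pointed at _sync_done so the submitter can wait
 * on an on-stack completion and then invoke the saved done function itself.
 */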
static ssize_t _sync_done(struct objio_state *ios)
{
        struct completion *waiting = ios->private;

        complete(waiting);
        return 0;
}

static void _last_io(struct kref *kref)
{
        struct objio_state *ios = container_of(kref, struct objio_state, kref);

        ios->done(ios);
}

static void _done_io(struct osd_request *or, void *p)
{
        struct objio_state *ios = p;

        kref_put(&ios->kref, _last_io);
}

static ssize_t _io_exec(struct objio_state *ios)
{
        DECLARE_COMPLETION_ONSTACK(wait);
        ssize_t status = 0; /* sync status */
        unsigned i;
        objio_done_fn saved_done_fn = ios->done;
        bool sync = ios->ol_state.sync;

        if (sync) {
                ios->done = _sync_done;
                ios->private = &wait;
        }

        kref_init(&ios->kref);

        for (i = 0; i < ios->numdevs; i++) {
                struct osd_request *or = ios->per_dev[i].or;

                if (!or)
                        continue;

                kref_get(&ios->kref);
                osd_execute_request_async(or, _done_io, ios);
        }

        kref_put(&ios->kref, _last_io);

        if (sync) {
                wait_for_completion(&wait);
                status = saved_done_fn(ios);
        }

        return status;
}
/*
 * read
 */
static ssize_t _read_done(struct objio_state *ios)
{
        ssize_t status;
        int ret = _io_check(ios, false);

        _io_free(ios);

        if (likely(!ret))
                status = ios->length;
        else
                status = ret;

        objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
        return status;
}

static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
{
        struct osd_request *or = NULL;
        struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
        unsigned dev = per_dev->dev;
        struct pnfs_osd_object_cred *cred =
                        &ios->layout->comps[dev];
        struct osd_obj_id obj = {
                .partition = cred->oc_object_id.oid_partition_id,
                .id = cred->oc_object_id.oid_object_id,
        };
        int ret;

        or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
        if (unlikely(!or)) {
                ret = -ENOMEM;
                goto err;
        }
        per_dev->or = or;

        osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);

        ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
        if (ret) {
                dprintk("%s: Failed to osd_finalize_request() => %d\n",
                        __func__, ret);
                goto err;
        }

        dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
                __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
                per_dev->length);

err:
        return ret;
}

static ssize_t _read_exec(struct objio_state *ios)
{
        unsigned i;
        int ret;

        for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
                if (!ios->per_dev[i].length)
                        continue;
                ret = _read_mirrors(ios, i);
                if (unlikely(ret))
                        goto err;
        }

        ios->done = _read_done;
        return _io_exec(ios); /* In sync mode exec returns the io status */

err:
        _io_free(ios);
        return ret;
}

ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
{
        struct objio_state *ios = container_of(ol_state, struct objio_state,
                                               ol_state);
        int ret;

        ret = _io_rw_pagelist(ios, GFP_KERNEL);
        if (unlikely(ret))
                return ret;

        return _read_exec(ios);
}
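/*
 * The write path mirrors the read path, with one extra wrinkle: for every
 * mirror device beyond the first, the master device's bio is cloned with
 * __bio_clone() so the same pages are written to each replica.
 */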
/*
 * write
 */
static ssize_t _write_done(struct objio_state *ios)
{
        ssize_t status;
        int ret = _io_check(ios, true);

        _io_free(ios);

        if (likely(!ret)) {
                /* FIXME: should be based on the OSD's persistence model
                 * See OSD2r05 Section 4.13 Data persistence model */
                ios->ol_state.committed = NFS_FILE_SYNC;
                status = ios->length;
        } else {
                status = ret;
        }

        objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
        return status;
}

static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
{
        struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
        unsigned dev = ios->per_dev[cur_comp].dev;
        unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
        int ret;

        for (; cur_comp < last_comp; ++cur_comp, ++dev) {
                struct osd_request *or = NULL;
                struct pnfs_osd_object_cred *cred =
                                &ios->layout->comps[dev];
                struct osd_obj_id obj = {
                        .partition = cred->oc_object_id.oid_partition_id,
                        .id = cred->oc_object_id.oid_object_id,
                };
                struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
                struct bio *bio;

                or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
                if (unlikely(!or)) {
                        ret = -ENOMEM;
                        goto err;
                }
                per_dev->or = or;

                if (per_dev != master_dev) {
                        bio = bio_kmalloc(GFP_NOFS,
                                          master_dev->bio->bi_max_vecs);
                        if (unlikely(!bio)) {
                                dprintk("Failed to allocate BIO size=%u\n",
                                        master_dev->bio->bi_max_vecs);
                                ret = -ENOMEM;
                                goto err;
                        }

                        __bio_clone(bio, master_dev->bio);
                        bio->bi_bdev = NULL;
                        bio->bi_next = NULL;
                        per_dev->bio = bio;
                        per_dev->dev = dev;
                        per_dev->length = master_dev->length;
                        per_dev->offset = master_dev->offset;
                } else {
                        bio = master_dev->bio;
                        bio->bi_rw |= REQ_WRITE;
                }

                osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);

                ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
                if (ret) {
                        dprintk("%s: Failed to osd_finalize_request() => %d\n",
                                __func__, ret);
                        goto err;
                }

                dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
                        __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
                        per_dev->length);
        }

err:
        return ret;
}

static ssize_t _write_exec(struct objio_state *ios)
{
        unsigned i;
        int ret;

        for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
                if (!ios->per_dev[i].length)
                        continue;
                ret = _write_mirrors(ios, i);
                if (unlikely(ret))
                        goto err;
        }

        ios->done = _write_done;
        return _io_exec(ios); /* In sync mode exec returns the io->status */

err:
        _io_free(ios);
        return ret;
}

ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
{
        struct objio_state *ios = container_of(ol_state, struct objio_state,
                                               ol_state);
        int ret;

        /* TODO: ios->stable = stable; */
        ret = _io_rw_pagelist(ios, GFP_NOFS);
        if (unlikely(ret))
                return ret;

        return _write_exec(ios);
}
static struct pnfs_layoutdriver_type objlayout_type = {
        .id = LAYOUT_OSD2_OBJECTS,
        .name = "LAYOUT_OSD2_OBJECTS",
        .flags = PNFS_LAYOUTRET_ON_SETATTR,

        .alloc_layout_hdr = objlayout_alloc_layout_hdr,
        .free_layout_hdr = objlayout_free_layout_hdr,

        .alloc_lseg = objlayout_alloc_lseg,
        .free_lseg = objlayout_free_lseg,

        .read_pagelist = objlayout_read_pagelist,
        .write_pagelist = objlayout_write_pagelist,
        .pg_test = pnfs_generic_pg_test,

        .free_deviceid_node = objio_free_deviceid_node,

        .encode_layoutcommit = objlayout_encode_layoutcommit,
        .encode_layoutreturn = objlayout_encode_layoutreturn,
};

MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
MODULE_LICENSE("GPL");

static int __init
objlayout_init(void)
{
        int ret = pnfs_register_layoutdriver(&objlayout_type);

        if (ret)
                printk(KERN_INFO
                        "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
                        __func__, ret);
        else
                printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
                        __func__);
        return ret;
}

static void __exit
objlayout_exit(void)
{
        pnfs_unregister_layoutdriver(&objlayout_type);
        printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
                __func__);
}

module_init(objlayout_init);
module_exit(objlayout_exit);