/* fs/nfs/objlayout/objio_osd.c */
/*
 *  pNFS Objects layout implementation over open-osd initiator library
 *
 *  Copyright (C) 2009 Panasas Inc. [year of first publication]
 *  All rights reserved.
 *
 *  Benny Halevy <bhalevy@panasas.com>
 *  Boaz Harrosh <bharrosh@panasas.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2
 *  See the file COPYING included with this distribution for more details.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. Neither the name of the Panasas company nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <linux/module.h>
#include <scsi/osd_initiator.h>

#include "objlayout.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS_LD

/* Cast helper for printing u64-ish values with %llx/%llu. */
#define _LLU(x) ((unsigned long long)x)

/* Max number of bio_vecs that fit, together with the bio header, in a
 * single kmalloc()ed page — the largest bio bio_kmalloc() should ask for.
 */
enum { BIO_MAX_PAGES_KMALLOC =
	(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
};
/*
 * Per-device cache entry: pairs the generic pNFS deviceid node (kept in
 * the nfs4 deviceid cache) with the open-osd device handle it maps to.
 */
struct objio_dev_ent {
	struct nfs4_deviceid_node id_node;	/* embedded cache node */
	struct osd_dev *od;			/* handle from osduld_*() */
};
  51. static void
  52. objio_free_deviceid_node(struct nfs4_deviceid_node *d)
  53. {
  54. struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
  55. dprintk("%s: free od=%p\n", __func__, de->od);
  56. osduld_put_device(de->od);
  57. kfree(de);
  58. }
  59. static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
  60. const struct nfs4_deviceid *d_id)
  61. {
  62. struct nfs4_deviceid_node *d;
  63. struct objio_dev_ent *de;
  64. d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
  65. if (!d)
  66. return NULL;
  67. de = container_of(d, struct objio_dev_ent, id_node);
  68. return de;
  69. }
/*
 * Allocate a device-cache entry for @od and insert it into the generic
 * deviceid cache.  If another thread raced us and inserted an entry for
 * the same @d_id first, our entry is freed (dropping @od) and the
 * winner is used instead.  The returned entry carries an extra reference
 * for the caller.  Returns NULL on allocation failure (@od untouched).
 */
static struct objio_dev_ent *
_dev_list_add(const struct nfs_server *nfss,
	const struct nfs4_deviceid *d_id, struct osd_dev *od,
	gfp_t gfp_flags)
{
	struct nfs4_deviceid_node *d;
	struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
	struct objio_dev_ent *n;

	if (!de) {
		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
		return NULL;
	}

	dprintk("%s: Adding od=%p\n", __func__, od);
	nfs4_init_deviceid_node(&de->id_node,
				nfss->pnfs_curr_ld,
				nfss->nfs_client,
				d_id);
	de->od = od;

	/* insert returns the node actually in the cache: ours, or the
	 * pre-existing one if we lost the insertion race */
	d = nfs4_insert_deviceid_node(&de->id_node);
	n = container_of(d, struct objio_dev_ent, id_node);
	if (n != de) {
		dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
		objio_free_deviceid_node(&de->id_node);
		de = n;
	}

	/* reference handed to the caller */
	atomic_inc(&de->id_node.ref);
	return de;
}
/*
 * Backing storage for one component's credentials; the decoded
 * pnfs_osd_object_cred pointers are redirected here by
 * copy_single_comp() so the credentials outlive the XDR stream.
 */
struct caps_buffers {
	u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
	u8 creds[OSD_CAP_LEN];
};
/*
 * Objects-layout private layout segment.  Allocated as a single blob in
 * objio_alloc_lseg(): the struct, the ods[] array, the comps[]
 * credential array and their caps_buffers all live in one allocation.
 */
struct objio_segment {
	struct pnfs_layout_segment lseg;	/* generic part, must embed */
	struct pnfs_osd_object_cred *comps;	/* points into this blob */
	unsigned mirrors_p1;		/* mirror count + 1 */
	unsigned stripe_unit;
	unsigned group_width;	/* Data stripe_units without integrity comps */
	u64 group_depth;
	unsigned group_count;
	unsigned comps_index;	/* first device index covered by this seg */
	unsigned num_comps;	/* entries in comps[]/ods[] */
	/* variable length */
	struct objio_dev_ent *ods[];	/* resolved devices, NULL-terminated
					 * only while partially looked up */
};
/* Up-cast from the generic lseg embedded in struct objio_segment. */
static inline struct objio_segment *
OBJIO_LSEG(struct pnfs_layout_segment *lseg)
{
	return container_of(lseg, struct objio_segment, lseg);
}
struct objio_state;
/* Completion callback invoked when the last request of an IO finishes. */
typedef ssize_t (*objio_done_fn)(struct objio_state *ios);

/*
 * Per-IO state: the generic objlayout state plus the per-device
 * requests/bios this IO was split into.
 */
struct objio_state {
	/* Generic layer */
	struct objlayout_io_state ol_state;

	struct objio_segment *layout;	/* segment this IO runs under */

	struct kref kref;		/* counts in-flight osd requests */
	objio_done_fn done;		/* called by _last_io() */
	void *private;			/* done-callback context (e.g.
					 * completion in sync mode) */

	unsigned long length;		/* total bytes prepared for IO */
	unsigned numdevs;		/* Actually used devs in this IO */
	/* A per-device variable array of size numdevs */
	struct _objio_per_comp {
		struct bio *bio;
		struct osd_request *or;
		unsigned long length;	/* bytes on this device */
		u64 offset;		/* object byte offset */
		unsigned dev;		/* global component index */
	} per_dev[];
};
  140. /* Send and wait for a get_device_info of devices in the layout,
  141. then look them up with the osd_initiator library */
  142. static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
  143. struct objio_segment *objio_seg, unsigned comp,
  144. gfp_t gfp_flags)
  145. {
  146. struct pnfs_osd_deviceaddr *deviceaddr;
  147. struct nfs4_deviceid *d_id;
  148. struct objio_dev_ent *ode;
  149. struct osd_dev *od;
  150. struct osd_dev_info odi;
  151. int err;
  152. d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
  153. ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
  154. if (ode)
  155. return ode;
  156. err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
  157. if (unlikely(err)) {
  158. dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
  159. __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
  160. return ERR_PTR(err);
  161. }
  162. odi.systemid_len = deviceaddr->oda_systemid.len;
  163. if (odi.systemid_len > sizeof(odi.systemid)) {
  164. err = -EINVAL;
  165. goto out;
  166. } else if (odi.systemid_len)
  167. memcpy(odi.systemid, deviceaddr->oda_systemid.data,
  168. odi.systemid_len);
  169. odi.osdname_len = deviceaddr->oda_osdname.len;
  170. odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
  171. if (!odi.osdname_len && !odi.systemid_len) {
  172. dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
  173. __func__);
  174. err = -ENODEV;
  175. goto out;
  176. }
  177. od = osduld_info_lookup(&odi);
  178. if (unlikely(IS_ERR(od))) {
  179. err = PTR_ERR(od);
  180. dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
  181. goto out;
  182. }
  183. ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
  184. gfp_flags);
  185. out:
  186. dprintk("%s: return=%d\n", __func__, err);
  187. objlayout_put_deviceinfo(deviceaddr);
  188. return err ? ERR_PTR(err) : ode;
  189. }
  190. static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
  191. struct objio_segment *objio_seg,
  192. gfp_t gfp_flags)
  193. {
  194. unsigned i;
  195. int err;
  196. /* lookup all devices */
  197. for (i = 0; i < objio_seg->num_comps; i++) {
  198. struct objio_dev_ent *ode;
  199. ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
  200. if (unlikely(IS_ERR(ode))) {
  201. err = PTR_ERR(ode);
  202. goto out;
  203. }
  204. objio_seg->ods[i] = ode;
  205. }
  206. err = 0;
  207. out:
  208. dprintk("%s: return=%d\n", __func__, err);
  209. return err;
  210. }
  211. static int _verify_data_map(struct pnfs_osd_layout *layout)
  212. {
  213. struct pnfs_osd_data_map *data_map = &layout->olo_map;
  214. u64 stripe_length;
  215. u32 group_width;
  216. /* FIXME: Only raid0 for now. if not go through MDS */
  217. if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
  218. printk(KERN_ERR "Only RAID_0 for now\n");
  219. return -ENOTSUPP;
  220. }
  221. if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
  222. printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
  223. data_map->odm_num_comps, data_map->odm_mirror_cnt);
  224. return -EINVAL;
  225. }
  226. if (data_map->odm_group_width)
  227. group_width = data_map->odm_group_width;
  228. else
  229. group_width = data_map->odm_num_comps /
  230. (data_map->odm_mirror_cnt + 1);
  231. stripe_length = (u64)data_map->odm_stripe_unit * group_width;
  232. if (stripe_length >= (1ULL << 32)) {
  233. printk(KERN_ERR "Total Stripe length(0x%llx)"
  234. " >= 32bit is not supported\n", _LLU(stripe_length));
  235. return -ENOTSUPP;
  236. }
  237. if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
  238. printk(KERN_ERR "Stripe Unit(0x%llx)"
  239. " must be Multples of PAGE_SIZE(0x%lx)\n",
  240. _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
  241. return -ENOTSUPP;
  242. }
  243. return 0;
  244. }
  245. static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
  246. struct pnfs_osd_object_cred *src_comp,
  247. struct caps_buffers *caps_p)
  248. {
  249. WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
  250. WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
  251. *cur_comp = *src_comp;
  252. memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
  253. sizeof(caps_p->caps_key));
  254. cur_comp->oc_cap_key.cred = caps_p->caps_key;
  255. memcpy(caps_p->creds, src_comp->oc_cap.cred,
  256. sizeof(caps_p->creds));
  257. cur_comp->oc_cap.cred = caps_p->creds;
  258. }
  259. int objio_alloc_lseg(struct pnfs_layout_segment **outp,
  260. struct pnfs_layout_hdr *pnfslay,
  261. struct pnfs_layout_range *range,
  262. struct xdr_stream *xdr,
  263. gfp_t gfp_flags)
  264. {
  265. struct objio_segment *objio_seg;
  266. struct pnfs_osd_xdr_decode_layout_iter iter;
  267. struct pnfs_osd_layout layout;
  268. struct pnfs_osd_object_cred *cur_comp, src_comp;
  269. struct caps_buffers *caps_p;
  270. int err;
  271. err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
  272. if (unlikely(err))
  273. return err;
  274. err = _verify_data_map(&layout);
  275. if (unlikely(err))
  276. return err;
  277. objio_seg = kzalloc(sizeof(*objio_seg) +
  278. sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
  279. sizeof(*objio_seg->comps) * layout.olo_num_comps +
  280. sizeof(struct caps_buffers) * layout.olo_num_comps,
  281. gfp_flags);
  282. if (!objio_seg)
  283. return -ENOMEM;
  284. objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
  285. cur_comp = objio_seg->comps;
  286. caps_p = (void *)(cur_comp + layout.olo_num_comps);
  287. while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
  288. copy_single_comp(cur_comp++, &src_comp, caps_p++);
  289. if (unlikely(err))
  290. goto err;
  291. objio_seg->num_comps = layout.olo_num_comps;
  292. objio_seg->comps_index = layout.olo_comps_index;
  293. err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
  294. if (err)
  295. goto err;
  296. objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
  297. objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
  298. if (layout.olo_map.odm_group_width) {
  299. objio_seg->group_width = layout.olo_map.odm_group_width;
  300. objio_seg->group_depth = layout.olo_map.odm_group_depth;
  301. objio_seg->group_count = layout.olo_map.odm_num_comps /
  302. objio_seg->mirrors_p1 /
  303. objio_seg->group_width;
  304. } else {
  305. objio_seg->group_width = layout.olo_map.odm_num_comps /
  306. objio_seg->mirrors_p1;
  307. objio_seg->group_depth = -1;
  308. objio_seg->group_count = 1;
  309. }
  310. *outp = &objio_seg->lseg;
  311. return 0;
  312. err:
  313. kfree(objio_seg);
  314. dprintk("%s: Error: return %d\n", __func__, err);
  315. *outp = NULL;
  316. return err;
  317. }
  318. void objio_free_lseg(struct pnfs_layout_segment *lseg)
  319. {
  320. int i;
  321. struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
  322. for (i = 0; i < objio_seg->num_comps; i++) {
  323. if (!objio_seg->ods[i])
  324. break;
  325. nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
  326. }
  327. kfree(objio_seg);
  328. }
  329. int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
  330. struct objlayout_io_state **outp,
  331. gfp_t gfp_flags)
  332. {
  333. struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
  334. struct objio_state *ios;
  335. const unsigned first_size = sizeof(*ios) +
  336. objio_seg->num_comps * sizeof(ios->per_dev[0]);
  337. ios = kzalloc(first_size, gfp_flags);
  338. if (unlikely(!ios))
  339. return -ENOMEM;
  340. ios->layout = objio_seg;
  341. *outp = &ios->ol_state;
  342. return 0;
  343. }
  344. void objio_free_io_state(struct objlayout_io_state *ol_state)
  345. {
  346. struct objio_state *ios = container_of(ol_state, struct objio_state,
  347. ol_state);
  348. kfree(ios);
  349. }
/*
 * Zero every page segment of @bio.  Used to present zeros to the reader
 * when a read started past the object's end and the target transferred
 * nothing (OSD_ERR_PRI_CLEAR_PAGES recovery in _io_check()).
 */
static void _clear_bio(struct bio *bio)
{
	struct bio_vec *bv;
	unsigned i;

	__bio_for_each_segment(bv, bio, i, 0) {
		unsigned this_count = bv->bv_len;

		if (likely(PAGE_SIZE == this_count))
			clear_highpage(bv->bv_page);
		else
			zero_user(bv->bv_page, bv->bv_offset, this_count);
	}
}
  362. static int _io_check(struct objio_state *ios, bool is_write)
  363. {
  364. enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
  365. int lin_ret = 0;
  366. int i;
  367. for (i = 0; i < ios->numdevs; i++) {
  368. struct osd_sense_info osi;
  369. struct osd_request *or = ios->per_dev[i].or;
  370. unsigned dev;
  371. int ret;
  372. if (!or)
  373. continue;
  374. ret = osd_req_decode_sense(or, &osi);
  375. if (likely(!ret))
  376. continue;
  377. if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
  378. /* start read offset passed endof file */
  379. BUG_ON(is_write);
  380. _clear_bio(ios->per_dev[i].bio);
  381. dprintk("%s: start read offset passed end of file "
  382. "offset=0x%llx, length=0x%lx\n", __func__,
  383. _LLU(ios->per_dev[i].offset),
  384. ios->per_dev[i].length);
  385. continue; /* we recovered */
  386. }
  387. dev = ios->per_dev[i].dev;
  388. if (osi.osd_err_pri >= oep) {
  389. oep = osi.osd_err_pri;
  390. lin_ret = ret;
  391. }
  392. }
  393. return lin_ret;
  394. }
  395. /*
  396. * Common IO state helpers.
  397. */
  398. static void _io_free(struct objio_state *ios)
  399. {
  400. unsigned i;
  401. for (i = 0; i < ios->numdevs; i++) {
  402. struct _objio_per_comp *per_dev = &ios->per_dev[i];
  403. if (per_dev->or) {
  404. osd_end_request(per_dev->or);
  405. per_dev->or = NULL;
  406. }
  407. if (per_dev->bio) {
  408. bio_put(per_dev->bio);
  409. per_dev->bio = NULL;
  410. }
  411. }
  412. }
  413. struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
  414. {
  415. unsigned min_dev = ios->layout->comps_index;
  416. unsigned max_dev = min_dev + ios->layout->num_comps;
  417. BUG_ON(dev < min_dev || max_dev <= dev);
  418. return ios->layout->ods[dev - min_dev]->od;
  419. }
/* Result of _calc_stripe_info(): where a file offset lands in the stripe. */
struct _striping_info {
	u64 obj_offset;		/* byte offset within the component object */
	u64 group_length;	/* bytes left in the current group */
	unsigned dev;		/* first (master) device of the mirror set */
	unsigned unit_off;	/* offset within the current stripe unit */
};
/*
 * Map @file_offset to RAID-0 striping coordinates (see _striping_info).
 *
 * Working variables follow the objects-layout math:
 *   U = bytes in one stripe (stripe_unit * group_width)
 *   T = bytes in one group  (U * group_depth)
 *   S = bytes in one full cycle of all groups (T * group_count)
 *   M = which cycle of groups the offset is in
 *   G = group within that cycle, H = byte offset within the group
 *   N = stripe index within the group
 */
static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
			      struct _striping_info *si)
{
	u32	stripe_unit = ios->layout->stripe_unit;
	u32	group_width = ios->layout->group_width;
	u64	group_depth = ios->layout->group_depth;
	u32	U = stripe_unit * group_width;

	u64	T = U * group_depth;
	u64	S = T * ios->layout->group_count;
	u64	M = div64_u64(file_offset, S);

	/*
	G = (L - (M * S)) / T
	H = (L - (M * S)) % T
	*/
	u64	LmodU = file_offset - M * S;
	u32	G = div64_u64(LmodU, T);
	u64	H = LmodU - G * T;

	u32	N = div_u64(H, U);

	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
	si->obj_offset = si->unit_off + (N * stripe_unit) +
				  (M * group_depth * stripe_unit);

	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
	si->dev *= ios->layout->mirrors_p1;	/* skip over the mirrors */

	si->group_length = T - H;
}
/*
 * Append @cur_len bytes of pages (starting at page index *cur_pg, first
 * page offset @pgbase) to @per_dev's bio, allocating the bio on first
 * use.  Advances *cur_pg past the consumed pages and accounts the bytes
 * in per_dev->length.  Returns 0 or -ENOMEM.
 */
static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
	unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
	gfp_t gfp_flags)
{
	unsigned pg = *cur_pg;
	struct request_queue *q =
			osd_request_queue(_io_od(ios, per_dev->dev));

	per_dev->length += cur_len;

	if (per_dev->bio == NULL) {
		/* Estimate this device's share of the IO: total pages
		 * divided among the data stripes, rounded up by one
		 * stripe's worth, and capped at a one-page bio. */
		unsigned stripes = ios->layout->num_comps /
						     ios->layout->mirrors_p1;
		unsigned pages_in_stripe = stripes *
				      (ios->layout->stripe_unit / PAGE_SIZE);
		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
				    stripes;

		if (BIO_MAX_PAGES_KMALLOC < bio_size)
			bio_size = BIO_MAX_PAGES_KMALLOC;

		per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
		if (unlikely(!per_dev->bio)) {
			dprintk("Faild to allocate BIO size=%u\n", bio_size);
			return -ENOMEM;
		}
	}

	while (cur_len > 0) {
		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
		unsigned added_len;

		BUG_ON(ios->ol_state.nr_pages <= pg);
		cur_len -= pglen;

		/* a partial add means the queue refused the page; there is
		 * no recovery short of failing the preparation */
		added_len = bio_add_pc_page(q, per_dev->bio,
					ios->ol_state.pages[pg], pglen, pgbase);
		if (unlikely(pglen != added_len))
			return -ENOMEM;
		pgbase = 0;
		++pg;
	}
	BUG_ON(cur_len);

	*cur_pg = pg;
	return 0;
}
/*
 * Distribute @length bytes of the IO over the devices of one striping
 * group, starting at the position described by @si.  Walks the devices
 * of the group round-robin (stepping over mirrors), adding one stripe
 * unit per device per pass via _add_stripe_unit().
 *
 * *last_pg tracks the next unconsumed page across calls; ios->numdevs
 * is raised to cover the highest device slot touched plus its mirrors.
 */
static int _prepare_one_group(struct objio_state *ios, u64 length,
			      struct _striping_info *si, unsigned *last_pg,
			      gfp_t gfp_flags)
{
	unsigned stripe_unit = ios->layout->stripe_unit;
	unsigned mirrors_p1 = ios->layout->mirrors_p1;
	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
	unsigned dev = si->dev;
	unsigned first_dev = dev - (dev % devs_in_group);
	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
	unsigned cur_pg = *last_pg;
	int ret = 0;

	while (length) {
		struct _objio_per_comp *per_dev = &ios->per_dev[dev];
		unsigned cur_len, page_off = 0;

		if (!per_dev->length) {
			/* first visit of this device in this IO: compute its
			 * starting object offset relative to si->dev */
			per_dev->dev = dev;
			if (dev < si->dev) {
				/* devices before the start device begin at
				 * the next stripe unit */
				per_dev->offset = si->obj_offset + stripe_unit -
								   si->unit_off;
				cur_len = stripe_unit;
			} else if (dev == si->dev) {
				per_dev->offset = si->obj_offset;
				cur_len = stripe_unit - si->unit_off;
				page_off = si->unit_off & ~PAGE_MASK;
				BUG_ON(page_off &&
				      (page_off != ios->ol_state.pgbase));
			} else { /* dev > si->dev */
				per_dev->offset = si->obj_offset - si->unit_off;
				cur_len = stripe_unit;
			}

			if (max_comp < dev)
				max_comp = dev;
		} else {
			cur_len = stripe_unit;
		}
		if (cur_len >= length)
			cur_len = length;

		ret = _add_stripe_unit(ios, &cur_pg, page_off, per_dev,
				       cur_len, gfp_flags);
		if (unlikely(ret))
			goto out;

		/* advance to the next mirror set, wrapping in the group */
		dev += mirrors_p1;
		dev = (dev % devs_in_group) + first_dev;

		length -= cur_len;
		ios->length += cur_len;
	}
out:
	ios->numdevs = max_comp + mirrors_p1;
	*last_pg = cur_pg;
	return ret;
}
/*
 * Split the IO described in ol_state (offset/count over pages[]) into
 * per-device bios, one striping group at a time.
 *
 * Partial progress is kept on error: if any bytes were prepared
 * (ios->length != 0) we return 0 and let the shorter IO proceed;
 * only a completely empty preparation propagates the error.
 */
static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
{
	u64 length = ios->ol_state.count;
	u64 offset = ios->ol_state.offset;
	struct _striping_info si;
	unsigned last_pg = 0;
	int ret = 0;

	while (length) {
		_calc_stripe_info(ios, offset, &si);

		/* never cross a group boundary in one pass */
		if (length < si.group_length)
			si.group_length = length;

		ret = _prepare_one_group(ios, si.group_length, &si, &last_pg,
					 gfp_flags);
		if (unlikely(ret))
			goto out;

		offset += si.group_length;
		length -= si.group_length;
	}

out:
	if (!ios->length)
		return ret;

	return 0;
}
  565. static ssize_t _sync_done(struct objio_state *ios)
  566. {
  567. struct completion *waiting = ios->private;
  568. complete(waiting);
  569. return 0;
  570. }
  571. static void _last_io(struct kref *kref)
  572. {
  573. struct objio_state *ios = container_of(kref, struct objio_state, kref);
  574. ios->done(ios);
  575. }
  576. static void _done_io(struct osd_request *or, void *p)
  577. {
  578. struct objio_state *ios = p;
  579. kref_put(&ios->kref, _last_io);
  580. }
/*
 * Submit every prepared request asynchronously.
 *
 * The kref counts the in-flight requests plus one reference held by
 * this function; _last_io() runs ios->done when the last one drops.
 * In sync mode ios->done is temporarily swapped for _sync_done (which
 * signals the on-stack completion) and the saved callback is invoked
 * here afterwards to produce the final status.  In async mode the
 * return value is 0 and the real status flows through ios->done.
 */
static ssize_t _io_exec(struct objio_state *ios)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	ssize_t status = 0; /* sync status */
	unsigned i;
	objio_done_fn saved_done_fn = ios->done;
	bool sync = ios->ol_state.sync;

	if (sync) {
		ios->done = _sync_done;
		ios->private = &wait;
	}

	kref_init(&ios->kref);

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_request *or = ios->per_dev[i].or;

		if (!or)
			continue;

		kref_get(&ios->kref);	/* one reference per submission */
		osd_execute_request_async(or, _done_io, ios);
	}

	/* drop this function's own reference */
	kref_put(&ios->kref, _last_io);

	if (sync) {
		wait_for_completion(&wait);
		status = saved_done_fn(ios);
	}

	return status;
}
  607. /*
  608. * read
  609. */
  610. static ssize_t _read_done(struct objio_state *ios)
  611. {
  612. ssize_t status;
  613. int ret = _io_check(ios, false);
  614. _io_free(ios);
  615. if (likely(!ret))
  616. status = ios->length;
  617. else
  618. status = ret;
  619. objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
  620. return status;
  621. }
/*
 * Build the osd read request for the per_dev slot @cur_comp.  Only the
 * first device of a mirror set is read (mirrors hold identical data).
 * Returns 0 on success; note the success path also flows through the
 * err label with ret == 0.
 */
static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
{
	struct osd_request *or = NULL;
	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
	unsigned dev = per_dev->dev;
	struct pnfs_osd_object_cred *cred =
			&ios->layout->comps[dev];
	struct osd_obj_id obj = {
		.partition = cred->oc_object_id.oid_partition_id,
		.id = cred->oc_object_id.oid_object_id,
	};
	int ret;

	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
	if (unlikely(!or)) {
		ret = -ENOMEM;
		goto err;
	}
	per_dev->or = or;	/* owned by ios; freed in _io_free() */

	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);

	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
	if (ret) {
		dprintk("%s: Faild to osd_finalize_request() => %d\n",
			__func__, ret);
		goto err;
	}

	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
		per_dev->length);

err:
	return ret;
}
  653. static ssize_t _read_exec(struct objio_state *ios)
  654. {
  655. unsigned i;
  656. int ret;
  657. for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
  658. if (!ios->per_dev[i].length)
  659. continue;
  660. ret = _read_mirrors(ios, i);
  661. if (unlikely(ret))
  662. goto err;
  663. }
  664. ios->done = _read_done;
  665. return _io_exec(ios); /* In sync mode exec returns the io status */
  666. err:
  667. _io_free(ios);
  668. return ret;
  669. }
  670. ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
  671. {
  672. struct objio_state *ios = container_of(ol_state, struct objio_state,
  673. ol_state);
  674. int ret;
  675. ret = _io_rw_pagelist(ios, GFP_KERNEL);
  676. if (unlikely(ret))
  677. return ret;
  678. return _read_exec(ios);
  679. }
  680. /*
  681. * write
  682. */
  683. static ssize_t _write_done(struct objio_state *ios)
  684. {
  685. ssize_t status;
  686. int ret = _io_check(ios, true);
  687. _io_free(ios);
  688. if (likely(!ret)) {
  689. /* FIXME: should be based on the OSD's persistence model
  690. * See OSD2r05 Section 4.13 Data persistence model */
  691. ios->ol_state.committed = NFS_FILE_SYNC;
  692. status = ios->length;
  693. } else {
  694. status = ret;
  695. }
  696. objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
  697. return status;
  698. }
/*
 * Build write requests for one whole mirror set, per_dev slots
 * [cur_comp, cur_comp + mirrors_p1).  The first slot (master) already
 * holds the bio built by _add_stripe_unit(); each additional mirror
 * gets a clone of the master bio pointing at the same pages.
 * Returns 0 on success; the success path also exits through the err
 * label with ret == 0.
 */
static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
{
	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
	unsigned dev = ios->per_dev[cur_comp].dev;
	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
	int ret;

	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
		struct osd_request *or = NULL;
		struct pnfs_osd_object_cred *cred =
				&ios->layout->comps[dev];
		struct osd_obj_id obj = {
			.partition = cred->oc_object_id.oid_partition_id,
			.id = cred->oc_object_id.oid_object_id,
		};
		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
		struct bio *bio;

		or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
		if (unlikely(!or)) {
			ret = -ENOMEM;
			goto err;
		}
		per_dev->or = or;

		if (per_dev != master_dev) {
			/* mirror: clone the master's bio, sharing pages */
			bio = bio_kmalloc(GFP_NOFS,
					  master_dev->bio->bi_max_vecs);
			if (unlikely(!bio)) {
				dprintk("Faild to allocate BIO size=%u\n",
					master_dev->bio->bi_max_vecs);
				ret = -ENOMEM;
				goto err;
			}

			__bio_clone(bio, master_dev->bio);
			bio->bi_bdev = NULL;	/* not submitted to a bdev */
			bio->bi_next = NULL;
			per_dev->bio = bio;
			per_dev->dev = dev;
			per_dev->length = master_dev->length;
			per_dev->offset = master_dev->offset;
		} else {
			bio = master_dev->bio;
			/* FIXME: bio_set_dir() */
			bio->bi_rw |= REQ_WRITE;
		}

		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
		if (ret) {
			dprintk("%s: Faild to osd_finalize_request() => %d\n",
				__func__, ret);
			goto err;
		}

		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
			per_dev->length);
	}

err:
	return ret;
}
  755. static ssize_t _write_exec(struct objio_state *ios)
  756. {
  757. unsigned i;
  758. int ret;
  759. for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
  760. if (!ios->per_dev[i].length)
  761. continue;
  762. ret = _write_mirrors(ios, i);
  763. if (unlikely(ret))
  764. goto err;
  765. }
  766. ios->done = _write_done;
  767. return _io_exec(ios); /* In sync mode exec returns the io->status */
  768. err:
  769. _io_free(ios);
  770. return ret;
  771. }
  772. ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
  773. {
  774. struct objio_state *ios = container_of(ol_state, struct objio_state,
  775. ol_state);
  776. int ret;
  777. /* TODO: ios->stable = stable; */
  778. ret = _io_rw_pagelist(ios, GFP_NOFS);
  779. if (unlikely(ret))
  780. return ret;
  781. return _write_exec(ios);
  782. }
/*
 * objlayout_pg_test(). Called by nfs_can_coalesce_requests()
 *
 * return 1 : coalesce page
 * return 0 : don't coalesce page
 */
int
objlayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
		  struct nfs_page *req)
{
	/* Coalesce unconditionally; the IO is re-split per stripe in
	 * _io_rw_pagelist() anyway. */
	return 1;
}
/* pNFS layout-driver operations for the OSD2 objects layout. */
static struct pnfs_layoutdriver_type objlayout_type = {
	.id = LAYOUT_OSD2_OBJECTS,
	.name = "LAYOUT_OSD2_OBJECTS",

	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
	.free_layout_hdr         = objlayout_free_layout_hdr,

	.alloc_lseg              = objlayout_alloc_lseg,
	.free_lseg               = objlayout_free_lseg,

	.read_pagelist           = objlayout_read_pagelist,
	.write_pagelist          = objlayout_write_pagelist,
	.pg_test                 = objlayout_pg_test,

	.free_deviceid_node      = objio_free_deviceid_node,
};
/* Module metadata */
MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
MODULE_LICENSE("GPL");
  810. static int __init
  811. objlayout_init(void)
  812. {
  813. int ret = pnfs_register_layoutdriver(&objlayout_type);
  814. if (ret)
  815. printk(KERN_INFO
  816. "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
  817. __func__, ret);
  818. else
  819. printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
  820. __func__);
  821. return ret;
  822. }
  823. static void __exit
  824. objlayout_exit(void)
  825. {
  826. pnfs_unregister_layoutdriver(&objlayout_type);
  827. printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
  828. __func__);
  829. }
/* Register module entry/exit points. */
module_init(objlayout_init);
module_exit(objlayout_exit);