objio_osd.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057
  1. /*
  2. * pNFS Objects layout implementation over open-osd initiator library
  3. *
  4. * Copyright (C) 2009 Panasas Inc. [year of first publication]
  5. * All rights reserved.
  6. *
  7. * Benny Halevy <bhalevy@panasas.com>
  8. * Boaz Harrosh <bharrosh@panasas.com>
  9. *
  10. * This program is free software; you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License version 2
  12. * See the file COPYING included with this distribution for more details.
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. *
  18. * 1. Redistributions of source code must retain the above copyright
  19. * notice, this list of conditions and the following disclaimer.
  20. * 2. Redistributions in binary form must reproduce the above copyright
  21. * notice, this list of conditions and the following disclaimer in the
  22. * documentation and/or other materials provided with the distribution.
  23. * 3. Neither the name of the Panasas company nor the names of its
  24. * contributors may be used to endorse or promote products derived
  25. * from this software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
  28. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  29. * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  30. * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  31. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  34. * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35. * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36. * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38. */
  39. #include <linux/module.h>
  40. #include <scsi/osd_initiator.h>
  41. #include "objlayout.h"
  #define NFSDBG_FACILITY NFSDBG_PNFS_LD

  /*
   * Widen @x to unsigned long long so it can be printed with "%llx"/"%llu".
   * The argument is parenthesized so that the cast applies to the whole
   * expression and not only to its first operand.
   */
  #define _LLU(x) ((unsigned long long)(x))
  44. enum { BIO_MAX_PAGES_KMALLOC =
  45. (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
  46. };
  47. struct objio_dev_ent {
  48. struct nfs4_deviceid_node id_node;
  49. struct osd_dev *od;
  50. };
  51. static void
  52. objio_free_deviceid_node(struct nfs4_deviceid_node *d)
  53. {
  54. struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
  55. dprintk("%s: free od=%p\n", __func__, de->od);
  56. osduld_put_device(de->od);
  57. kfree(de);
  58. }
  59. static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
  60. const struct nfs4_deviceid *d_id)
  61. {
  62. struct nfs4_deviceid_node *d;
  63. struct objio_dev_ent *de;
  64. d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
  65. if (!d)
  66. return NULL;
  67. de = container_of(d, struct objio_dev_ent, id_node);
  68. return de;
  69. }
  70. static struct objio_dev_ent *
  71. _dev_list_add(const struct nfs_server *nfss,
  72. const struct nfs4_deviceid *d_id, struct osd_dev *od,
  73. gfp_t gfp_flags)
  74. {
  75. struct nfs4_deviceid_node *d;
  76. struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
  77. struct objio_dev_ent *n;
  78. if (!de) {
  79. dprintk("%s: -ENOMEM od=%p\n", __func__, od);
  80. return NULL;
  81. }
  82. dprintk("%s: Adding od=%p\n", __func__, od);
  83. nfs4_init_deviceid_node(&de->id_node,
  84. nfss->pnfs_curr_ld,
  85. nfss->nfs_client,
  86. d_id);
  87. de->od = od;
  88. d = nfs4_insert_deviceid_node(&de->id_node);
  89. n = container_of(d, struct objio_dev_ent, id_node);
  90. if (n != de) {
  91. dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
  92. objio_free_deviceid_node(&de->id_node);
  93. de = n;
  94. }
  95. atomic_inc(&de->id_node.ref);
  96. return de;
  97. }
  98. struct caps_buffers {
  99. u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
  100. u8 creds[OSD_CAP_LEN];
  101. };
  102. struct objio_segment {
  103. struct pnfs_layout_segment lseg;
  104. struct pnfs_osd_object_cred *comps;
  105. unsigned mirrors_p1;
  106. unsigned stripe_unit;
  107. unsigned group_width; /* Data stripe_units without integrity comps */
  108. u64 group_depth;
  109. unsigned group_count;
  110. unsigned max_io_size;
  111. unsigned comps_index;
  112. unsigned num_comps;
  113. /* variable length */
  114. struct objio_dev_ent *ods[];
  115. };
  116. static inline struct objio_segment *
  117. OBJIO_LSEG(struct pnfs_layout_segment *lseg)
  118. {
  119. return container_of(lseg, struct objio_segment, lseg);
  120. }
  121. struct objio_state;
  122. typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
  123. struct objio_state {
  124. /* Generic layer */
  125. struct objlayout_io_state ol_state;
  126. struct objio_segment *layout;
  127. struct kref kref;
  128. objio_done_fn done;
  129. void *private;
  130. unsigned long length;
  131. unsigned numdevs; /* Actually used devs in this IO */
  132. /* A per-device variable array of size numdevs */
  133. struct _objio_per_comp {
  134. struct bio *bio;
  135. struct osd_request *or;
  136. unsigned long length;
  137. u64 offset;
  138. unsigned dev;
  139. } per_dev[];
  140. };
  141. /* Send and wait for a get_device_info of devices in the layout,
  142. then look them up with the osd_initiator library */
  143. static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
  144. struct objio_segment *objio_seg, unsigned comp,
  145. gfp_t gfp_flags)
  146. {
  147. struct pnfs_osd_deviceaddr *deviceaddr;
  148. struct nfs4_deviceid *d_id;
  149. struct objio_dev_ent *ode;
  150. struct osd_dev *od;
  151. struct osd_dev_info odi;
  152. int err;
  153. d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
  154. ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
  155. if (ode)
  156. return ode;
  157. err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
  158. if (unlikely(err)) {
  159. dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
  160. __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
  161. return ERR_PTR(err);
  162. }
  163. odi.systemid_len = deviceaddr->oda_systemid.len;
  164. if (odi.systemid_len > sizeof(odi.systemid)) {
  165. err = -EINVAL;
  166. goto out;
  167. } else if (odi.systemid_len)
  168. memcpy(odi.systemid, deviceaddr->oda_systemid.data,
  169. odi.systemid_len);
  170. odi.osdname_len = deviceaddr->oda_osdname.len;
  171. odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
  172. if (!odi.osdname_len && !odi.systemid_len) {
  173. dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
  174. __func__);
  175. err = -ENODEV;
  176. goto out;
  177. }
  178. od = osduld_info_lookup(&odi);
  179. if (unlikely(IS_ERR(od))) {
  180. err = PTR_ERR(od);
  181. dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
  182. goto out;
  183. }
  184. ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
  185. gfp_flags);
  186. out:
  187. dprintk("%s: return=%d\n", __func__, err);
  188. objlayout_put_deviceinfo(deviceaddr);
  189. return err ? ERR_PTR(err) : ode;
  190. }
  191. static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
  192. struct objio_segment *objio_seg,
  193. gfp_t gfp_flags)
  194. {
  195. unsigned i;
  196. int err;
  197. /* lookup all devices */
  198. for (i = 0; i < objio_seg->num_comps; i++) {
  199. struct objio_dev_ent *ode;
  200. ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
  201. if (unlikely(IS_ERR(ode))) {
  202. err = PTR_ERR(ode);
  203. goto out;
  204. }
  205. objio_seg->ods[i] = ode;
  206. }
  207. err = 0;
  208. out:
  209. dprintk("%s: return=%d\n", __func__, err);
  210. return err;
  211. }
  212. static int _verify_data_map(struct pnfs_osd_layout *layout)
  213. {
  214. struct pnfs_osd_data_map *data_map = &layout->olo_map;
  215. u64 stripe_length;
  216. u32 group_width;
  217. /* FIXME: Only raid0 for now. if not go through MDS */
  218. if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
  219. printk(KERN_ERR "Only RAID_0 for now\n");
  220. return -ENOTSUPP;
  221. }
  222. if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
  223. printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
  224. data_map->odm_num_comps, data_map->odm_mirror_cnt);
  225. return -EINVAL;
  226. }
  227. if (data_map->odm_group_width)
  228. group_width = data_map->odm_group_width;
  229. else
  230. group_width = data_map->odm_num_comps /
  231. (data_map->odm_mirror_cnt + 1);
  232. stripe_length = (u64)data_map->odm_stripe_unit * group_width;
  233. if (stripe_length >= (1ULL << 32)) {
  234. printk(KERN_ERR "Total Stripe length(0x%llx)"
  235. " >= 32bit is not supported\n", _LLU(stripe_length));
  236. return -ENOTSUPP;
  237. }
  238. if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
  239. printk(KERN_ERR "Stripe Unit(0x%llx)"
  240. " must be Multples of PAGE_SIZE(0x%lx)\n",
  241. _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
  242. return -ENOTSUPP;
  243. }
  244. return 0;
  245. }
  246. static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
  247. struct pnfs_osd_object_cred *src_comp,
  248. struct caps_buffers *caps_p)
  249. {
  250. WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
  251. WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
  252. *cur_comp = *src_comp;
  253. memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
  254. sizeof(caps_p->caps_key));
  255. cur_comp->oc_cap_key.cred = caps_p->caps_key;
  256. memcpy(caps_p->creds, src_comp->oc_cap.cred,
  257. sizeof(caps_p->creds));
  258. cur_comp->oc_cap.cred = caps_p->creds;
  259. }
  260. int objio_alloc_lseg(struct pnfs_layout_segment **outp,
  261. struct pnfs_layout_hdr *pnfslay,
  262. struct pnfs_layout_range *range,
  263. struct xdr_stream *xdr,
  264. gfp_t gfp_flags)
  265. {
  266. struct objio_segment *objio_seg;
  267. struct pnfs_osd_xdr_decode_layout_iter iter;
  268. struct pnfs_osd_layout layout;
  269. struct pnfs_osd_object_cred *cur_comp, src_comp;
  270. struct caps_buffers *caps_p;
  271. int err;
  272. err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
  273. if (unlikely(err))
  274. return err;
  275. err = _verify_data_map(&layout);
  276. if (unlikely(err))
  277. return err;
  278. objio_seg = kzalloc(sizeof(*objio_seg) +
  279. sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
  280. sizeof(*objio_seg->comps) * layout.olo_num_comps +
  281. sizeof(struct caps_buffers) * layout.olo_num_comps,
  282. gfp_flags);
  283. if (!objio_seg)
  284. return -ENOMEM;
  285. objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
  286. cur_comp = objio_seg->comps;
  287. caps_p = (void *)(cur_comp + layout.olo_num_comps);
  288. while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
  289. copy_single_comp(cur_comp++, &src_comp, caps_p++);
  290. if (unlikely(err))
  291. goto err;
  292. objio_seg->num_comps = layout.olo_num_comps;
  293. objio_seg->comps_index = layout.olo_comps_index;
  294. err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
  295. if (err)
  296. goto err;
  297. objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
  298. objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
  299. if (layout.olo_map.odm_group_width) {
  300. objio_seg->group_width = layout.olo_map.odm_group_width;
  301. objio_seg->group_depth = layout.olo_map.odm_group_depth;
  302. objio_seg->group_count = layout.olo_map.odm_num_comps /
  303. objio_seg->mirrors_p1 /
  304. objio_seg->group_width;
  305. } else {
  306. objio_seg->group_width = layout.olo_map.odm_num_comps /
  307. objio_seg->mirrors_p1;
  308. objio_seg->group_depth = -1;
  309. objio_seg->group_count = 1;
  310. }
  311. /* Cache this calculation it will hit for every page */
  312. objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
  313. objio_seg->stripe_unit) *
  314. objio_seg->group_width;
  315. *outp = &objio_seg->lseg;
  316. return 0;
  317. err:
  318. kfree(objio_seg);
  319. dprintk("%s: Error: return %d\n", __func__, err);
  320. *outp = NULL;
  321. return err;
  322. }
  323. void objio_free_lseg(struct pnfs_layout_segment *lseg)
  324. {
  325. int i;
  326. struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
  327. for (i = 0; i < objio_seg->num_comps; i++) {
  328. if (!objio_seg->ods[i])
  329. break;
  330. nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
  331. }
  332. kfree(objio_seg);
  333. }
  334. int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
  335. struct objlayout_io_state **outp,
  336. gfp_t gfp_flags)
  337. {
  338. struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
  339. struct objio_state *ios;
  340. const unsigned first_size = sizeof(*ios) +
  341. objio_seg->num_comps * sizeof(ios->per_dev[0]);
  342. const unsigned sec_size = objio_seg->num_comps *
  343. sizeof(ios->ol_state.ioerrs[0]);
  344. ios = kzalloc(first_size + sec_size, gfp_flags);
  345. if (unlikely(!ios))
  346. return -ENOMEM;
  347. ios->layout = objio_seg;
  348. ios->ol_state.ioerrs = ((void *)ios) + first_size;
  349. ios->ol_state.num_comps = objio_seg->num_comps;
  350. *outp = &ios->ol_state;
  351. return 0;
  352. }
  353. void objio_free_io_state(struct objlayout_io_state *ol_state)
  354. {
  355. struct objio_state *ios = container_of(ol_state, struct objio_state,
  356. ol_state);
  357. kfree(ios);
  358. }
  359. enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
  360. {
  361. switch (oep) {
  362. case OSD_ERR_PRI_NO_ERROR:
  363. return (enum pnfs_osd_errno)0;
  364. case OSD_ERR_PRI_CLEAR_PAGES:
  365. BUG_ON(1);
  366. return 0;
  367. case OSD_ERR_PRI_RESOURCE:
  368. return PNFS_OSD_ERR_RESOURCE;
  369. case OSD_ERR_PRI_BAD_CRED:
  370. return PNFS_OSD_ERR_BAD_CRED;
  371. case OSD_ERR_PRI_NO_ACCESS:
  372. return PNFS_OSD_ERR_NO_ACCESS;
  373. case OSD_ERR_PRI_UNREACHABLE:
  374. return PNFS_OSD_ERR_UNREACHABLE;
  375. case OSD_ERR_PRI_NOT_FOUND:
  376. return PNFS_OSD_ERR_NOT_FOUND;
  377. case OSD_ERR_PRI_NO_SPACE:
  378. return PNFS_OSD_ERR_NO_SPACE;
  379. default:
  380. WARN_ON(1);
  381. /* fallthrough */
  382. case OSD_ERR_PRI_EIO:
  383. return PNFS_OSD_ERR_EIO;
  384. }
  385. }
  386. static void _clear_bio(struct bio *bio)
  387. {
  388. struct bio_vec *bv;
  389. unsigned i;
  390. __bio_for_each_segment(bv, bio, i, 0) {
  391. unsigned this_count = bv->bv_len;
  392. if (likely(PAGE_SIZE == this_count))
  393. clear_highpage(bv->bv_page);
  394. else
  395. zero_user(bv->bv_page, bv->bv_offset, this_count);
  396. }
  397. }
  398. static int _io_check(struct objio_state *ios, bool is_write)
  399. {
  400. enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
  401. int lin_ret = 0;
  402. int i;
  403. for (i = 0; i < ios->numdevs; i++) {
  404. struct osd_sense_info osi;
  405. struct osd_request *or = ios->per_dev[i].or;
  406. unsigned dev;
  407. int ret;
  408. if (!or)
  409. continue;
  410. ret = osd_req_decode_sense(or, &osi);
  411. if (likely(!ret))
  412. continue;
  413. if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
  414. /* start read offset passed endof file */
  415. BUG_ON(is_write);
  416. _clear_bio(ios->per_dev[i].bio);
  417. dprintk("%s: start read offset passed end of file "
  418. "offset=0x%llx, length=0x%lx\n", __func__,
  419. _LLU(ios->per_dev[i].offset),
  420. ios->per_dev[i].length);
  421. continue; /* we recovered */
  422. }
  423. dev = ios->per_dev[i].dev;
  424. objlayout_io_set_result(&ios->ol_state, dev,
  425. &ios->layout->comps[dev].oc_object_id,
  426. osd_pri_2_pnfs_err(osi.osd_err_pri),
  427. ios->per_dev[i].offset,
  428. ios->per_dev[i].length,
  429. is_write);
  430. if (osi.osd_err_pri >= oep) {
  431. oep = osi.osd_err_pri;
  432. lin_ret = ret;
  433. }
  434. }
  435. return lin_ret;
  436. }
  437. /*
  438. * Common IO state helpers.
  439. */
  440. static void _io_free(struct objio_state *ios)
  441. {
  442. unsigned i;
  443. for (i = 0; i < ios->numdevs; i++) {
  444. struct _objio_per_comp *per_dev = &ios->per_dev[i];
  445. if (per_dev->or) {
  446. osd_end_request(per_dev->or);
  447. per_dev->or = NULL;
  448. }
  449. if (per_dev->bio) {
  450. bio_put(per_dev->bio);
  451. per_dev->bio = NULL;
  452. }
  453. }
  454. }
  455. struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
  456. {
  457. unsigned min_dev = ios->layout->comps_index;
  458. unsigned max_dev = min_dev + ios->layout->num_comps;
  459. BUG_ON(dev < min_dev || max_dev <= dev);
  460. return ios->layout->ods[dev - min_dev]->od;
  461. }
  462. struct _striping_info {
  463. u64 obj_offset;
  464. u64 group_length;
  465. unsigned dev;
  466. unsigned unit_off;
  467. };
  468. static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
  469. struct _striping_info *si)
  470. {
  471. u32 stripe_unit = ios->layout->stripe_unit;
  472. u32 group_width = ios->layout->group_width;
  473. u64 group_depth = ios->layout->group_depth;
  474. u32 U = stripe_unit * group_width;
  475. u64 T = U * group_depth;
  476. u64 S = T * ios->layout->group_count;
  477. u64 M = div64_u64(file_offset, S);
  478. /*
  479. G = (L - (M * S)) / T
  480. H = (L - (M * S)) % T
  481. */
  482. u64 LmodU = file_offset - M * S;
  483. u32 G = div64_u64(LmodU, T);
  484. u64 H = LmodU - G * T;
  485. u32 N = div_u64(H, U);
  486. div_u64_rem(file_offset, stripe_unit, &si->unit_off);
  487. si->obj_offset = si->unit_off + (N * stripe_unit) +
  488. (M * group_depth * stripe_unit);
  489. /* "H - (N * U)" is just "H % U" so it's bound to u32 */
  490. si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
  491. si->dev *= ios->layout->mirrors_p1;
  492. si->group_length = T - H;
  493. }
  494. static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
  495. unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
  496. gfp_t gfp_flags)
  497. {
  498. unsigned pg = *cur_pg;
  499. struct request_queue *q =
  500. osd_request_queue(_io_od(ios, per_dev->dev));
  501. per_dev->length += cur_len;
  502. if (per_dev->bio == NULL) {
  503. unsigned stripes = ios->layout->num_comps /
  504. ios->layout->mirrors_p1;
  505. unsigned pages_in_stripe = stripes *
  506. (ios->layout->stripe_unit / PAGE_SIZE);
  507. unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
  508. stripes;
  509. if (BIO_MAX_PAGES_KMALLOC < bio_size)
  510. bio_size = BIO_MAX_PAGES_KMALLOC;
  511. per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
  512. if (unlikely(!per_dev->bio)) {
  513. dprintk("Faild to allocate BIO size=%u\n", bio_size);
  514. return -ENOMEM;
  515. }
  516. }
  517. while (cur_len > 0) {
  518. unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
  519. unsigned added_len;
  520. BUG_ON(ios->ol_state.nr_pages <= pg);
  521. cur_len -= pglen;
  522. added_len = bio_add_pc_page(q, per_dev->bio,
  523. ios->ol_state.pages[pg], pglen, pgbase);
  524. if (unlikely(pglen != added_len))
  525. return -ENOMEM;
  526. pgbase = 0;
  527. ++pg;
  528. }
  529. BUG_ON(cur_len);
  530. *cur_pg = pg;
  531. return 0;
  532. }
  533. static int _prepare_one_group(struct objio_state *ios, u64 length,
  534. struct _striping_info *si, unsigned *last_pg,
  535. gfp_t gfp_flags)
  536. {
  537. unsigned stripe_unit = ios->layout->stripe_unit;
  538. unsigned mirrors_p1 = ios->layout->mirrors_p1;
  539. unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
  540. unsigned dev = si->dev;
  541. unsigned first_dev = dev - (dev % devs_in_group);
  542. unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
  543. unsigned cur_pg = *last_pg;
  544. int ret = 0;
  545. while (length) {
  546. struct _objio_per_comp *per_dev = &ios->per_dev[dev];
  547. unsigned cur_len, page_off = 0;
  548. if (!per_dev->length) {
  549. per_dev->dev = dev;
  550. if (dev < si->dev) {
  551. per_dev->offset = si->obj_offset + stripe_unit -
  552. si->unit_off;
  553. cur_len = stripe_unit;
  554. } else if (dev == si->dev) {
  555. per_dev->offset = si->obj_offset;
  556. cur_len = stripe_unit - si->unit_off;
  557. page_off = si->unit_off & ~PAGE_MASK;
  558. BUG_ON(page_off &&
  559. (page_off != ios->ol_state.pgbase));
  560. } else { /* dev > si->dev */
  561. per_dev->offset = si->obj_offset - si->unit_off;
  562. cur_len = stripe_unit;
  563. }
  564. if (max_comp < dev)
  565. max_comp = dev;
  566. } else {
  567. cur_len = stripe_unit;
  568. }
  569. if (cur_len >= length)
  570. cur_len = length;
  571. ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
  572. cur_len, gfp_flags);
  573. if (unlikely(ret))
  574. goto out;
  575. dev += mirrors_p1;
  576. dev = (dev % devs_in_group) + first_dev;
  577. length -= cur_len;
  578. ios->length += cur_len;
  579. }
  580. out:
  581. ios->numdevs = max_comp + mirrors_p1;
  582. *last_pg = cur_pg;
  583. return ret;
  584. }
  585. static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
  586. {
  587. u64 length = ios->ol_state.count;
  588. u64 offset = ios->ol_state.offset;
  589. struct _striping_info si;
  590. unsigned last_pg = 0;
  591. int ret = 0;
  592. while (length) {
  593. _calc_stripe_info(ios, offset, &si);
  594. if (length < si.group_length)
  595. si.group_length = length;
  596. ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
  597. if (unlikely(ret))
  598. goto out;
  599. offset += si.group_length;
  600. length -= si.group_length;
  601. }
  602. out:
  603. if (!ios->length)
  604. return ret;
  605. return 0;
  606. }
  607. static ssize_t _sync_done(struct objio_state *ios)
  608. {
  609. struct completion *waiting = ios->private;
  610. complete(waiting);
  611. return 0;
  612. }
  613. static void _last_io(struct kref *kref)
  614. {
  615. struct objio_state *ios = container_of(kref, struct objio_state, kref);
  616. ios->done(ios);
  617. }
  618. static void _done_io(struct osd_request *or, void *p)
  619. {
  620. struct objio_state *ios = p;
  621. kref_put(&ios->kref, _last_io);
  622. }
  623. static ssize_t _io_exec(struct objio_state *ios)
  624. {
  625. DECLARE_COMPLETION_ONSTACK(wait);
  626. ssize_t status = 0; /* sync status */
  627. unsigned i;
  628. objio_done_fn saved_done_fn = ios->done;
  629. bool sync = ios->ol_state.sync;
  630. if (sync) {
  631. ios->done = _sync_done;
  632. ios->private = &wait;
  633. }
  634. kref_init(&ios->kref);
  635. for (i = 0; i < ios->numdevs; i++) {
  636. struct osd_request *or = ios->per_dev[i].or;
  637. if (!or)
  638. continue;
  639. kref_get(&ios->kref);
  640. osd_execute_request_async(or, _done_io, ios);
  641. }
  642. kref_put(&ios->kref, _last_io);
  643. if (sync) {
  644. wait_for_completion(&wait);
  645. status = saved_done_fn(ios);
  646. }
  647. return status;
  648. }
  649. /*
  650. * read
  651. */
  652. static ssize_t _read_done(struct objio_state *ios)
  653. {
  654. ssize_t status;
  655. int ret = _io_check(ios, false);
  656. _io_free(ios);
  657. if (likely(!ret))
  658. status = ios->length;
  659. else
  660. status = ret;
  661. objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
  662. return status;
  663. }
  664. static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
  665. {
  666. struct osd_request *or = NULL;
  667. struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
  668. unsigned dev = per_dev->dev;
  669. struct pnfs_osd_object_cred *cred =
  670. &ios->layout->comps[dev];
  671. struct osd_obj_id obj = {
  672. .partition = cred->oc_object_id.oid_partition_id,
  673. .id = cred->oc_object_id.oid_object_id,
  674. };
  675. int ret;
  676. or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
  677. if (unlikely(!or)) {
  678. ret = -ENOMEM;
  679. goto err;
  680. }
  681. per_dev->or = or;
  682. osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
  683. ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
  684. if (ret) {
  685. dprintk("%s: Faild to osd_finalize_request() => %d\n",
  686. __func__, ret);
  687. goto err;
  688. }
  689. dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
  690. __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
  691. per_dev->length);
  692. err:
  693. return ret;
  694. }
  695. static ssize_t _read_exec(struct objio_state *ios)
  696. {
  697. unsigned i;
  698. int ret;
  699. for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
  700. if (!ios->per_dev[i].length)
  701. continue;
  702. ret = _read_mirrors(ios, i);
  703. if (unlikely(ret))
  704. goto err;
  705. }
  706. ios->done = _read_done;
  707. return _io_exec(ios); /* In sync mode exec returns the io status */
  708. err:
  709. _io_free(ios);
  710. return ret;
  711. }
  712. ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
  713. {
  714. struct objio_state *ios = container_of(ol_state, struct objio_state,
  715. ol_state);
  716. int ret;
  717. ret = _io_rw_pagelist(ios, GFP_KERNEL);
  718. if (unlikely(ret))
  719. return ret;
  720. return _read_exec(ios);
  721. }
  722. /*
  723. * write
  724. */
  725. static ssize_t _write_done(struct objio_state *ios)
  726. {
  727. ssize_t status;
  728. int ret = _io_check(ios, true);
  729. _io_free(ios);
  730. if (likely(!ret)) {
  731. /* FIXME: should be based on the OSD's persistence model
  732. * See OSD2r05 Section 4.13 Data persistence model */
  733. ios->ol_state.committed = NFS_FILE_SYNC;
  734. status = ios->length;
  735. } else {
  736. status = ret;
  737. }
  738. objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
  739. return status;
  740. }
  741. static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
  742. {
  743. struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
  744. unsigned dev = ios->per_dev[cur_comp].dev;
  745. unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
  746. int ret;
  747. for (; cur_comp < last_comp; ++cur_comp, ++dev) {
  748. struct osd_request *or = NULL;
  749. struct pnfs_osd_object_cred *cred =
  750. &ios->layout->comps[dev];
  751. struct osd_obj_id obj = {
  752. .partition = cred->oc_object_id.oid_partition_id,
  753. .id = cred->oc_object_id.oid_object_id,
  754. };
  755. struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
  756. struct bio *bio;
  757. or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
  758. if (unlikely(!or)) {
  759. ret = -ENOMEM;
  760. goto err;
  761. }
  762. per_dev->or = or;
  763. if (per_dev != master_dev) {
  764. bio = bio_kmalloc(GFP_NOFS,
  765. master_dev->bio->bi_max_vecs);
  766. if (unlikely(!bio)) {
  767. dprintk("Faild to allocate BIO size=%u\n",
  768. master_dev->bio->bi_max_vecs);
  769. ret = -ENOMEM;
  770. goto err;
  771. }
  772. __bio_clone(bio, master_dev->bio);
  773. bio->bi_bdev = NULL;
  774. bio->bi_next = NULL;
  775. per_dev->bio = bio;
  776. per_dev->dev = dev;
  777. per_dev->length = master_dev->length;
  778. per_dev->offset = master_dev->offset;
  779. } else {
  780. bio = master_dev->bio;
  781. bio->bi_rw |= REQ_WRITE;
  782. }
  783. osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
  784. ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
  785. if (ret) {
  786. dprintk("%s: Faild to osd_finalize_request() => %d\n",
  787. __func__, ret);
  788. goto err;
  789. }
  790. dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
  791. __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
  792. per_dev->length);
  793. }
  794. err:
  795. return ret;
  796. }
  797. static ssize_t _write_exec(struct objio_state *ios)
  798. {
  799. unsigned i;
  800. int ret;
  801. for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
  802. if (!ios->per_dev[i].length)
  803. continue;
  804. ret = _write_mirrors(ios, i);
  805. if (unlikely(ret))
  806. goto err;
  807. }
  808. ios->done = _write_done;
  809. return _io_exec(ios); /* In sync mode exec returns the io->status */
  810. err:
  811. _io_free(ios);
  812. return ret;
  813. }
  814. ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
  815. {
  816. struct objio_state *ios = container_of(ol_state, struct objio_state,
  817. ol_state);
  818. int ret;
  819. /* TODO: ios->stable = stable; */
  820. ret = _io_rw_pagelist(ios, GFP_NOFS);
  821. if (unlikely(ret))
  822. return ret;
  823. return _write_exec(ios);
  824. }
  825. static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
  826. struct nfs_page *prev, struct nfs_page *req)
  827. {
  828. if (!pnfs_generic_pg_test(pgio, prev, req))
  829. return false;
  830. return pgio->pg_count + req->wb_bytes <=
  831. OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
  832. }
  833. static struct pnfs_layoutdriver_type objlayout_type = {
  834. .id = LAYOUT_OSD2_OBJECTS,
  835. .name = "LAYOUT_OSD2_OBJECTS",
  836. .flags = PNFS_LAYOUTRET_ON_SETATTR,
  837. .alloc_layout_hdr = objlayout_alloc_layout_hdr,
  838. .free_layout_hdr = objlayout_free_layout_hdr,
  839. .alloc_lseg = objlayout_alloc_lseg,
  840. .free_lseg = objlayout_free_lseg,
  841. .read_pagelist = objlayout_read_pagelist,
  842. .write_pagelist = objlayout_write_pagelist,
  843. .pg_test = objio_pg_test,
  844. .free_deviceid_node = objio_free_deviceid_node,
  845. .encode_layoutcommit = objlayout_encode_layoutcommit,
  846. .encode_layoutreturn = objlayout_encode_layoutreturn,
  847. };
  848. MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
  849. MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
  850. MODULE_LICENSE("GPL");
  851. static int __init
  852. objlayout_init(void)
  853. {
  854. int ret = pnfs_register_layoutdriver(&objlayout_type);
  855. if (ret)
  856. printk(KERN_INFO
  857. "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
  858. __func__, ret);
  859. else
  860. printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
  861. __func__);
  862. return ret;
  863. }
  864. static void __exit
  865. objlayout_exit(void)
  866. {
  867. pnfs_unregister_layoutdriver(&objlayout_type);
  868. printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
  869. __func__);
  870. }
  871. module_init(objlayout_init);
  872. module_exit(objlayout_exit);