objio_osd.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071
  1. /*
  2. * pNFS Objects layout implementation over open-osd initiator library
  3. *
  4. * Copyright (C) 2009 Panasas Inc. [year of first publication]
  5. * All rights reserved.
  6. *
  7. * Benny Halevy <bhalevy@panasas.com>
  8. * Boaz Harrosh <bharrosh@panasas.com>
  9. *
  10. * This program is free software; you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License version 2
  12. * See the file COPYING included with this distribution for more details.
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions
  16. * are met:
  17. *
  18. * 1. Redistributions of source code must retain the above copyright
  19. * notice, this list of conditions and the following disclaimer.
  20. * 2. Redistributions in binary form must reproduce the above copyright
  21. * notice, this list of conditions and the following disclaimer in the
  22. * documentation and/or other materials provided with the distribution.
  23. * 3. Neither the name of the Panasas company nor the names of its
  24. * contributors may be used to endorse or promote products derived
  25. * from this software without specific prior written permission.
  26. *
  27. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
  28. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  29. * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  30. * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  31. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  34. * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  35. * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  36. * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  37. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38. */
#include <linux/module.h>
#include <scsi/osd_initiator.h>
#include "objlayout.h"

/* Debug output from this file goes to the pNFS layout-driver facility. */
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
/* Shorthand cast for printing 64-bit values with %llx/%llu. */
#define _LLU(x) ((unsigned long long)x)

/* Largest page count usable in a bio allocated by bio_kmalloc() from
 * a single page of memory. */
enum { BIO_MAX_PAGES_KMALLOC =
	(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
};
/*
 * Per-OSD device entry: pairs an nfs4 deviceid-cache node with the
 * open-osd device handle obtained from osduld_info_lookup().
 */
struct objio_dev_ent {
	struct nfs4_deviceid_node id_node;	/* embedded deviceid-cache node */
	struct osd_dev *od;			/* open-osd initiator device */
};
  51. static void
  52. objio_free_deviceid_node(struct nfs4_deviceid_node *d)
  53. {
  54. struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
  55. dprintk("%s: free od=%p\n", __func__, de->od);
  56. osduld_put_device(de->od);
  57. kfree(de);
  58. }
  59. static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
  60. const struct nfs4_deviceid *d_id)
  61. {
  62. struct nfs4_deviceid_node *d;
  63. struct objio_dev_ent *de;
  64. d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
  65. if (!d)
  66. return NULL;
  67. de = container_of(d, struct objio_dev_ent, id_node);
  68. return de;
  69. }
/*
 * Allocate a device entry and insert it into the deviceid cache.
 * If another thread raced us and inserted an entry for the same
 * deviceid first, free ours and return the winner instead (the
 * winner's reference was taken by nfs4_insert_deviceid_node()).
 * Returns NULL on allocation failure.
 */
static struct objio_dev_ent *
_dev_list_add(const struct nfs_server *nfss,
	const struct nfs4_deviceid *d_id, struct osd_dev *od,
	gfp_t gfp_flags)
{
	struct nfs4_deviceid_node *d;
	struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
	struct objio_dev_ent *n;

	if (!de) {
		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
		return NULL;
	}

	dprintk("%s: Adding od=%p\n", __func__, od);
	nfs4_init_deviceid_node(&de->id_node,
				nfss->pnfs_curr_ld,
				nfss->nfs_client,
				d_id);
	de->od = od;

	d = nfs4_insert_deviceid_node(&de->id_node);
	n = container_of(d, struct objio_dev_ent, id_node);
	if (n != de) {
		/* Lost the insertion race: discard ours, use the winner. */
		dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
		objio_free_deviceid_node(&de->id_node);
		de = n;
	}

	return de;
}
/*
 * Backing storage for one component's capability credentials.
 * One of these per component is carved out of the tail of the
 * objio_segment allocation (see objio_alloc_lseg()).
 */
struct caps_buffers {
	u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
	u8 creds[OSD_CAP_LEN];
};
/*
 * objio-specific layout segment: the decoded per-component credentials
 * plus the striping geometry derived from the layout's data map in
 * objio_alloc_lseg().
 */
struct objio_segment {
	struct pnfs_layout_segment lseg;	/* embedded generic lseg */

	struct pnfs_osd_object_cred *comps;	/* per-component creds, points into this allocation */

	unsigned mirrors_p1;		/* mirror count + 1 */
	unsigned stripe_unit;		/* bytes per stripe unit */
	unsigned group_width;	/* Data stripe_units without integrity comps */
	u64 group_depth;
	unsigned group_count;

	unsigned max_io_size;	/* cached largest IO accepted by objio_pg_test() */

	unsigned comps_index;	/* device index of ods[0] (see _io_od()) */
	unsigned num_comps;
	/* variable length */
	struct objio_dev_ent *ods[];	/* looked-up device per component */
};
  115. static inline struct objio_segment *
  116. OBJIO_LSEG(struct pnfs_layout_segment *lseg)
  117. {
  118. return container_of(lseg, struct objio_segment, lseg);
  119. }
struct objio_state;
typedef ssize_t (*objio_done_fn)(struct objio_state *ios);

/*
 * Per-IO state.  Embeds the generic objlayout io state and carries one
 * _objio_per_comp slot per device participating in this IO.
 */
struct objio_state {
	/* Generic layer */
	struct objlayout_io_state ol_state;

	struct objio_segment *layout;	/* segment this IO runs against */

	struct kref kref;	/* one ref per in-flight request + one for the submitter */
	objio_done_fn done;	/* invoked when the last request completes */
	void *private;		/* cookie for done (completion in sync mode) */

	unsigned long length;	/* total bytes prepared for this IO */
	unsigned numdevs; /* Actually used devs in this IO */
	/* A per-device variable array of size numdevs */
	struct _objio_per_comp {
		struct bio *bio;		/* pages queued to this device */
		struct osd_request *or;		/* the OSD request, if issued */
		unsigned long length;		/* bytes for this device */
		u64 offset;			/* byte offset in the object */
		unsigned dev;			/* component/device index */
	} per_dev[];
};
  140. /* Send and wait for a get_device_info of devices in the layout,
  141. then look them up with the osd_initiator library */
  142. static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
  143. struct objio_segment *objio_seg, unsigned comp,
  144. gfp_t gfp_flags)
  145. {
  146. struct pnfs_osd_deviceaddr *deviceaddr;
  147. struct nfs4_deviceid *d_id;
  148. struct objio_dev_ent *ode;
  149. struct osd_dev *od;
  150. struct osd_dev_info odi;
  151. int err;
  152. d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
  153. ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
  154. if (ode)
  155. return ode;
  156. err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
  157. if (unlikely(err)) {
  158. dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
  159. __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
  160. return ERR_PTR(err);
  161. }
  162. odi.systemid_len = deviceaddr->oda_systemid.len;
  163. if (odi.systemid_len > sizeof(odi.systemid)) {
  164. err = -EINVAL;
  165. goto out;
  166. } else if (odi.systemid_len)
  167. memcpy(odi.systemid, deviceaddr->oda_systemid.data,
  168. odi.systemid_len);
  169. odi.osdname_len = deviceaddr->oda_osdname.len;
  170. odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
  171. if (!odi.osdname_len && !odi.systemid_len) {
  172. dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
  173. __func__);
  174. err = -ENODEV;
  175. goto out;
  176. }
  177. od = osduld_info_lookup(&odi);
  178. if (unlikely(IS_ERR(od))) {
  179. err = PTR_ERR(od);
  180. dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
  181. goto out;
  182. }
  183. ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
  184. gfp_flags);
  185. out:
  186. dprintk("%s: return=%d\n", __func__, err);
  187. objlayout_put_deviceinfo(deviceaddr);
  188. return err ? ERR_PTR(err) : ode;
  189. }
  190. static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
  191. struct objio_segment *objio_seg,
  192. gfp_t gfp_flags)
  193. {
  194. unsigned i;
  195. int err;
  196. /* lookup all devices */
  197. for (i = 0; i < objio_seg->num_comps; i++) {
  198. struct objio_dev_ent *ode;
  199. ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
  200. if (unlikely(IS_ERR(ode))) {
  201. err = PTR_ERR(ode);
  202. goto out;
  203. }
  204. objio_seg->ods[i] = ode;
  205. }
  206. err = 0;
  207. out:
  208. dprintk("%s: return=%d\n", __func__, err);
  209. return err;
  210. }
  211. static int _verify_data_map(struct pnfs_osd_layout *layout)
  212. {
  213. struct pnfs_osd_data_map *data_map = &layout->olo_map;
  214. u64 stripe_length;
  215. u32 group_width;
  216. /* FIXME: Only raid0 for now. if not go through MDS */
  217. if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
  218. printk(KERN_ERR "Only RAID_0 for now\n");
  219. return -ENOTSUPP;
  220. }
  221. if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
  222. printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
  223. data_map->odm_num_comps, data_map->odm_mirror_cnt);
  224. return -EINVAL;
  225. }
  226. if (data_map->odm_group_width)
  227. group_width = data_map->odm_group_width;
  228. else
  229. group_width = data_map->odm_num_comps /
  230. (data_map->odm_mirror_cnt + 1);
  231. stripe_length = (u64)data_map->odm_stripe_unit * group_width;
  232. if (stripe_length >= (1ULL << 32)) {
  233. printk(KERN_ERR "Total Stripe length(0x%llx)"
  234. " >= 32bit is not supported\n", _LLU(stripe_length));
  235. return -ENOTSUPP;
  236. }
  237. if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
  238. printk(KERN_ERR "Stripe Unit(0x%llx)"
  239. " must be Multples of PAGE_SIZE(0x%lx)\n",
  240. _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
  241. return -ENOTSUPP;
  242. }
  243. return 0;
  244. }
/*
 * Copy one decoded component credential, rebasing its cap-key and cap
 * pointers into the segment-owned @caps_p buffers (presumably because
 * the XDR source buffer is transient -- the segment must own the data).
 * Note: a full buffer's worth is copied regardless of cred_len.
 */
static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
			     struct pnfs_osd_object_cred *src_comp,
			     struct caps_buffers *caps_p)
{
	WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
	WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));

	*cur_comp = *src_comp;

	memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
	       sizeof(caps_p->caps_key));
	cur_comp->oc_cap_key.cred = caps_p->caps_key;

	memcpy(caps_p->creds, src_comp->oc_cap.cred,
	       sizeof(caps_p->creds));
	cur_comp->oc_cap.cred = caps_p->creds;
}
/*
 * Decode an OSD layout from @xdr and build an objio_segment for it.
 *
 * The segment, the ods[] array, the comps array and the credential
 * buffers are carved out of a single allocation:
 *	[objio_segment | ods[n] | comps[n] | caps_buffers[n]]
 *
 * On success *outp holds the embedded generic lseg; on failure *outp
 * is NULL and a negative errno is returned.
 */
int objio_alloc_lseg(struct pnfs_layout_segment **outp,
	struct pnfs_layout_hdr *pnfslay,
	struct pnfs_layout_range *range,
	struct xdr_stream *xdr,
	gfp_t gfp_flags)
{
	struct objio_segment *objio_seg;
	struct pnfs_osd_xdr_decode_layout_iter iter;
	struct pnfs_osd_layout layout;
	struct pnfs_osd_object_cred *cur_comp, src_comp;
	struct caps_buffers *caps_p;
	int err;

	err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
	if (unlikely(err))
		return err;

	err = _verify_data_map(&layout);
	if (unlikely(err))
		return err;

	/* One allocation for the segment plus all per-component arrays. */
	objio_seg = kzalloc(sizeof(*objio_seg) +
			    sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
			    sizeof(*objio_seg->comps) * layout.olo_num_comps +
			    sizeof(struct caps_buffers) * layout.olo_num_comps,
			    gfp_flags);
	if (!objio_seg)
		return -ENOMEM;

	objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
	cur_comp = objio_seg->comps;
	caps_p = (void *)(cur_comp + layout.olo_num_comps);
	/* Decode each component, pointing its creds at our caps buffers. */
	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
		copy_single_comp(cur_comp++, &src_comp, caps_p++);
	if (unlikely(err))
		goto err;

	objio_seg->num_comps = layout.olo_num_comps;
	objio_seg->comps_index = layout.olo_comps_index;
	err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
	if (err)
		goto err;

	objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
	objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
	if (layout.olo_map.odm_group_width) {
		objio_seg->group_width = layout.olo_map.odm_group_width;
		objio_seg->group_depth = layout.olo_map.odm_group_depth;
		objio_seg->group_count = layout.olo_map.odm_num_comps /
						objio_seg->mirrors_p1 /
						objio_seg->group_width;
	} else {
		/* No grouping: a single group spanning all data devices. */
		objio_seg->group_width = layout.olo_map.odm_num_comps /
						objio_seg->mirrors_p1;
		objio_seg->group_depth = -1;
		objio_seg->group_count = 1;
	}

	/* Cache this calculation it will hit for every page */
	objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
				  objio_seg->stripe_unit) *
				 objio_seg->group_width;

	*outp = &objio_seg->lseg;
	return 0;

err:
	kfree(objio_seg);
	dprintk("%s: Error: return %d\n", __func__, err);
	*outp = NULL;
	return err;
}
  322. void objio_free_lseg(struct pnfs_layout_segment *lseg)
  323. {
  324. int i;
  325. struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
  326. for (i = 0; i < objio_seg->num_comps; i++) {
  327. if (!objio_seg->ods[i])
  328. break;
  329. nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
  330. }
  331. kfree(objio_seg);
  332. }
/*
 * Allocate the per-IO state for @lseg.  The objio_state, its per_dev[]
 * flexible array and the generic layer's ioerrs[] array come from one
 * allocation; ioerrs is placed right after per_dev[num_comps].
 */
int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
			 struct objlayout_io_state **outp,
			 gfp_t gfp_flags)
{
	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
	struct objio_state *ios;
	const unsigned first_size = sizeof(*ios) +
				objio_seg->num_comps * sizeof(ios->per_dev[0]);
	const unsigned sec_size = objio_seg->num_comps *
					sizeof(ios->ol_state.ioerrs[0]);

	ios = kzalloc(first_size + sec_size, gfp_flags);
	if (unlikely(!ios))
		return -ENOMEM;

	ios->layout = objio_seg;
	/* ioerrs[] lives in the tail of this same allocation. */
	ios->ol_state.ioerrs = ((void *)ios) + first_size;
	ios->ol_state.num_comps = objio_seg->num_comps;

	*outp = &ios->ol_state;
	return 0;
}
  352. void objio_free_io_state(struct objlayout_io_state *ol_state)
  353. {
  354. struct objio_state *ios = container_of(ol_state, struct objio_state,
  355. ol_state);
  356. kfree(ios);
  357. }
/*
 * Map an osd-initiator error priority to the pNFS objects error code.
 * OSD_ERR_PRI_CLEAR_PAGES (short read past end of object) is handled
 * and recovered in _io_check() and must never reach this function.
 * Unknown priorities are reported as EIO (note the deliberate
 * default-falls-into-EIO case ordering).
 */
enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
{
	switch (oep) {
	case OSD_ERR_PRI_NO_ERROR:
		return (enum pnfs_osd_errno)0;

	case OSD_ERR_PRI_CLEAR_PAGES:
		BUG_ON(1);	/* should have been recovered in _io_check() */
		return 0;

	case OSD_ERR_PRI_RESOURCE:
		return PNFS_OSD_ERR_RESOURCE;
	case OSD_ERR_PRI_BAD_CRED:
		return PNFS_OSD_ERR_BAD_CRED;
	case OSD_ERR_PRI_NO_ACCESS:
		return PNFS_OSD_ERR_NO_ACCESS;
	case OSD_ERR_PRI_UNREACHABLE:
		return PNFS_OSD_ERR_UNREACHABLE;
	case OSD_ERR_PRI_NOT_FOUND:
		return PNFS_OSD_ERR_NOT_FOUND;
	case OSD_ERR_PRI_NO_SPACE:
		return PNFS_OSD_ERR_NO_SPACE;
	default:
		WARN_ON(1);
		/* fallthrough */
	case OSD_ERR_PRI_EIO:
		return PNFS_OSD_ERR_EIO;
	}
}
/*
 * Zero every page segment attached to @bio.  Used to clear read
 * buffers when the target reported a read that started past the end
 * of the object (OSD_ERR_PRI_CLEAR_PAGES).
 */
static void _clear_bio(struct bio *bio)
{
	struct bio_vec *bv;
	unsigned i;

	__bio_for_each_segment(bv, bio, i, 0) {
		unsigned this_count = bv->bv_len;

		if (likely(PAGE_SIZE == this_count))
			/* Whole page: use the fast highmem-safe clear. */
			clear_highpage(bv->bv_page);
		else
			zero_user(bv->bv_page, bv->bv_offset, this_count);
	}
}
/*
 * Decode the sense data of every issued request and report failures
 * to the generic objlayout layer.  A read that started past end of
 * object is recovered by zeroing its pages.  Returns 0 when the IO as
 * a whole succeeded, otherwise the error of the highest-priority
 * failure seen.
 */
static int _io_check(struct objio_state *ios, bool is_write)
{
	enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
	int lin_ret = 0;
	int i;

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_sense_info osi;
		struct osd_request *or = ios->per_dev[i].or;
		unsigned dev;
		int ret;

		if (!or)
			continue;

		ret = osd_req_decode_sense(or, &osi);
		if (likely(!ret))
			continue;

		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
			/* start read offset passed endof file */
			BUG_ON(is_write);
			_clear_bio(ios->per_dev[i].bio);
			dprintk("%s: start read offset passed end of file "
				"offset=0x%llx, length=0x%lx\n", __func__,
				_LLU(ios->per_dev[i].offset),
				ios->per_dev[i].length);

			continue; /* we recovered */
		}

		/* Record the failure for the generic objlayout layer. */
		dev = ios->per_dev[i].dev;
		objlayout_io_set_result(&ios->ol_state, dev,
					&ios->layout->comps[dev].oc_object_id,
					osd_pri_2_pnfs_err(osi.osd_err_pri),
					ios->per_dev[i].offset,
					ios->per_dev[i].length,
					is_write);

		/* Remember the error of the worst (highest prio) failure. */
		if (osi.osd_err_pri >= oep) {
			oep = osi.osd_err_pri;
			lin_ret = ret;
		}
	}

	return lin_ret;
}
  436. /*
  437. * Common IO state helpers.
  438. */
  439. static void _io_free(struct objio_state *ios)
  440. {
  441. unsigned i;
  442. for (i = 0; i < ios->numdevs; i++) {
  443. struct _objio_per_comp *per_dev = &ios->per_dev[i];
  444. if (per_dev->or) {
  445. osd_end_request(per_dev->or);
  446. per_dev->or = NULL;
  447. }
  448. if (per_dev->bio) {
  449. bio_put(per_dev->bio);
  450. per_dev->bio = NULL;
  451. }
  452. }
  453. }
/*
 * Translate a component/device index to its osd_dev handle.  The
 * layout's ods[] array starts at comps_index, so the index is rebased
 * before lookup; out-of-range indices are a programming error.
 */
struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
{
	unsigned min_dev = ios->layout->comps_index;
	unsigned max_dev = min_dev + ios->layout->num_comps;

	BUG_ON(dev < min_dev || max_dev <= dev);
	return ios->layout->ods[dev - min_dev]->od;
}
/* Striping coordinates for one file offset (see _calc_stripe_info()). */
struct _striping_info {
	u64 obj_offset;		/* byte offset within the component object */
	u64 group_length;	/* bytes remaining in the current group */
	unsigned dev;		/* starting device, scaled by mirrors_p1 */
	unsigned unit_off;	/* offset within the stripe unit */
};
/*
 * Map file byte offset @file_offset to RAID-0 striping coordinates:
 * the device the offset lands on, the byte offset inside that device's
 * component object, the offset within the stripe unit, and how many
 * bytes remain in the current group.
 *
 * Intermediate terms: U = bytes in one full stripe, T = bytes in one
 * group, S = bytes in one cycle of all groups, M = cycle number,
 * G = group within the cycle, H = byte offset within the group,
 * N = stripe number within the group.
 */
static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
			      struct _striping_info *si)
{
	u32 stripe_unit = ios->layout->stripe_unit;
	u32 group_width = ios->layout->group_width;
	u64 group_depth = ios->layout->group_depth;
	u32 U = stripe_unit * group_width;

	u64 T = U * group_depth;
	u64 S = T * ios->layout->group_count;
	u64 M = div64_u64(file_offset, S);

	/*
	  G = (L - (M * S)) / T
	  H = (L - (M * S)) % T
	*/
	u64 LmodU = file_offset - M * S;
	u32 G = div64_u64(LmodU, T);
	u64 H = LmodU - G * T;

	u32 N = div_u64(H, U);

	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
	si->obj_offset = si->unit_off + (N * stripe_unit) +
				  (M * group_depth * stripe_unit);

	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
	si->dev *= ios->layout->mirrors_p1;

	si->group_length = T - H;
}
/*
 * Append @cur_len bytes of pages -- starting at page index *cur_pg,
 * offset @pgbase within the first page -- to @per_dev's bio,
 * allocating the bio on first use.  Advances *cur_pg past the pages
 * consumed.  Returns 0 or -ENOMEM.
 */
static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
		gfp_t gfp_flags)
{
	unsigned pg = *cur_pg;
	struct request_queue *q =
			osd_request_queue(_io_od(ios, per_dev->dev));

	per_dev->length += cur_len;

	if (per_dev->bio == NULL) {
		unsigned stripes = ios->layout->num_comps /
					ios->layout->mirrors_p1;
		unsigned pages_in_stripe = stripes *
					(ios->layout->stripe_unit / PAGE_SIZE);
		/* Estimate of this device's share of the IO, rounded up
		 * by a full stripe's worth of pages. */
		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
					stripes;

		if (BIO_MAX_PAGES_KMALLOC < bio_size)
			bio_size = BIO_MAX_PAGES_KMALLOC;

		per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
		if (unlikely(!per_dev->bio)) {
			dprintk("Faild to allocate BIO size=%u\n", bio_size);
			return -ENOMEM;
		}
	}

	while (cur_len > 0) {
		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
		unsigned added_len;

		BUG_ON(ios->ol_state.nr_pages <= pg);
		cur_len -= pglen;

		added_len = bio_add_pc_page(q, per_dev->bio,
					ios->ol_state.pages[pg], pglen, pgbase);
		if (unlikely(pglen != added_len))
			/* bio full or queue limit hit */
			return -ENOMEM;
		pgbase = 0;
		++pg;
	}
	BUG_ON(cur_len);

	*cur_pg = pg;
	return 0;
}
/*
 * Lay out @length bytes of one group across the group's devices,
 * stripe unit by stripe unit, filling each touched device's per_dev
 * slot (offset, length, bio pages).  *last_pg tracks the next unused
 * page in ol_state.pages[] across calls.  ios->numdevs is raised to
 * cover the highest component touched plus its mirrors.
 */
static int _prepare_one_group(struct objio_state *ios, u64 length,
			      struct _striping_info *si, unsigned *last_pg,
			      gfp_t gfp_flags)
{
	unsigned stripe_unit = ios->layout->stripe_unit;
	unsigned mirrors_p1 = ios->layout->mirrors_p1;
	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
	unsigned dev = si->dev;
	unsigned first_dev = dev - (dev % devs_in_group);
	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
	unsigned cur_pg = *last_pg;
	int ret = 0;

	while (length) {
		struct _objio_per_comp *per_dev = &ios->per_dev[dev];
		unsigned cur_len, page_off = 0;

		if (!per_dev->length) {
			/* First unit on this device: set its object offset
			 * relative to where the IO starts (si->dev). */
			per_dev->dev = dev;
			if (dev < si->dev) {
				/* Devices before the start wrap to the next
				 * stripe. */
				per_dev->offset = si->obj_offset + stripe_unit -
								   si->unit_off;
				cur_len = stripe_unit;
			} else if (dev == si->dev) {
				per_dev->offset = si->obj_offset;
				cur_len = stripe_unit - si->unit_off;
				page_off = si->unit_off & ~PAGE_MASK;
				BUG_ON(page_off &&
				      (page_off != ios->ol_state.pgbase));
			} else { /* dev > si->dev */
				per_dev->offset = si->obj_offset - si->unit_off;
				cur_len = stripe_unit;
			}

			if (max_comp < dev)
				max_comp = dev;
		} else {
			cur_len = stripe_unit;
		}
		if (cur_len >= length)
			cur_len = length;

		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
				       cur_len, gfp_flags);
		if (unlikely(ret))
			goto out;

		/* Advance to the next data device, wrapping in the group. */
		dev += mirrors_p1;
		dev = (dev % devs_in_group) + first_dev;

		length -= cur_len;
		ios->length += cur_len;
	}
out:
	ios->numdevs = max_comp + mirrors_p1;
	*last_pg = cur_pg;
	return ret;
}
/*
 * Prepare the whole IO described by ol_state (offset/count), walking
 * it group by group.  If preparation fails after some data was
 * already queued (ios->length != 0), the error is dropped so the
 * partial IO still executes; only a completely empty preparation
 * propagates @ret.
 */
static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
{
	u64 length = ios->ol_state.count;
	u64 offset = ios->ol_state.offset;
	struct _striping_info si;
	unsigned last_pg = 0;
	int ret = 0;

	while (length) {
		_calc_stripe_info(ios, offset, &si);

		/* Clamp to the request's remaining length. */
		if (length < si.group_length)
			si.group_length = length;

		ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
		if (unlikely(ret))
			goto out;

		offset += si.group_length;
		length -= si.group_length;
	}

out:
	if (!ios->length)
		return ret;

	return 0;
}
  606. static ssize_t _sync_done(struct objio_state *ios)
  607. {
  608. struct completion *waiting = ios->private;
  609. complete(waiting);
  610. return 0;
  611. }
  612. static void _last_io(struct kref *kref)
  613. {
  614. struct objio_state *ios = container_of(kref, struct objio_state, kref);
  615. ios->done(ios);
  616. }
  617. static void _done_io(struct osd_request *or, void *p)
  618. {
  619. struct objio_state *ios = p;
  620. kref_put(&ios->kref, _last_io);
  621. }
/*
 * Submit every prepared osd request asynchronously.  The kref counts
 * one reference per submitted request plus one for the submitter;
 * ios->done() fires when the count drops to zero.  In sync mode a
 * completion-based done handler is substituted, we wait for it, then
 * call the saved handler to collect the status.
 */
static ssize_t _io_exec(struct objio_state *ios)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	ssize_t status = 0; /* sync status */
	unsigned i;
	objio_done_fn saved_done_fn = ios->done;
	bool sync = ios->ol_state.sync;

	if (sync) {
		/* Redirect completion to wake us; the real done fn runs
		 * below, after the wait. */
		ios->done = _sync_done;
		ios->private = &wait;
	}

	kref_init(&ios->kref);

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_request *or = ios->per_dev[i].or;

		if (!or)
			continue;

		/* One reference per in-flight request. */
		kref_get(&ios->kref);
		osd_execute_request_async(or, _done_io, ios);
	}

	/* Drop the submitter's reference; done() fires when all finish. */
	kref_put(&ios->kref, _last_io);

	if (sync) {
		wait_for_completion(&wait);
		status = saved_done_fn(ios);
	}

	return status;
}
  648. /*
  649. * read
  650. */
  651. static ssize_t _read_done(struct objio_state *ios)
  652. {
  653. ssize_t status;
  654. int ret = _io_check(ios, false);
  655. _io_free(ios);
  656. if (likely(!ret))
  657. status = ios->length;
  658. else
  659. status = ret;
  660. objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
  661. return status;
  662. }
/*
 * Build and finalize the OSD READ request for component @cur_comp.
 * The request itself is only submitted later by _io_exec(); on error
 * the caller cleans everything up via _io_free().
 */
static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
{
	struct osd_request *or = NULL;
	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
	unsigned dev = per_dev->dev;
	struct pnfs_osd_object_cred *cred =
			&ios->layout->comps[dev];
	struct osd_obj_id obj = {
		.partition = cred->oc_object_id.oid_partition_id,
		.id = cred->oc_object_id.oid_object_id,
	};
	int ret;

	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
	if (unlikely(!or)) {
		ret = -ENOMEM;
		goto err;
	}
	per_dev->or = or;

	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);

	/* Sign the CDB with the component's capability credential. */
	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
	if (ret) {
		dprintk("%s: Faild to osd_finalize_request() => %d\n",
			__func__, ret);
		goto err;
	}

	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
		per_dev->length);

err:
	return ret;
}
  694. static ssize_t _read_exec(struct objio_state *ios)
  695. {
  696. unsigned i;
  697. int ret;
  698. for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
  699. if (!ios->per_dev[i].length)
  700. continue;
  701. ret = _read_mirrors(ios, i);
  702. if (unlikely(ret))
  703. goto err;
  704. }
  705. ios->done = _read_done;
  706. return _io_exec(ios); /* In sync mode exec returns the io status */
  707. err:
  708. _io_free(ios);
  709. return ret;
  710. }
  711. ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
  712. {
  713. struct objio_state *ios = container_of(ol_state, struct objio_state,
  714. ol_state);
  715. int ret;
  716. ret = _io_rw_pagelist(ios, GFP_KERNEL);
  717. if (unlikely(ret))
  718. return ret;
  719. return _read_exec(ios);
  720. }
  721. /*
  722. * write
  723. */
  724. static ssize_t _write_done(struct objio_state *ios)
  725. {
  726. ssize_t status;
  727. int ret = _io_check(ios, true);
  728. _io_free(ios);
  729. if (likely(!ret)) {
  730. /* FIXME: should be based on the OSD's persistence model
  731. * See OSD2r05 Section 4.13 Data persistence model */
  732. ios->ol_state.committed = NFS_FILE_SYNC;
  733. status = ios->length;
  734. } else {
  735. status = ret;
  736. }
  737. objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
  738. return status;
  739. }
/*
 * Build WRITE requests for one mirror set.  Component @cur_comp is
 * the "master" whose bio owns the pages; each additional mirror gets
 * a clone of that bio pointing at the same pages.  Requests are only
 * submitted later by _io_exec(); on error the caller cleans up via
 * _io_free().
 */
static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
{
	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
	unsigned dev = ios->per_dev[cur_comp].dev;
	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
	int ret;

	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
		struct osd_request *or = NULL;
		struct pnfs_osd_object_cred *cred =
				&ios->layout->comps[dev];
		struct osd_obj_id obj = {
			.partition = cred->oc_object_id.oid_partition_id,
			.id = cred->oc_object_id.oid_object_id,
		};
		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
		struct bio *bio;

		or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
		if (unlikely(!or)) {
			ret = -ENOMEM;
			goto err;
		}
		per_dev->or = or;

		if (per_dev != master_dev) {
			/* Mirror: clone the master's bio so the same pages
			 * are written to this device too. */
			bio = bio_kmalloc(GFP_NOFS,
					  master_dev->bio->bi_max_vecs);
			if (unlikely(!bio)) {
				dprintk("Faild to allocate BIO size=%u\n",
					master_dev->bio->bi_max_vecs);
				ret = -ENOMEM;
				goto err;
			}

			__bio_clone(bio, master_dev->bio);
			bio->bi_bdev = NULL;
			bio->bi_next = NULL;
			per_dev->bio = bio;
			per_dev->dev = dev;
			per_dev->length = master_dev->length;
			per_dev->offset = master_dev->offset;
		} else {
			bio = master_dev->bio;
			bio->bi_rw |= REQ_WRITE;
		}

		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);

		/* Sign the CDB with the component's capability credential. */
		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
		if (ret) {
			dprintk("%s: Faild to osd_finalize_request() => %d\n",
				__func__, ret);
			goto err;
		}

		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
			per_dev->length);
	}

err:
	return ret;
}
  796. static ssize_t _write_exec(struct objio_state *ios)
  797. {
  798. unsigned i;
  799. int ret;
  800. for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
  801. if (!ios->per_dev[i].length)
  802. continue;
  803. ret = _write_mirrors(ios, i);
  804. if (unlikely(ret))
  805. goto err;
  806. }
  807. ios->done = _write_done;
  808. return _io_exec(ios); /* In sync mode exec returns the io->status */
  809. err:
  810. _io_free(ios);
  811. return ret;
  812. }
  813. ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
  814. {
  815. struct objio_state *ios = container_of(ol_state, struct objio_state,
  816. ol_state);
  817. int ret;
  818. /* TODO: ios->stable = stable; */
  819. ret = _io_rw_pagelist(ios, GFP_NOFS);
  820. if (unlikely(ret))
  821. return ret;
  822. return _write_exec(ios);
  823. }
  824. static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
  825. struct nfs_page *prev, struct nfs_page *req)
  826. {
  827. if (!pnfs_generic_pg_test(pgio, prev, req))
  828. return false;
  829. return pgio->pg_count + req->wb_bytes <=
  830. OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
  831. }
/* Page-coalescing ops for the read path. */
static const struct nfs_pageio_ops objio_pg_read_ops = {
	.pg_init = pnfs_generic_pg_init_read,
	.pg_test = objio_pg_test,
	.pg_doio = pnfs_generic_pg_readpages,
};

/* Page-coalescing ops for the write path. */
static const struct nfs_pageio_ops objio_pg_write_ops = {
	.pg_init = pnfs_generic_pg_init_write,
	.pg_test = objio_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};

/*
 * Layout-driver registration table.  The objlayout_* entry points are
 * the shared generic layer (see objlayout.h); this file supplies the
 * objio-specific hooks they call.
 */
static struct pnfs_layoutdriver_type objlayout_type = {
	.id = LAYOUT_OSD2_OBJECTS,
	.name = "LAYOUT_OSD2_OBJECTS",
	.flags = PNFS_LAYOUTRET_ON_SETATTR,

	.alloc_layout_hdr = objlayout_alloc_layout_hdr,
	.free_layout_hdr = objlayout_free_layout_hdr,

	.alloc_lseg = objlayout_alloc_lseg,
	.free_lseg = objlayout_free_lseg,

	.read_pagelist = objlayout_read_pagelist,
	.write_pagelist = objlayout_write_pagelist,
	.pg_read_ops = &objio_pg_read_ops,
	.pg_write_ops = &objio_pg_write_ops,

	.free_deviceid_node = objio_free_deviceid_node,

	.encode_layoutcommit = objlayout_encode_layoutcommit,
	.encode_layoutreturn = objlayout_encode_layoutreturn,
};
/* Module metadata. */
MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
MODULE_LICENSE("GPL");
  861. static int __init
  862. objlayout_init(void)
  863. {
  864. int ret = pnfs_register_layoutdriver(&objlayout_type);
  865. if (ret)
  866. printk(KERN_INFO
  867. "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
  868. __func__, ret);
  869. else
  870. printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
  871. __func__);
  872. return ret;
  873. }
  874. static void __exit
  875. objlayout_exit(void)
  876. {
  877. pnfs_unregister_layoutdriver(&objlayout_type);
  878. printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
  879. __func__);
  880. }
/* Auto-load this module when pNFS layouttype4 value 2 is requested. */
MODULE_ALIAS("nfs-layouttype4-2");

module_init(objlayout_init);
module_exit(objlayout_exit);