/* super.c */
  1. #include "ceph_debug.h"
  2. #include <linux/backing-dev.h>
  3. #include <linux/ctype.h>
  4. #include <linux/fs.h>
  5. #include <linux/inet.h>
  6. #include <linux/in6.h>
  7. #include <linux/module.h>
  8. #include <linux/mount.h>
  9. #include <linux/parser.h>
  10. #include <linux/sched.h>
  11. #include <linux/seq_file.h>
  12. #include <linux/slab.h>
  13. #include <linux/statfs.h>
  14. #include <linux/string.h>
  15. #include "decode.h"
  16. #include "super.h"
  17. #include "mon_client.h"
  18. #include "auth.h"
  19. /*
  20. * Ceph superblock operations
  21. *
  22. * Handle the basics of mounting, unmounting.
  23. */
  /*
   * ceph_file_part - find the filename portion of a path
   * (/foo/bar/baz -> baz)
   *
   * @s:   start of the path buffer
   * @len: number of bytes of @s to consider
   *
   * Scans backwards from s + len and returns a pointer to the character
   * that follows the last '/' in that range, or @s itself when the range
   * contains no '/'.
   */
  const char *ceph_file_part(const char *s, int len)
  {
      const char *cur;

      for (cur = s + len; cur != s; cur--) {
          if (cur[-1] == '/')
              break;
      }
      return cur;
  }
  34. /*
  35. * super ops
  36. */
  37. static void ceph_put_super(struct super_block *s)
  38. {
  39. struct ceph_client *client = ceph_sb_to_client(s);
  40. dout("put_super\n");
  41. ceph_mdsc_close_sessions(&client->mdsc);
  42. /*
  43. * ensure we release the bdi before put_anon_super releases
  44. * the device name.
  45. */
  46. if (s->s_bdi == &client->backing_dev_info) {
  47. bdi_unregister(&client->backing_dev_info);
  48. s->s_bdi = NULL;
  49. }
  50. return;
  51. }
  52. static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  53. {
  54. struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
  55. struct ceph_monmap *monmap = client->monc.monmap;
  56. struct ceph_statfs st;
  57. u64 fsid;
  58. int err;
  59. dout("statfs\n");
  60. err = ceph_monc_do_statfs(&client->monc, &st);
  61. if (err < 0)
  62. return err;
  63. /* fill in kstatfs */
  64. buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
  65. /*
  66. * express utilization in terms of large blocks to avoid
  67. * overflow on 32-bit machines.
  68. */
  69. buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  70. buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
  71. buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
  72. (CEPH_BLOCK_SHIFT-10);
  73. buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  74. buf->f_files = le64_to_cpu(st.num_objects);
  75. buf->f_ffree = -1;
  76. buf->f_namelen = NAME_MAX;
  77. buf->f_frsize = PAGE_CACHE_SIZE;
  78. /* leave fsid little-endian, regardless of host endianness */
  79. fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
  80. buf->f_fsid.val[0] = fsid & 0xffffffff;
  81. buf->f_fsid.val[1] = fsid >> 32;
  82. return 0;
  83. }
  84. static int ceph_sync_fs(struct super_block *sb, int wait)
  85. {
  86. struct ceph_client *client = ceph_sb_to_client(sb);
  87. if (!wait) {
  88. dout("sync_fs (non-blocking)\n");
  89. ceph_flush_dirty_caps(&client->mdsc);
  90. dout("sync_fs (non-blocking) done\n");
  91. return 0;
  92. }
  93. dout("sync_fs (blocking)\n");
  94. ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
  95. ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
  96. dout("sync_fs (blocking) done\n");
  97. return 0;
  98. }
  99. static int default_congestion_kb(void)
  100. {
  101. int congestion_kb;
  102. /*
  103. * Copied from NFS
  104. *
  105. * congestion size, scale with available memory.
  106. *
  107. * 64MB: 8192k
  108. * 128MB: 11585k
  109. * 256MB: 16384k
  110. * 512MB: 23170k
  111. * 1GB: 32768k
  112. * 2GB: 46340k
  113. * 4GB: 65536k
  114. * 8GB: 92681k
  115. * 16GB: 131072k
  116. *
  117. * This allows larger machines to have larger/more transfers.
  118. * Limit the default to 256M
  119. */
  120. congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
  121. if (congestion_kb > 256*1024)
  122. congestion_kb = 256*1024;
  123. return congestion_kb;
  124. }
  125. /**
  126. * ceph_show_options - Show mount options in /proc/mounts
  127. * @m: seq_file to write to
  128. * @mnt: mount descriptor
  129. */
  130. static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
  131. {
  132. struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
  133. struct ceph_mount_args *args = client->mount_args;
  134. if (args->flags & CEPH_OPT_FSID)
  135. seq_printf(m, ",fsid=%pU", &args->fsid);
  136. if (args->flags & CEPH_OPT_NOSHARE)
  137. seq_puts(m, ",noshare");
  138. if (args->flags & CEPH_OPT_DIRSTAT)
  139. seq_puts(m, ",dirstat");
  140. if ((args->flags & CEPH_OPT_RBYTES) == 0)
  141. seq_puts(m, ",norbytes");
  142. if (args->flags & CEPH_OPT_NOCRC)
  143. seq_puts(m, ",nocrc");
  144. if (args->flags & CEPH_OPT_NOASYNCREADDIR)
  145. seq_puts(m, ",noasyncreaddir");
  146. if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
  147. seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
  148. if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
  149. seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
  150. if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
  151. seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
  152. if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
  153. seq_printf(m, ",osdkeepalivetimeout=%d",
  154. args->osd_keepalive_timeout);
  155. if (args->wsize)
  156. seq_printf(m, ",wsize=%d", args->wsize);
  157. if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
  158. seq_printf(m, ",rsize=%d", args->rsize);
  159. if (args->congestion_kb != default_congestion_kb())
  160. seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
  161. if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
  162. seq_printf(m, ",caps_wanted_delay_min=%d",
  163. args->caps_wanted_delay_min);
  164. if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
  165. seq_printf(m, ",caps_wanted_delay_max=%d",
  166. args->caps_wanted_delay_max);
  167. if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
  168. seq_printf(m, ",cap_release_safety=%d",
  169. args->cap_release_safety);
  170. if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
  171. seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
  172. if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
  173. seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
  174. if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
  175. seq_printf(m, ",snapdirname=%s", args->snapdir_name);
  176. if (args->name)
  177. seq_printf(m, ",name=%s", args->name);
  178. if (args->secret)
  179. seq_puts(m, ",secret=<hidden>");
  180. return 0;
  181. }
  /*
   * caches
   *
   * Slab caches, one per object type; created in init_caches() and
   * torn down in destroy_caches().
   */
  struct kmem_cache *ceph_inode_cachep;   /* struct ceph_inode_info */
  struct kmem_cache *ceph_cap_cachep;     /* struct ceph_cap */
  struct kmem_cache *ceph_dentry_cachep;  /* struct ceph_dentry_info */
  struct kmem_cache *ceph_file_cachep;    /* struct ceph_file_info */
  189. static void ceph_inode_init_once(void *foo)
  190. {
  191. struct ceph_inode_info *ci = foo;
  192. inode_init_once(&ci->vfs_inode);
  193. }
  194. static int __init init_caches(void)
  195. {
  196. ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
  197. sizeof(struct ceph_inode_info),
  198. __alignof__(struct ceph_inode_info),
  199. (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
  200. ceph_inode_init_once);
  201. if (ceph_inode_cachep == NULL)
  202. return -ENOMEM;
  203. ceph_cap_cachep = KMEM_CACHE(ceph_cap,
  204. SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
  205. if (ceph_cap_cachep == NULL)
  206. goto bad_cap;
  207. ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
  208. SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
  209. if (ceph_dentry_cachep == NULL)
  210. goto bad_dentry;
  211. ceph_file_cachep = KMEM_CACHE(ceph_file_info,
  212. SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
  213. if (ceph_file_cachep == NULL)
  214. goto bad_file;
  215. return 0;
  216. bad_file:
  217. kmem_cache_destroy(ceph_dentry_cachep);
  218. bad_dentry:
  219. kmem_cache_destroy(ceph_cap_cachep);
  220. bad_cap:
  221. kmem_cache_destroy(ceph_inode_cachep);
  222. return -ENOMEM;
  223. }
  224. static void destroy_caches(void)
  225. {
  226. kmem_cache_destroy(ceph_inode_cachep);
  227. kmem_cache_destroy(ceph_cap_cachep);
  228. kmem_cache_destroy(ceph_dentry_cachep);
  229. kmem_cache_destroy(ceph_file_cachep);
  230. }
  231. /*
  232. * ceph_umount_begin - initiate forced umount. Tear down down the
  233. * mount, skipping steps that may hang while waiting for server(s).
  234. */
  235. static void ceph_umount_begin(struct super_block *sb)
  236. {
  237. struct ceph_client *client = ceph_sb_to_client(sb);
  238. dout("ceph_umount_begin - starting forced umount\n");
  239. if (!client)
  240. return;
  241. client->mount_state = CEPH_MOUNT_SHUTDOWN;
  242. return;
  243. }
  244. static const struct super_operations ceph_super_ops = {
  245. .alloc_inode = ceph_alloc_inode,
  246. .destroy_inode = ceph_destroy_inode,
  247. .write_inode = ceph_write_inode,
  248. .sync_fs = ceph_sync_fs,
  249. .put_super = ceph_put_super,
  250. .show_options = ceph_show_options,
  251. .statfs = ceph_statfs,
  252. .umount_begin = ceph_umount_begin,
  253. };
  254. const char *ceph_msg_type_name(int type)
  255. {
  256. switch (type) {
  257. case CEPH_MSG_SHUTDOWN: return "shutdown";
  258. case CEPH_MSG_PING: return "ping";
  259. case CEPH_MSG_AUTH: return "auth";
  260. case CEPH_MSG_AUTH_REPLY: return "auth_reply";
  261. case CEPH_MSG_MON_MAP: return "mon_map";
  262. case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
  263. case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
  264. case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
  265. case CEPH_MSG_STATFS: return "statfs";
  266. case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
  267. case CEPH_MSG_MDS_MAP: return "mds_map";
  268. case CEPH_MSG_CLIENT_SESSION: return "client_session";
  269. case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
  270. case CEPH_MSG_CLIENT_REQUEST: return "client_request";
  271. case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
  272. case CEPH_MSG_CLIENT_REPLY: return "client_reply";
  273. case CEPH_MSG_CLIENT_CAPS: return "client_caps";
  274. case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
  275. case CEPH_MSG_CLIENT_SNAP: return "client_snap";
  276. case CEPH_MSG_CLIENT_LEASE: return "client_lease";
  277. case CEPH_MSG_OSD_MAP: return "osd_map";
  278. case CEPH_MSG_OSD_OP: return "osd_op";
  279. case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
  280. default: return "unknown";
  281. }
  282. }
  /*
   * mount options
   *
   * Token ids for the option parser.  The order is significant: the
   * parser classifies a token by comparing it against the Opt_last_int
   * and Opt_last_string sentinels.
   */
  enum {
      /* options that take an int argument */
      Opt_wsize,
      Opt_rsize,
      Opt_osdtimeout,
      Opt_osdkeepalivetimeout,
      Opt_mount_timeout,
      Opt_osd_idle_ttl,
      Opt_caps_wanted_delay_min,
      Opt_caps_wanted_delay_max,
      Opt_cap_release_safety,
      Opt_readdir_max_entries,
      Opt_readdir_max_bytes,
      Opt_congestion_kb,
      Opt_last_int,           /* sentinel: int args above */

      /* options that take a string argument */
      Opt_fsid,
      Opt_snapdirname,
      Opt_name,
      Opt_secret,
      Opt_last_string,        /* sentinel: string args above */

      /* everything else */
      Opt_ip,
      Opt_noshare,
      Opt_dirstat,
      Opt_nodirstat,
      Opt_rbytes,
      Opt_norbytes,
      Opt_nocrc,
      Opt_noasyncreaddir,
  };
  316. static match_table_t arg_tokens = {
  317. {Opt_wsize, "wsize=%d"},
  318. {Opt_rsize, "rsize=%d"},
  319. {Opt_osdtimeout, "osdtimeout=%d"},
  320. {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
  321. {Opt_mount_timeout, "mount_timeout=%d"},
  322. {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
  323. {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
  324. {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
  325. {Opt_cap_release_safety, "cap_release_safety=%d"},
  326. {Opt_readdir_max_entries, "readdir_max_entries=%d"},
  327. {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
  328. {Opt_congestion_kb, "write_congestion_kb=%d"},
  329. /* int args above */
  330. {Opt_fsid, "fsid=%s"},
  331. {Opt_snapdirname, "snapdirname=%s"},
  332. {Opt_name, "name=%s"},
  333. {Opt_secret, "secret=%s"},
  334. /* string args above */
  335. {Opt_ip, "ip=%s"},
  336. {Opt_noshare, "noshare"},
  337. {Opt_dirstat, "dirstat"},
  338. {Opt_nodirstat, "nodirstat"},
  339. {Opt_rbytes, "rbytes"},
  340. {Opt_norbytes, "norbytes"},
  341. {Opt_nocrc, "nocrc"},
  342. {Opt_noasyncreaddir, "noasyncreaddir"},
  343. {-1, NULL}
  344. };
  345. static int parse_fsid(const char *str, struct ceph_fsid *fsid)
  346. {
  347. int i = 0;
  348. char tmp[3];
  349. int err = -EINVAL;
  350. int d;
  351. dout("parse_fsid '%s'\n", str);
  352. tmp[2] = 0;
  353. while (*str && i < 16) {
  354. if (ispunct(*str)) {
  355. str++;
  356. continue;
  357. }
  358. if (!isxdigit(str[0]) || !isxdigit(str[1]))
  359. break;
  360. tmp[0] = str[0];
  361. tmp[1] = str[1];
  362. if (sscanf(tmp, "%x", &d) < 1)
  363. break;
  364. fsid->fsid[i] = d & 0xff;
  365. i++;
  366. str += 2;
  367. }
  368. if (i == 16)
  369. err = 0;
  370. dout("parse_fsid ret %d got fsid %pU", err, fsid);
  371. return err;
  372. }
  373. static struct ceph_mount_args *parse_mount_args(int flags, char *options,
  374. const char *dev_name,
  375. const char **path)
  376. {
  377. struct ceph_mount_args *args;
  378. const char *c;
  379. int err = -ENOMEM;
  380. substring_t argstr[MAX_OPT_ARGS];
  381. args = kzalloc(sizeof(*args), GFP_KERNEL);
  382. if (!args)
  383. return ERR_PTR(-ENOMEM);
  384. args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
  385. GFP_KERNEL);
  386. if (!args->mon_addr)
  387. goto out;
  388. dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
  389. /* start with defaults */
  390. args->sb_flags = flags;
  391. args->flags = CEPH_OPT_DEFAULT;
  392. args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
  393. args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
  394. args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
  395. args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
  396. args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
  397. args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
  398. args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
  399. args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
  400. args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
  401. args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
  402. args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
  403. args->congestion_kb = default_congestion_kb();
  404. /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
  405. err = -EINVAL;
  406. if (!dev_name)
  407. goto out;
  408. *path = strstr(dev_name, ":/");
  409. if (*path == NULL) {
  410. pr_err("device name is missing path (no :/ in %s)\n",
  411. dev_name);
  412. goto out;
  413. }
  414. /* get mon ip(s) */
  415. err = ceph_parse_ips(dev_name, *path, args->mon_addr,
  416. CEPH_MAX_MON, &args->num_mon);
  417. if (err < 0)
  418. goto out;
  419. /* path on server */
  420. *path += 2;
  421. dout("server path '%s'\n", *path);
  422. /* parse mount options */
  423. while ((c = strsep(&options, ",")) != NULL) {
  424. int token, intval, ret;
  425. if (!*c)
  426. continue;
  427. err = -EINVAL;
  428. token = match_token((char *)c, arg_tokens, argstr);
  429. if (token < 0) {
  430. pr_err("bad mount option at '%s'\n", c);
  431. goto out;
  432. }
  433. if (token < Opt_last_int) {
  434. ret = match_int(&argstr[0], &intval);
  435. if (ret < 0) {
  436. pr_err("bad mount option arg (not int) "
  437. "at '%s'\n", c);
  438. continue;
  439. }
  440. dout("got int token %d val %d\n", token, intval);
  441. } else if (token > Opt_last_int && token < Opt_last_string) {
  442. dout("got string token %d val %s\n", token,
  443. argstr[0].from);
  444. } else {
  445. dout("got token %d\n", token);
  446. }
  447. switch (token) {
  448. case Opt_ip:
  449. err = ceph_parse_ips(argstr[0].from,
  450. argstr[0].to,
  451. &args->my_addr,
  452. 1, NULL);
  453. if (err < 0)
  454. goto out;
  455. args->flags |= CEPH_OPT_MYIP;
  456. break;
  457. case Opt_fsid:
  458. err = parse_fsid(argstr[0].from, &args->fsid);
  459. if (err == 0)
  460. args->flags |= CEPH_OPT_FSID;
  461. break;
  462. case Opt_snapdirname:
  463. kfree(args->snapdir_name);
  464. args->snapdir_name = kstrndup(argstr[0].from,
  465. argstr[0].to-argstr[0].from,
  466. GFP_KERNEL);
  467. break;
  468. case Opt_name:
  469. args->name = kstrndup(argstr[0].from,
  470. argstr[0].to-argstr[0].from,
  471. GFP_KERNEL);
  472. break;
  473. case Opt_secret:
  474. args->secret = kstrndup(argstr[0].from,
  475. argstr[0].to-argstr[0].from,
  476. GFP_KERNEL);
  477. break;
  478. /* misc */
  479. case Opt_wsize:
  480. args->wsize = intval;
  481. break;
  482. case Opt_rsize:
  483. args->rsize = intval;
  484. break;
  485. case Opt_osdtimeout:
  486. args->osd_timeout = intval;
  487. break;
  488. case Opt_osdkeepalivetimeout:
  489. args->osd_keepalive_timeout = intval;
  490. break;
  491. case Opt_osd_idle_ttl:
  492. args->osd_idle_ttl = intval;
  493. break;
  494. case Opt_mount_timeout:
  495. args->mount_timeout = intval;
  496. break;
  497. case Opt_caps_wanted_delay_min:
  498. args->caps_wanted_delay_min = intval;
  499. break;
  500. case Opt_caps_wanted_delay_max:
  501. args->caps_wanted_delay_max = intval;
  502. break;
  503. case Opt_readdir_max_entries:
  504. args->max_readdir = intval;
  505. break;
  506. case Opt_readdir_max_bytes:
  507. args->max_readdir_bytes = intval;
  508. break;
  509. case Opt_congestion_kb:
  510. args->congestion_kb = intval;
  511. break;
  512. case Opt_noshare:
  513. args->flags |= CEPH_OPT_NOSHARE;
  514. break;
  515. case Opt_dirstat:
  516. args->flags |= CEPH_OPT_DIRSTAT;
  517. break;
  518. case Opt_nodirstat:
  519. args->flags &= ~CEPH_OPT_DIRSTAT;
  520. break;
  521. case Opt_rbytes:
  522. args->flags |= CEPH_OPT_RBYTES;
  523. break;
  524. case Opt_norbytes:
  525. args->flags &= ~CEPH_OPT_RBYTES;
  526. break;
  527. case Opt_nocrc:
  528. args->flags |= CEPH_OPT_NOCRC;
  529. break;
  530. case Opt_noasyncreaddir:
  531. args->flags |= CEPH_OPT_NOASYNCREADDIR;
  532. break;
  533. default:
  534. BUG_ON(token);
  535. }
  536. }
  537. return args;
  538. out:
  539. kfree(args->mon_addr);
  540. kfree(args);
  541. return ERR_PTR(err);
  542. }
  543. static void destroy_mount_args(struct ceph_mount_args *args)
  544. {
  545. dout("destroy_mount_args %p\n", args);
  546. kfree(args->snapdir_name);
  547. args->snapdir_name = NULL;
  548. kfree(args->name);
  549. args->name = NULL;
  550. kfree(args->secret);
  551. args->secret = NULL;
  552. kfree(args);
  553. }
  554. /*
  555. * create a fresh client instance
  556. */
  557. static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
  558. {
  559. struct ceph_client *client;
  560. int err = -ENOMEM;
  561. client = kzalloc(sizeof(*client), GFP_KERNEL);
  562. if (client == NULL)
  563. return ERR_PTR(-ENOMEM);
  564. mutex_init(&client->mount_mutex);
  565. init_waitqueue_head(&client->auth_wq);
  566. client->sb = NULL;
  567. client->mount_state = CEPH_MOUNT_MOUNTING;
  568. client->mount_args = args;
  569. client->msgr = NULL;
  570. client->auth_err = 0;
  571. atomic_long_set(&client->writeback_count, 0);
  572. err = bdi_init(&client->backing_dev_info);
  573. if (err < 0)
  574. goto fail;
  575. err = -ENOMEM;
  576. client->wb_wq = create_workqueue("ceph-writeback");
  577. if (client->wb_wq == NULL)
  578. goto fail_bdi;
  579. client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
  580. if (client->pg_inv_wq == NULL)
  581. goto fail_wb_wq;
  582. client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
  583. if (client->trunc_wq == NULL)
  584. goto fail_pg_inv_wq;
  585. /* set up mempools */
  586. err = -ENOMEM;
  587. client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
  588. client->mount_args->wsize >> PAGE_CACHE_SHIFT);
  589. if (!client->wb_pagevec_pool)
  590. goto fail_trunc_wq;
  591. /* caps */
  592. client->min_caps = args->max_readdir;
  593. /* subsystems */
  594. err = ceph_monc_init(&client->monc, client);
  595. if (err < 0)
  596. goto fail_mempool;
  597. err = ceph_osdc_init(&client->osdc, client);
  598. if (err < 0)
  599. goto fail_monc;
  600. err = ceph_mdsc_init(&client->mdsc, client);
  601. if (err < 0)
  602. goto fail_osdc;
  603. return client;
  604. fail_osdc:
  605. ceph_osdc_stop(&client->osdc);
  606. fail_monc:
  607. ceph_monc_stop(&client->monc);
  608. fail_mempool:
  609. mempool_destroy(client->wb_pagevec_pool);
  610. fail_trunc_wq:
  611. destroy_workqueue(client->trunc_wq);
  612. fail_pg_inv_wq:
  613. destroy_workqueue(client->pg_inv_wq);
  614. fail_wb_wq:
  615. destroy_workqueue(client->wb_wq);
  616. fail_bdi:
  617. bdi_destroy(&client->backing_dev_info);
  618. fail:
  619. kfree(client);
  620. return ERR_PTR(err);
  621. }
  622. static void ceph_destroy_client(struct ceph_client *client)
  623. {
  624. dout("destroy_client %p\n", client);
  625. /* unmount */
  626. ceph_mdsc_stop(&client->mdsc);
  627. ceph_osdc_stop(&client->osdc);
  628. /*
  629. * make sure mds and osd connections close out before destroying
  630. * the auth module, which is needed to free those connections'
  631. * ceph_authorizers.
  632. */
  633. ceph_msgr_flush();
  634. ceph_monc_stop(&client->monc);
  635. ceph_debugfs_client_cleanup(client);
  636. destroy_workqueue(client->wb_wq);
  637. destroy_workqueue(client->pg_inv_wq);
  638. destroy_workqueue(client->trunc_wq);
  639. bdi_destroy(&client->backing_dev_info);
  640. if (client->msgr)
  641. ceph_messenger_destroy(client->msgr);
  642. mempool_destroy(client->wb_pagevec_pool);
  643. destroy_mount_args(client->mount_args);
  644. kfree(client);
  645. dout("destroy_client %p done\n", client);
  646. }
  647. /*
  648. * Initially learn our fsid, or verify an fsid matches.
  649. */
  650. int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
  651. {
  652. if (client->have_fsid) {
  653. if (ceph_fsid_compare(&client->fsid, fsid)) {
  654. pr_err("bad fsid, had %pU got %pU",
  655. &client->fsid, fsid);
  656. return -1;
  657. }
  658. } else {
  659. pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
  660. fsid);
  661. memcpy(&client->fsid, fsid, sizeof(*fsid));
  662. ceph_debugfs_client_init(client);
  663. client->have_fsid = true;
  664. }
  665. return 0;
  666. }
  667. /*
  668. * true if we have the mon map (and have thus joined the cluster)
  669. */
  670. static int have_mon_and_osd_map(struct ceph_client *client)
  671. {
  672. return client->monc.monmap && client->monc.monmap->epoch &&
  673. client->osdc.osdmap && client->osdc.osdmap->epoch;
  674. }
  675. /*
  676. * Bootstrap mount by opening the root directory. Note the mount
  677. * @started time from caller, and time out if this takes too long.
  678. */
  679. static struct dentry *open_root_dentry(struct ceph_client *client,
  680. const char *path,
  681. unsigned long started)
  682. {
  683. struct ceph_mds_client *mdsc = &client->mdsc;
  684. struct ceph_mds_request *req = NULL;
  685. int err;
  686. struct dentry *root;
  687. /* open dir */
  688. dout("open_root_inode opening '%s'\n", path);
  689. req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
  690. if (IS_ERR(req))
  691. return ERR_CAST(req);
  692. req->r_path1 = kstrdup(path, GFP_NOFS);
  693. req->r_ino1.ino = CEPH_INO_ROOT;
  694. req->r_ino1.snap = CEPH_NOSNAP;
  695. req->r_started = started;
  696. req->r_timeout = client->mount_args->mount_timeout * HZ;
  697. req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
  698. req->r_num_caps = 2;
  699. err = ceph_mdsc_do_request(mdsc, NULL, req);
  700. if (err == 0) {
  701. dout("open_root_inode success\n");
  702. if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
  703. client->sb->s_root == NULL)
  704. root = d_alloc_root(req->r_target_inode);
  705. else
  706. root = d_obtain_alias(req->r_target_inode);
  707. req->r_target_inode = NULL;
  708. dout("open_root_inode success, root dentry is %p\n", root);
  709. } else {
  710. root = ERR_PTR(err);
  711. }
  712. ceph_mdsc_put_request(req);
  713. return root;
  714. }
  715. /*
  716. * mount: join the ceph cluster, and open root directory.
  717. */
  718. static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
  719. const char *path)
  720. {
  721. struct ceph_entity_addr *myaddr = NULL;
  722. int err;
  723. unsigned long timeout = client->mount_args->mount_timeout * HZ;
  724. unsigned long started = jiffies; /* note the start time */
  725. struct dentry *root;
  726. dout("mount start\n");
  727. mutex_lock(&client->mount_mutex);
  728. /* initialize the messenger */
  729. if (client->msgr == NULL) {
  730. if (ceph_test_opt(client, MYIP))
  731. myaddr = &client->mount_args->my_addr;
  732. client->msgr = ceph_messenger_create(myaddr);
  733. if (IS_ERR(client->msgr)) {
  734. err = PTR_ERR(client->msgr);
  735. client->msgr = NULL;
  736. goto out;
  737. }
  738. client->msgr->nocrc = ceph_test_opt(client, NOCRC);
  739. }
  740. /* open session, and wait for mon, mds, and osd maps */
  741. err = ceph_monc_open_session(&client->monc);
  742. if (err < 0)
  743. goto out;
  744. while (!have_mon_and_osd_map(client)) {
  745. err = -EIO;
  746. if (timeout && time_after_eq(jiffies, started + timeout))
  747. goto out;
  748. /* wait */
  749. dout("mount waiting for mon_map\n");
  750. err = wait_event_interruptible_timeout(client->auth_wq,
  751. have_mon_and_osd_map(client) || (client->auth_err < 0),
  752. timeout);
  753. if (err == -EINTR || err == -ERESTARTSYS)
  754. goto out;
  755. if (client->auth_err < 0) {
  756. err = client->auth_err;
  757. goto out;
  758. }
  759. }
  760. dout("mount opening root\n");
  761. root = open_root_dentry(client, "", started);
  762. if (IS_ERR(root)) {
  763. err = PTR_ERR(root);
  764. goto out;
  765. }
  766. if (client->sb->s_root)
  767. dput(root);
  768. else
  769. client->sb->s_root = root;
  770. if (path[0] == 0) {
  771. dget(root);
  772. } else {
  773. dout("mount opening base mountpoint\n");
  774. root = open_root_dentry(client, path, started);
  775. if (IS_ERR(root)) {
  776. err = PTR_ERR(root);
  777. dput(client->sb->s_root);
  778. client->sb->s_root = NULL;
  779. goto out;
  780. }
  781. }
  782. mnt->mnt_root = root;
  783. mnt->mnt_sb = client->sb;
  784. client->mount_state = CEPH_MOUNT_MOUNTED;
  785. dout("mount success\n");
  786. err = 0;
  787. out:
  788. mutex_unlock(&client->mount_mutex);
  789. return err;
  790. }
  791. static int ceph_set_super(struct super_block *s, void *data)
  792. {
  793. struct ceph_client *client = data;
  794. int ret;
  795. dout("set_super %p data %p\n", s, data);
  796. s->s_flags = client->mount_args->sb_flags;
  797. s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
  798. s->s_fs_info = client;
  799. client->sb = s;
  800. s->s_op = &ceph_super_ops;
  801. s->s_export_op = &ceph_export_ops;
  802. s->s_time_gran = 1000; /* 1000 ns == 1 us */
  803. ret = set_anon_super(s, NULL); /* what is that second arg for? */
  804. if (ret != 0)
  805. goto fail;
  806. return ret;
  807. fail:
  808. s->s_fs_info = NULL;
  809. client->sb = NULL;
  810. return ret;
  811. }
  812. /*
  813. * share superblock if same fs AND options
  814. */
  815. static int ceph_compare_super(struct super_block *sb, void *data)
  816. {
  817. struct ceph_client *new = data;
  818. struct ceph_mount_args *args = new->mount_args;
  819. struct ceph_client *other = ceph_sb_to_client(sb);
  820. int i;
  821. dout("ceph_compare_super %p\n", sb);
  822. if (args->flags & CEPH_OPT_FSID) {
  823. if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
  824. dout("fsid doesn't match\n");
  825. return 0;
  826. }
  827. } else {
  828. /* do we share (a) monitor? */
  829. for (i = 0; i < new->monc.monmap->num_mon; i++)
  830. if (ceph_monmap_contains(other->monc.monmap,
  831. &new->monc.monmap->mon_inst[i].addr))
  832. break;
  833. if (i == new->monc.monmap->num_mon) {
  834. dout("mon ip not part of monmap\n");
  835. return 0;
  836. }
  837. dout("mon ip matches existing sb %p\n", sb);
  838. }
  839. if (args->sb_flags != other->mount_args->sb_flags) {
  840. dout("flags differ\n");
  841. return 0;
  842. }
  843. return 1;
  844. }
  845. /*
  846. * construct our own bdi so we can control readahead, etc.
  847. */
  848. static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
  849. static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
  850. {
  851. int err;
  852. /* set ra_pages based on rsize mount option? */
  853. if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
  854. client->backing_dev_info.ra_pages =
  855. (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
  856. >> PAGE_SHIFT;
  857. err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
  858. atomic_long_inc_return(&bdi_seq));
  859. if (!err)
  860. sb->s_bdi = &client->backing_dev_info;
  861. return err;
  862. }
  863. static int ceph_get_sb(struct file_system_type *fs_type,
  864. int flags, const char *dev_name, void *data,
  865. struct vfsmount *mnt)
  866. {
  867. struct super_block *sb;
  868. struct ceph_client *client;
  869. int err;
  870. int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
  871. const char *path = NULL;
  872. struct ceph_mount_args *args;
  873. dout("ceph_get_sb\n");
  874. args = parse_mount_args(flags, data, dev_name, &path);
  875. if (IS_ERR(args)) {
  876. err = PTR_ERR(args);
  877. goto out_final;
  878. }
  879. /* create client (which we may/may not use) */
  880. client = ceph_create_client(args);
  881. if (IS_ERR(client)) {
  882. err = PTR_ERR(client);
  883. goto out_final;
  884. }
  885. if (client->mount_args->flags & CEPH_OPT_NOSHARE)
  886. compare_super = NULL;
  887. sb = sget(fs_type, compare_super, ceph_set_super, client);
  888. if (IS_ERR(sb)) {
  889. err = PTR_ERR(sb);
  890. goto out;
  891. }
  892. if (ceph_sb_to_client(sb) != client) {
  893. ceph_destroy_client(client);
  894. client = ceph_sb_to_client(sb);
  895. dout("get_sb got existing client %p\n", client);
  896. } else {
  897. dout("get_sb using new client %p\n", client);
  898. err = ceph_register_bdi(sb, client);
  899. if (err < 0)
  900. goto out_splat;
  901. }
  902. err = ceph_mount(client, mnt, path);
  903. if (err < 0)
  904. goto out_splat;
  905. dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
  906. mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
  907. return 0;
  908. out_splat:
  909. ceph_mdsc_close_sessions(&client->mdsc);
  910. deactivate_locked_super(sb);
  911. goto out_final;
  912. out:
  913. ceph_destroy_client(client);
  914. out_final:
  915. dout("ceph_get_sb fail %d\n", err);
  916. return err;
  917. }
  918. static void ceph_kill_sb(struct super_block *s)
  919. {
  920. struct ceph_client *client = ceph_sb_to_client(s);
  921. dout("kill_sb %p\n", s);
  922. ceph_mdsc_pre_umount(&client->mdsc);
  923. kill_anon_super(s); /* will call put_super after sb is r/o */
  924. ceph_destroy_client(client);
  925. }
  926. static struct file_system_type ceph_fs_type = {
  927. .owner = THIS_MODULE,
  928. .name = "ceph",
  929. .get_sb = ceph_get_sb,
  930. .kill_sb = ceph_kill_sb,
  931. .fs_flags = FS_RENAME_DOES_D_MOVE,
  932. };
  /*
   * Two-level stringification: STRINGIFY(tok) macro-expands tok first and
   * then turns the result into a string literal, whereas _STRINGIFY(tok)
   * stringifies its argument exactly as written.
   */
  #define _STRINGIFY(tok) #tok
  #define STRINGIFY(tok) _STRINGIFY(tok)
  935. static int __init init_ceph(void)
  936. {
  937. int ret = 0;
  938. ret = ceph_debugfs_init();
  939. if (ret < 0)
  940. goto out;
  941. ret = ceph_msgr_init();
  942. if (ret < 0)
  943. goto out_debugfs;
  944. ret = init_caches();
  945. if (ret)
  946. goto out_msgr;
  947. ret = register_filesystem(&ceph_fs_type);
  948. if (ret)
  949. goto out_icache;
  950. pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
  951. CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
  952. CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
  953. CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
  954. return 0;
  955. out_icache:
  956. destroy_caches();
  957. out_msgr:
  958. ceph_msgr_exit();
  959. out_debugfs:
  960. ceph_debugfs_cleanup();
  961. out:
  962. return ret;
  963. }
  964. static void __exit exit_ceph(void)
  965. {
  966. dout("exit_ceph\n");
  967. unregister_filesystem(&ceph_fs_type);
  968. destroy_caches();
  969. ceph_msgr_exit();
  970. ceph_debugfs_cleanup();
  971. }
  972. module_init(init_ceph);
  973. module_exit(exit_ceph);
  974. MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
  975. MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
  976. MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
  977. MODULE_DESCRIPTION("Ceph filesystem for Linux");
  978. MODULE_LICENSE("GPL");