Przeglądaj źródła

Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2

* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (138 commits)
  ocfs2: Access the right buffer_head in ocfs2_merge_rec_left.
  ocfs2: use min_t in ocfs2_quota_read()
  ocfs2: remove unneeded lvb casts
  ocfs2: Add xattr support checking in init_security
  ocfs2: alloc xattr bucket in ocfs2_xattr_set_handle
  ocfs2: calculate and reserve credits for xattr value in mknod
  ocfs2/xattr: fix credits calculation during index create
  ocfs2/xattr: Always updating ctime during xattr set.
  ocfs2/xattr: Remove extend_trans call and add its credits from the beginning
  ocfs2/dlm: Fix race during lockres mastery
  ocfs2/dlm: Fix race in adding/removing lockres' to/from the tracking list
  ocfs2/dlm: Hold off sending lockres drop ref message while lockres is migrating
  ocfs2/dlm: Clean up errors in dlm_proxy_ast_handler()
  ocfs2/dlm: Fix a race between migrate request and exit domain
  ocfs2: One more hamming code optimization.
  ocfs2: Another hamming code optimization.
  ocfs2: Don't hand-code xor in ocfs2_hamming_encode().
  ocfs2: Enable metadata checksums.
  ocfs2: Validate superblock with checksum and ecc.
  ocfs2: Checksum and ECC for directory blocks.
  ...
Linus Torvalds 16 lat temu
rodzic
commit
10cc04f5a0
72 zmienionych plików z 8313 dodań i 2108 usunięć
  1. 2 1
      Documentation/filesystems/ocfs2.txt
  2. 12 6
      fs/Kconfig
  3. 1 0
      fs/Makefile
  4. 303 133
      fs/dquot.c
  5. 7 9
      fs/ext3/super.c
  6. 7 8
      fs/ext4/super.c
  7. 9 0
      fs/jbd2/commit.c
  8. 19 0
      fs/jbd2/journal.c
  9. 47 0
      fs/jbd2/transaction.c
  10. 7 0
      fs/ocfs2/Makefile
  11. 479 0
      fs/ocfs2/acl.c
  12. 58 0
      fs/ocfs2/acl.h
  13. 332 167
      fs/ocfs2/alloc.c
  14. 27 3
      fs/ocfs2/alloc.h
  15. 21 38
      fs/ocfs2/aops.c
  16. 477 0
      fs/ocfs2/blockcheck.c
  17. 82 0
      fs/ocfs2/blockcheck.h
  18. 31 1
      fs/ocfs2/buffer_head_io.c
  19. 16 11
      fs/ocfs2/buffer_head_io.h
  20. 1 0
      fs/ocfs2/cluster/masklog.c
  21. 1 0
      fs/ocfs2/cluster/masklog.h
  22. 314 85
      fs/ocfs2/dir.c
  23. 2 0
      fs/ocfs2/dir.h
  24. 27 25
      fs/ocfs2/dlm/dlmast.c
  25. 3 0
      fs/ocfs2/dlm/dlmcommon.h
  26. 24 29
      fs/ocfs2/dlm/dlmdebug.c
  27. 1 0
      fs/ocfs2/dlm/dlmdomain.c
  28. 37 5
      fs/ocfs2/dlm/dlmmaster.c
  29. 2 1
      fs/ocfs2/dlm/dlmthread.c
  30. 154 14
      fs/ocfs2/dlmglue.c
  31. 19 0
      fs/ocfs2/dlmglue.h
  32. 77 19
      fs/ocfs2/extent_map.c
  33. 24 0
      fs/ocfs2/extent_map.h
  34. 96 113
      fs/ocfs2/file.c
  35. 3 0
      fs/ocfs2/file.h
  36. 128 47
      fs/ocfs2/inode.c
  37. 16 2
      fs/ocfs2/inode.h
  38. 300 64
      fs/ocfs2/journal.c
  39. 98 30
      fs/ocfs2/journal.h
  40. 14 12
      fs/ocfs2/localalloc.c
  41. 227 91
      fs/ocfs2/namei.c
  42. 22 24
      fs/ocfs2/ocfs2.h
  43. 204 9
      fs/ocfs2/ocfs2_fs.h
  44. 0 82
      fs/ocfs2/ocfs2_jbd_compat.h
  45. 5 0
      fs/ocfs2/ocfs2_lockid.h
  46. 119 0
      fs/ocfs2/quota.h
  47. 1025 0
      fs/ocfs2/quota_global.c
  48. 1253 0
      fs/ocfs2/quota_local.c
  49. 22 54
      fs/ocfs2/resize.c
  50. 2 2
      fs/ocfs2/slot_map.c
  51. 226 137
      fs/ocfs2/suballoc.c
  52. 16 2
      fs/ocfs2/suballoc.h
  53. 326 2
      fs/ocfs2/super.c
  54. 1 1
      fs/ocfs2/symlink.c
  55. 495 212
      fs/ocfs2/xattr.c
  56. 45 0
      fs/ocfs2/xattr.h
  57. 7 4
      fs/quota.c
  58. 645 0
      fs/quota_tree.c
  59. 25 0
      fs/quota_tree.h
  60. 21 7
      fs/quota_v1.c
  61. 84 547
      fs/quota_v2.c
  62. 0 0
      fs/quotaio_v1.h
  63. 7 26
      fs/quotaio_v2.h
  64. 5 5
      fs/reiserfs/super.c
  65. 0 4
      include/linux/Kbuild
  66. 56 0
      include/linux/dqblk_qtree.h
  67. 0 7
      include/linux/dqblk_v1.h
  68. 6 16
      include/linux/dqblk_v2.h
  69. 32 0
      include/linux/jbd2.h
  70. 8 0
      include/linux/journal-head.h
  71. 75 33
      include/linux/quota.h
  72. 76 20
      include/linux/quotaops.h

+ 2 - 1
Documentation/filesystems/ocfs2.txt

@@ -31,7 +31,6 @@ Features which OCFS2 does not support yet:
 	- quotas
 	- quotas
 	- Directory change notification (F_NOTIFY)
 	- Directory change notification (F_NOTIFY)
 	- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
 	- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
-	- POSIX ACLs
 
 
 Mount options
 Mount options
 =============
 =============
@@ -79,3 +78,5 @@ inode64			Indicates that Ocfs2 is allowed to create inodes at
 			bits of significance.
 			bits of significance.
 user_xattr	(*)	Enables Extended User Attributes.
 user_xattr	(*)	Enables Extended User Attributes.
 nouser_xattr		Disables Extended User Attributes.
 nouser_xattr		Disables Extended User Attributes.
+acl			Enables POSIX Access Control Lists support.
+noacl		(*)	Disables POSIX Access Control Lists support.

+ 12 - 6
fs/Kconfig

@@ -189,6 +189,8 @@ config OCFS2_FS
 	select CONFIGFS_FS
 	select CONFIGFS_FS
 	select JBD2
 	select JBD2
 	select CRC32
 	select CRC32
+	select QUOTA
+	select QUOTA_TREE
 	help
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
 	  OCFS2 is a general purpose extent based shared disk cluster file
 	  system with many similarities to ext3. It supports 64 bit inode
 	  system with many similarities to ext3. It supports 64 bit inode
@@ -258,15 +260,14 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 	  performance of the filesystem.
 
 
-config OCFS2_COMPAT_JBD
-	bool "Use JBD for compatibility"
+config OCFS2_FS_POSIX_ACL
+	bool "OCFS2 POSIX Access Control Lists"
 	depends on OCFS2_FS
 	depends on OCFS2_FS
+	select FS_POSIX_ACL
 	default n
 	default n
-	select JBD
 	help
 	help
-	  The ocfs2 filesystem now uses JBD2 for its journalling.  JBD2
-	  is backwards compatible with JBD.  It is safe to say N here.
-	  However, if you really want to use the original JBD, say Y here.
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
 
 
 endif # BLOCK
 endif # BLOCK
 
 
@@ -303,6 +304,10 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 	  future. Please use notification via netlink socket instead.
 
 
+# Generic support for tree structured quota files. Seleted when needed.
+config QUOTA_TREE
+	 tristate
+
 config QFMT_V1
 config QFMT_V1
 	tristate "Old quota format support"
 	tristate "Old quota format support"
 	depends on QUOTA
 	depends on QUOTA
@@ -314,6 +319,7 @@ config QFMT_V1
 config QFMT_V2
 config QFMT_V2
 	tristate "Quota format v2 support"
 	tristate "Quota format v2 support"
 	depends on QUOTA
 	depends on QUOTA
+	select QUOTA_TREE
 	help
 	help
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
 	  need this functionality say Y here.
 	  need this functionality say Y here.

+ 1 - 0
fs/Makefile

@@ -54,6 +54,7 @@ obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
 
 
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-$(CONFIG_PROC_FS)		+= proc/

+ 303 - 133
fs/dquot.c

@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
 
 
 struct dqstats dqstats;
 struct dqstats dqstats;
 
 
-static void dqput(struct dquot *dquot);
-
 static inline unsigned int
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
 hashfn(const struct super_block *sb, unsigned int id, int type)
 {
 {
@@ -415,6 +413,17 @@ out_dqlock:
 	return ret;
 	return ret;
 }
 }
 
 
+void dquot_destroy(struct dquot *dquot)
+{
+	kmem_cache_free(dquot_cachep, dquot);
+}
+EXPORT_SYMBOL(dquot_destroy);
+
+static inline void do_destroy_dquot(struct dquot *dquot)
+{
+	dquot->dq_sb->dq_op->destroy_dquot(dquot);
+}
+
 /* Invalidate all dquots on the list. Note that this function is called after
 /* Invalidate all dquots on the list. Note that this function is called after
  * quota is disabled and pointers from inodes removed so there cannot be new
  * quota is disabled and pointers from inodes removed so there cannot be new
  * quota users. There can still be some users of quotas due to inodes being
  * quota users. There can still be some users of quotas due to inodes being
@@ -463,9 +472,44 @@ restart:
 		remove_dquot_hash(dquot);
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
 		remove_inuse(dquot);
-		kmem_cache_free(dquot_cachep, dquot);
+		do_destroy_dquot(dquot);
+	}
+	spin_unlock(&dq_list_lock);
+}
+
+/* Call callback for every active dquot on given filesystem */
+int dquot_scan_active(struct super_block *sb,
+		      int (*fn)(struct dquot *dquot, unsigned long priv),
+		      unsigned long priv)
+{
+	struct dquot *dquot, *old_dquot = NULL;
+	int ret = 0;
+
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	spin_lock(&dq_list_lock);
+	list_for_each_entry(dquot, &inuse_list, dq_inuse) {
+		if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+			continue;
+		if (dquot->dq_sb != sb)
+			continue;
+		/* Now we have active dquot so we can just increase use count */
+		atomic_inc(&dquot->dq_count);
+		dqstats.lookups++;
+		spin_unlock(&dq_list_lock);
+		dqput(old_dquot);
+		old_dquot = dquot;
+		ret = fn(dquot, priv);
+		if (ret < 0)
+			goto out;
+		spin_lock(&dq_list_lock);
+		/* We are safe to continue now because our dquot could not
+		 * be moved out of the inuse list while we hold the reference */
 	}
 	}
 	spin_unlock(&dq_list_lock);
 	spin_unlock(&dq_list_lock);
+out:
+	dqput(old_dquot);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	return ret;
 }
 }
 
 
 int vfs_quota_sync(struct super_block *sb, int type)
 int vfs_quota_sync(struct super_block *sb, int type)
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 		if (type != -1 && cnt != type)
 			continue;
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 			continue;
 		spin_lock(&dq_list_lock);
 		spin_lock(&dq_list_lock);
 		dirty = &dqopt->info[cnt].dqi_dirty_list;
 		dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
 	}
 	}
 
 
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt)
-			&& info_dirty(&dqopt->info[cnt]))
+		if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
+		    && info_dirty(&dqopt->info[cnt]))
 			sb->dq_op->write_info(sb, cnt);
 			sb->dq_op->write_info(sb, cnt);
 	spin_lock(&dq_list_lock);
 	spin_lock(&dq_list_lock);
 	dqstats.syncs++;
 	dqstats.syncs++;
@@ -527,7 +571,7 @@ static void prune_dqcache(int count)
 		remove_dquot_hash(dquot);
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
 		remove_inuse(dquot);
-		kmem_cache_free(dquot_cachep, dquot);
+		do_destroy_dquot(dquot);
 		count--;
 		count--;
 		head = free_dquots.prev;
 		head = free_dquots.prev;
 	}
 	}
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = {
  * NOTE: If you change this function please check whether dqput_blocks() works right...
  * NOTE: If you change this function please check whether dqput_blocks() works right...
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
  */
-static void dqput(struct dquot *dquot)
+void dqput(struct dquot *dquot)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -584,7 +628,7 @@ we_slept:
 		/* We have more than one user... nothing to do */
 		/* We have more than one user... nothing to do */
 		atomic_dec(&dquot->dq_count);
 		atomic_dec(&dquot->dq_count);
 		/* Releasing dquot during quotaoff phase? */
 		/* Releasing dquot during quotaoff phase? */
-		if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) &&
+		if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
 		    atomic_read(&dquot->dq_count) == 1)
 		    atomic_read(&dquot->dq_count) == 1)
 			wake_up(&dquot->dq_wait_unused);
 			wake_up(&dquot->dq_wait_unused);
 		spin_unlock(&dq_list_lock);
 		spin_unlock(&dq_list_lock);
@@ -625,11 +669,17 @@ we_slept:
 	spin_unlock(&dq_list_lock);
 	spin_unlock(&dq_list_lock);
 }
 }
 
 
+struct dquot *dquot_alloc(struct super_block *sb, int type)
+{
+	return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+}
+EXPORT_SYMBOL(dquot_alloc);
+
 static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 {
 {
 	struct dquot *dquot;
 	struct dquot *dquot;
 
 
-	dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+	dquot = sb->dq_op->alloc_dquot(sb, type);
 	if(!dquot)
 	if(!dquot)
 		return NODQUOT;
 		return NODQUOT;
 
 
@@ -646,16 +696,34 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 	return dquot;
 	return dquot;
 }
 }
 
 
+/*
+ * Check whether dquot is in memory.
+ * MUST be called with either dqptr_sem or dqonoff_mutex held
+ */
+int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
+{
+	unsigned int hashent = hashfn(sb, id, type);
+	int ret = 0;
+
+        if (!sb_has_quota_active(sb, type))
+		return 0;
+	spin_lock(&dq_list_lock);
+	if (find_dquot(hashent, sb, id, type) != NODQUOT)
+		ret = 1;
+	spin_unlock(&dq_list_lock);
+	return ret;
+}
+
 /*
 /*
  * Get reference to dquot
  * Get reference to dquot
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
  */
-static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
+struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
 {
 	unsigned int hashent = hashfn(sb, id, type);
 	unsigned int hashent = hashfn(sb, id, type);
 	struct dquot *dquot, *empty = NODQUOT;
 	struct dquot *dquot, *empty = NODQUOT;
 
 
-        if (!sb_has_quota_enabled(sb, type))
+        if (!sb_has_quota_active(sb, type))
 		return NODQUOT;
 		return NODQUOT;
 we_slept:
 we_slept:
 	spin_lock(&dq_list_lock);
 	spin_lock(&dq_list_lock);
@@ -682,7 +750,7 @@ we_slept:
 		dqstats.lookups++;
 		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
 		spin_unlock(&dq_list_lock);
 		if (empty)
 		if (empty)
-			kmem_cache_free(dquot_cachep, empty);
+			do_destroy_dquot(empty);
 	}
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is already
 	/* Wait for dq_lock - after this we know that either dquot_release() is already
 	 * finished or it will be canceled due to dq_count > 1 test */
 	 * finished or it will be canceled due to dq_count > 1 test */
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
 	}
 	}
 }
 }
 
 
-static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
 {
 {
 	dquot->dq_dqb.dqb_curinodes += number;
 	dquot->dq_dqb.dqb_curinodes += number;
 }
 }
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
 	dquot->dq_dqb.dqb_curspace += number;
 	dquot->dq_dqb.dqb_curspace += number;
 }
 }
 
 
-static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 {
 {
-	if (dquot->dq_dqb.dqb_curinodes > number)
+	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+	    dquot->dq_dqb.dqb_curinodes >= number)
 		dquot->dq_dqb.dqb_curinodes -= number;
 		dquot->dq_dqb.dqb_curinodes -= number;
 	else
 	else
 		dquot->dq_dqb.dqb_curinodes = 0;
 		dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
 
 
 static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
 static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
 {
 {
-	if (dquot->dq_dqb.dqb_curspace > number)
+	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+	    dquot->dq_dqb.dqb_curspace >= number)
 		dquot->dq_dqb.dqb_curspace -= number;
 		dquot->dq_dqb.dqb_curspace -= number;
 	else
 	else
 		dquot->dq_dqb.dqb_curspace = 0;
 		dquot->dq_dqb.dqb_curspace = 0;
-	if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+	if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
 		dquot->dq_dqb.dqb_btime = (time_t) 0;
 		dquot->dq_dqb.dqb_btime = (time_t) 0;
 	clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 	clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 }
 }
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
 }
 }
 
 
 /* needs dq_data_lock */
 /* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
+static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
 {
 	*warntype = QUOTA_NL_NOWARN;
 	*warntype = QUOTA_NL_NOWARN;
-	if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 		return QUOTA_OK;
 
 
 	if (dquot->dq_dqb.dqb_ihardlimit &&
 	if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 {
 {
 	*warntype = QUOTA_NL_NOWARN;
 	*warntype = QUOTA_NL_NOWARN;
-	if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 		return QUOTA_OK;
 
 
 	if (dquot->dq_dqb.dqb_bhardlimit &&
 	if (dquot->dq_dqb.dqb_bhardlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
             !ignore_hardlimit(dquot)) {
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 		if (!prealloc)
 			*warntype = QUOTA_NL_BHARDWARN;
 			*warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 	}
 
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
 	    dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
 	    dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
             !ignore_hardlimit(dquot)) {
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 		if (!prealloc)
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 	}
 
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
 	    dquot->dq_dqb.dqb_btime == 0) {
 	    dquot->dq_dqb.dqb_btime == 0) {
 		if (!prealloc) {
 		if (!prealloc) {
 			*warntype = QUOTA_NL_BSOFTWARN;
 			*warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	return QUOTA_OK;
 	return QUOTA_OK;
 }
 }
 
 
-static int info_idq_free(struct dquot *dquot, ulong inodes)
+static int info_idq_free(struct dquot *dquot, qsize_t inodes)
 {
 {
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
+	    !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
 		return QUOTA_NL_NOWARN;
 		return QUOTA_NL_NOWARN;
 
 
 	if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
 	if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
 static int info_bdq_free(struct dquot *dquot, qsize_t space)
 static int info_bdq_free(struct dquot *dquot, qsize_t space)
 {
 {
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-	    toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+	    dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
 		return QUOTA_NL_NOWARN;
 		return QUOTA_NL_NOWARN;
 
 
-	if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
-	    dquot->dq_dqb.dqb_bsoftlimit)
+	if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
 		return QUOTA_NL_BSOFTBELOW;
 		return QUOTA_NL_BSOFTBELOW;
-	if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
-	    toqb(dquot->dq_dqb.dqb_curspace - space) <
-						dquot->dq_dqb.dqb_bhardlimit)
+	if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
+	    dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
 		return QUOTA_NL_BHARDBELOW;
 		return QUOTA_NL_BHARDBELOW;
 	return QUOTA_NL_NOWARN;
 	return QUOTA_NL_NOWARN;
 }
 }
@@ -1166,17 +1237,23 @@ out_err:
  * 	Release all quotas referenced by inode
  * 	Release all quotas referenced by inode
  *	Transaction must be started at an entry
  *	Transaction must be started at an entry
  */
  */
-int dquot_drop(struct inode *inode)
+int dquot_drop_locked(struct inode *inode)
 {
 {
 	int cnt;
 	int cnt;
 
 
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (inode->i_dquot[cnt] != NODQUOT) {
 		if (inode->i_dquot[cnt] != NODQUOT) {
 			dqput(inode->i_dquot[cnt]);
 			dqput(inode->i_dquot[cnt]);
 			inode->i_dquot[cnt] = NODQUOT;
 			inode->i_dquot[cnt] = NODQUOT;
 		}
 		}
 	}
 	}
+	return 0;
+}
+
+int dquot_drop(struct inode *inode)
+{
+	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	dquot_drop_locked(inode);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return 0;
 	return 0;
 }
 }
@@ -1264,7 +1341,7 @@ warn_put_all:
 /*
 /*
  * This operation can block, but only after everything is updated
  * This operation can block, but only after everything is updated
  */
  */
-int dquot_alloc_inode(const struct inode *inode, unsigned long number)
+int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 {
 {
 	int cnt, ret = NO_QUOTA;
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
 	char warntype[MAXQUOTAS];
@@ -1349,7 +1426,7 @@ out_sub:
 /*
 /*
  * This operation can block, but only after everything is updated
  * This operation can block, but only after everything is updated
  */
  */
-int dquot_free_inode(const struct inode *inode, unsigned long number)
+int dquot_free_inode(const struct inode *inode, qsize_t number)
 {
 {
 	unsigned int cnt;
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
 	char warntype[MAXQUOTAS];
@@ -1495,7 +1572,7 @@ warn_put_all:
 /* Wrapper for transferring ownership of an inode */
 /* Wrapper for transferring ownership of an inode */
 int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
 int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
 {
 {
-	if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+	if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
 		vfs_dq_init(inode);
 		vfs_dq_init(inode);
 		if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
 		if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
 			return 1;
 			return 1;
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = {
 	.acquire_dquot	= dquot_acquire,
 	.acquire_dquot	= dquot_acquire,
 	.release_dquot	= dquot_release,
 	.release_dquot	= dquot_release,
 	.mark_dirty	= dquot_mark_dquot_dirty,
 	.mark_dirty	= dquot_mark_dquot_dirty,
-	.write_info	= dquot_commit_info
+	.write_info	= dquot_commit_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 };
 
 
-static inline void set_enable_flags(struct quota_info *dqopt, int type)
-{
-	switch (type) {
-		case USRQUOTA:
-			dqopt->flags |= DQUOT_USR_ENABLED;
-			dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-			break;
-		case GRPQUOTA:
-			dqopt->flags |= DQUOT_GRP_ENABLED;
-			dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-			break;
-	}
-}
-
-static inline void reset_enable_flags(struct quota_info *dqopt, int type,
-				      int remount)
-{
-	switch (type) {
-		case USRQUOTA:
-			dqopt->flags &= ~DQUOT_USR_ENABLED;
-			if (remount)
-				dqopt->flags |= DQUOT_USR_SUSPENDED;
-			else
-				dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-			break;
-		case GRPQUOTA:
-			dqopt->flags &= ~DQUOT_GRP_ENABLED;
-			if (remount)
-				dqopt->flags |= DQUOT_GRP_SUSPENDED;
-			else
-				dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-			break;
-	}
-}
-
-
 /*
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
  */
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 {
 {
 	int cnt, ret = 0;
 	int cnt, ret = 0;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *toputinode[MAXQUOTAS];
 	struct inode *toputinode[MAXQUOTAS];
 
 
+	/* Cannot turn off usage accounting without turning off limits, or
+	 * suspend quotas and simultaneously turn quotas off. */
+	if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
+	    || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
+	    DQUOT_USAGE_ENABLED)))
+		return -EINVAL;
+
 	/* We need to serialize quota_off() for device */
 	/* We need to serialize quota_off() for device */
 	mutex_lock(&dqopt->dqonoff_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
 
 
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 	 * sometimes we are called when fill_super() failed and calling
 	 * sometimes we are called when fill_super() failed and calling
 	 * sync_fs() in such cases does no good.
 	 * sync_fs() in such cases does no good.
 	 */
 	 */
-	if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) {
+	if (!sb_any_quota_loaded(sb)) {
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		return 0;
 		return 0;
 	}
 	}
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 		toputinode[cnt] = NULL;
 		toputinode[cnt] = NULL;
 		if (type != -1 && cnt != type)
 		if (type != -1 && cnt != type)
 			continue;
 			continue;
-		/* If we keep inodes of quota files after remount and quotaoff
-		 * is called, drop kept inodes. */
-		if (!remount && sb_has_quota_suspended(sb, cnt)) {
-			iput(dqopt->files[cnt]);
-			dqopt->files[cnt] = NULL;
-			reset_enable_flags(dqopt, cnt, 0);
+		if (!sb_has_quota_loaded(sb, cnt))
 			continue;
 			continue;
+
+		if (flags & DQUOT_SUSPENDED) {
+			dqopt->flags |=
+				dquot_state_flag(DQUOT_SUSPENDED, cnt);
+		} else {
+			dqopt->flags &= ~dquot_state_flag(flags, cnt);
+			/* Turning off suspended quotas? */
+			if (!sb_has_quota_loaded(sb, cnt) &&
+			    sb_has_quota_suspended(sb, cnt)) {
+				dqopt->flags &=	~dquot_state_flag(
+							DQUOT_SUSPENDED, cnt);
+				iput(dqopt->files[cnt]);
+				dqopt->files[cnt] = NULL;
+				continue;
+			}
 		}
 		}
-		if (!sb_has_quota_enabled(sb, cnt))
+
+		/* We still have to keep quota loaded? */
+		if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
 			continue;
 			continue;
-		reset_enable_flags(dqopt, cnt, remount);
 
 
 		/* Note: these are blocking operations */
 		/* Note: these are blocking operations */
 		drop_dquot_ref(sb, cnt);
 		drop_dquot_ref(sb, cnt);
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 		put_quota_format(dqopt->info[cnt].dqi_format);
 		put_quota_format(dqopt->info[cnt].dqi_format);
 
 
 		toputinode[cnt] = dqopt->files[cnt];
 		toputinode[cnt] = dqopt->files[cnt];
-		if (!remount)
+		if (!sb_has_quota_loaded(sb, cnt))
 			dqopt->files[cnt] = NULL;
 			dqopt->files[cnt] = NULL;
 		dqopt->info[cnt].dqi_flags = 0;
 		dqopt->info[cnt].dqi_flags = 0;
 		dqopt->info[cnt].dqi_igrace = 0;
 		dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 		dqopt->ops[cnt] = NULL;
 		dqopt->ops[cnt] = NULL;
 	}
 	}
 	mutex_unlock(&dqopt->dqonoff_mutex);
 	mutex_unlock(&dqopt->dqonoff_mutex);
+
+	/* Skip syncing and setting flags if quota files are hidden */
+	if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+		goto put_inodes;
+
 	/* Sync the superblock so that buffers with quota data are written to
 	/* Sync the superblock so that buffers with quota data are written to
 	 * disk (and so userspace sees correct data afterwards). */
 	 * disk (and so userspace sees correct data afterwards). */
 	if (sb->s_op->sync_fs)
 	if (sb->s_op->sync_fs)
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 			mutex_lock(&dqopt->dqonoff_mutex);
 			mutex_lock(&dqopt->dqonoff_mutex);
 			/* If quota was reenabled in the meantime, we have
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
 			 * nothing to do */
-			if (!sb_has_quota_enabled(sb, cnt)) {
+			if (!sb_has_quota_loaded(sb, cnt)) {
 				mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
 				mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				  S_NOATIME | S_NOQUOTA);
 				  S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 				mark_inode_dirty(toputinode[cnt]);
 				mark_inode_dirty(toputinode[cnt]);
 			}
 			}
 			mutex_unlock(&dqopt->dqonoff_mutex);
 			mutex_unlock(&dqopt->dqonoff_mutex);
+		}
+	if (sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
+put_inodes:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (toputinode[cnt]) {
 			/* On remount RO, we keep the inode pointer so that we
 			/* On remount RO, we keep the inode pointer so that we
-			 * can reenable quota on the subsequent remount RW.
-			 * But we have better not keep inode pointer when there
-			 * is pending delete on the quota file... */
-			if (!remount)
+			 * can reenable quota on the subsequent remount RW. We
+			 * have to check 'flags' variable and not use sb_has_
+			 * function because another quotaon / quotaoff could
+			 * change global state before we got here. We refuse
+			 * to suspend quotas when there is pending delete on
+			 * the quota file... */
+			if (!(flags & DQUOT_SUSPENDED))
 				iput(toputinode[cnt]);
 				iput(toputinode[cnt]);
 			else if (!toputinode[cnt]->i_nlink)
 			else if (!toputinode[cnt]->i_nlink)
 				ret = -EBUSY;
 				ret = -EBUSY;
 		}
 		}
-	if (sb->s_bdev)
-		invalidate_bdev(sb->s_bdev);
 	return ret;
 	return ret;
 }
 }
 
 
+int vfs_quota_off(struct super_block *sb, int type, int remount)
+{
+	return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
+				 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+}
+
 /*
 /*
  *	Turn quotas on on a device
  *	Turn quotas on on a device
  */
  */
 
 
-/* Helper function when we already have the inode */
-static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
+/*
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+	unsigned int flags)
 {
 {
 	struct quota_format_type *fmt = find_quota_format(format_id);
 	struct quota_format_type *fmt = find_quota_format(format_id);
 	struct super_block *sb = inode->i_sb;
 	struct super_block *sb = inode->i_sb;
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 		error = -EINVAL;
 		error = -EINVAL;
 		goto out_fmt;
 		goto out_fmt;
 	}
 	}
+	/* Usage always has to be set... */
+	if (!(flags & DQUOT_USAGE_ENABLED)) {
+		error = -EINVAL;
+		goto out_fmt;
+	}
 
 
-	/* As we bypass the pagecache we must now flush the inode so that
-	 * we see all the changes from userspace... */
-	write_inode_now(inode, 1);
-	/* And now flush the block cache so that kernel sees the changes */
-	invalidate_bdev(sb->s_bdev);
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* As we bypass the pagecache we must now flush the inode so
+		 * that we see all the changes from userspace... */
+		write_inode_now(inode, 1);
+		/* And now flush the block cache so that kernel sees the
+		 * changes */
+		invalidate_bdev(sb->s_bdev);
+	}
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
-	if (sb_has_quota_enabled(sb, type) ||
-			sb_has_quota_suspended(sb, type)) {
+	if (sb_has_quota_loaded(sb, type)) {
 		error = -EBUSY;
 		error = -EBUSY;
 		goto out_lock;
 		goto out_lock;
 	}
 	}
-	/* We don't want quota and atime on quota files (deadlocks possible)
-	 * Also nobody should write to the file - we use special IO operations
-	 * which ignore the immutable bit. */
-	down_write(&dqopt->dqptr_sem);
-	oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
-	inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
-	up_write(&dqopt->dqptr_sem);
-	sb->dq_op->drop(inode);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* We don't want quota and atime on quota files (deadlocks
+		 * possible) Also nobody should write to the file - we use
+		 * special IO operations which ignore the immutable bit. */
+		down_write(&dqopt->dqptr_sem);
+		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+		up_write(&dqopt->dqptr_sem);
+		sb->dq_op->drop(inode);
+	}
 
 
 	error = -EIO;
 	error = -EIO;
 	dqopt->files[type] = igrab(inode);
 	dqopt->files[type] = igrab(inode);
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	}
 	}
 	mutex_unlock(&dqopt->dqio_mutex);
 	mutex_unlock(&dqopt->dqio_mutex);
 	mutex_unlock(&inode->i_mutex);
 	mutex_unlock(&inode->i_mutex);
-	set_enable_flags(dqopt, type);
+	dqopt->flags |= dquot_state_flag(flags, type);
 
 
 	add_dquot_ref(sb, type);
 	add_dquot_ref(sb, type);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 	mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *inode;
 	struct inode *inode;
 	int ret;
 	int ret;
+	unsigned int flags;
 
 
 	mutex_lock(&dqopt->dqonoff_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
 	if (!sb_has_quota_suspended(sb, type)) {
 	if (!sb_has_quota_suspended(sb, type)) {
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		return 0;
 		return 0;
 	}
 	}
-	BUG_ON(sb_has_quota_enabled(sb, type));
-
 	inode = dqopt->files[type];
 	inode = dqopt->files[type];
 	dqopt->files[type] = NULL;
 	dqopt->files[type] = NULL;
-	reset_enable_flags(dqopt, type, 0);
+	flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+						DQUOT_LIMITS_ENABLED, type);
+	dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
 
-	ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id);
+	flags = dquot_generic_flag(flags, type);
+	ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
+				   flags);
 	iput(inode);
 	iput(inode);
 
 
 	return ret;
 	return ret;
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 	if (path->mnt->mnt_sb != sb)
 	if (path->mnt->mnt_sb != sb)
 		error = -EXDEV;
 		error = -EXDEV;
 	else
 	else
-		error = vfs_quota_on_inode(path->dentry->d_inode, type,
-					   format_id);
+		error = vfs_load_quota_inode(path->dentry->d_inode, type,
+					     format_id, DQUOT_USAGE_ENABLED |
+					     DQUOT_LIMITS_ENABLED);
 	return error;
 	return error;
 }
 }
 
 
-/* Actual function called from quotactl() */
 int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 		 int remount)
 		 int remount)
 {
 {
@@ -1822,6 +1918,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 	return error;
 	return error;
 }
 }
 
 
+/*
+ * More powerful function for turning on quotas allowing setting
+ * of individual quota flags
+ */
+int vfs_quota_enable(struct inode *inode, int type, int format_id,
+		unsigned int flags)
+{
+	int ret = 0;
+	struct super_block *sb = inode->i_sb;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	/* Just unsuspend quotas? */
+	if (flags & DQUOT_SUSPENDED)
+		return vfs_quota_on_remount(sb, type);
+	if (!flags)
+		return 0;
+	/* Just updating flags needed? */
+	if (sb_has_quota_loaded(sb, type)) {
+		mutex_lock(&dqopt->dqonoff_mutex);
+		/* Now do a reliable test... */
+		if (!sb_has_quota_loaded(sb, type)) {
+			mutex_unlock(&dqopt->dqonoff_mutex);
+			goto load_quota;
+		}
+		if (flags & DQUOT_USAGE_ENABLED &&
+		    sb_has_quota_usage_enabled(sb, type)) {
+			ret = -EBUSY;
+			goto out_lock;
+		}
+		if (flags & DQUOT_LIMITS_ENABLED &&
+		    sb_has_quota_limits_enabled(sb, type)) {
+			ret = -EBUSY;
+			goto out_lock;
+		}
+		sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+out_lock:
+		mutex_unlock(&dqopt->dqonoff_mutex);
+		return ret;
+	}
+
+load_quota:
+	return vfs_load_quota_inode(inode, type, format_id, flags);
+}
+
 /*
 /*
  * This function is used when filesystem needs to initialize quotas
  * This function is used when filesystem needs to initialize quotas
  * during mount time.
  * during mount time.
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
 
 
 	error = security_quota_on(dentry);
 	error = security_quota_on(dentry);
 	if (!error)
 	if (!error)
-		error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
+		error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
+				DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 
 out:
 out:
 	dput(dentry);
 	dput(dentry);
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
 	return ret;
 	return ret;
 }
 }
 
 
+static inline qsize_t qbtos(qsize_t blocks)
+{
+	return blocks << QIF_DQBLKSIZE_BITS;
+}
+
+static inline qsize_t stoqb(qsize_t space)
+{
+	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
 /* Generic routine for getting common part of quota structure */
 /* Generic routine for getting common part of quota structure */
 static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 {
 {
 	struct mem_dqblk *dm = &dquot->dq_dqb;
 	struct mem_dqblk *dm = &dquot->dq_dqb;
 
 
 	spin_lock(&dq_data_lock);
 	spin_lock(&dq_data_lock);
-	di->dqb_bhardlimit = dm->dqb_bhardlimit;
-	di->dqb_bsoftlimit = dm->dqb_bsoftlimit;
+	di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
+	di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
 	di->dqb_curspace = dm->dqb_curspace;
 	di->dqb_curspace = dm->dqb_curspace;
 	di->dqb_ihardlimit = dm->dqb_ihardlimit;
 	di->dqb_ihardlimit = dm->dqb_ihardlimit;
 	di->dqb_isoftlimit = dm->dqb_isoftlimit;
 	di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1918,28 +2069,36 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	if (di->dqb_valid & QIF_SPACE) {
 	if (di->dqb_valid & QIF_SPACE) {
 		dm->dqb_curspace = di->dqb_curspace;
 		dm->dqb_curspace = di->dqb_curspace;
 		check_blim = 1;
 		check_blim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
 	}
 	if (di->dqb_valid & QIF_BLIMITS) {
 	if (di->dqb_valid & QIF_BLIMITS) {
-		dm->dqb_bsoftlimit = di->dqb_bsoftlimit;
-		dm->dqb_bhardlimit = di->dqb_bhardlimit;
+		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
+		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
 		check_blim = 1;
 		check_blim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
 	}
 	}
 	if (di->dqb_valid & QIF_INODES) {
 	if (di->dqb_valid & QIF_INODES) {
 		dm->dqb_curinodes = di->dqb_curinodes;
 		dm->dqb_curinodes = di->dqb_curinodes;
 		check_ilim = 1;
 		check_ilim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
 	}
 	}
 	if (di->dqb_valid & QIF_ILIMITS) {
 	if (di->dqb_valid & QIF_ILIMITS) {
 		dm->dqb_isoftlimit = di->dqb_isoftlimit;
 		dm->dqb_isoftlimit = di->dqb_isoftlimit;
 		dm->dqb_ihardlimit = di->dqb_ihardlimit;
 		dm->dqb_ihardlimit = di->dqb_ihardlimit;
 		check_ilim = 1;
 		check_ilim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
 	}
 	}
-	if (di->dqb_valid & QIF_BTIME)
+	if (di->dqb_valid & QIF_BTIME) {
 		dm->dqb_btime = di->dqb_btime;
 		dm->dqb_btime = di->dqb_btime;
-	if (di->dqb_valid & QIF_ITIME)
+		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+	}
+	if (di->dqb_valid & QIF_ITIME) {
 		dm->dqb_itime = di->dqb_itime;
 		dm->dqb_itime = di->dqb_itime;
+		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+	}
 
 
 	if (check_blim) {
 	if (check_blim) {
-		if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) {
+		if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
 			dm->dqb_btime = 0;
 			dm->dqb_btime = 0;
 			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 		}
 		}
@@ -1970,12 +2129,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 	int rc;
 	int rc;
 
 
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!(dquot = dqget(sb, id, type))) {
-		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-		return -ESRCH;
+	dquot = dqget(sb, id, type);
+	if (!dquot) {
+		rc = -ESRCH;
+		goto out;
 	}
 	}
 	rc = do_set_dqblk(dquot, di);
 	rc = do_set_dqblk(dquot, di);
 	dqput(dquot);
 	dqput(dquot);
+out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return rc;
 	return rc;
 }
 }
@@ -1986,7 +2147,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	struct mem_dqinfo *mi;
 	struct mem_dqinfo *mi;
   
   
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!sb_has_quota_enabled(sb, type)) {
+	if (!sb_has_quota_active(sb, type)) {
 		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		return -ESRCH;
 		return -ESRCH;
 	}
 	}
@@ -2005,11 +2166,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 {
 	struct mem_dqinfo *mi;
 	struct mem_dqinfo *mi;
+	int err = 0;
 
 
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!sb_has_quota_enabled(sb, type)) {
-		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-		return -ESRCH;
+	if (!sb_has_quota_active(sb, type)) {
+		err = -ESRCH;
+		goto out;
 	}
 	}
 	mi = sb_dqopt(sb)->info + type;
 	mi = sb_dqopt(sb)->info + type;
 	spin_lock(&dq_data_lock);
 	spin_lock(&dq_data_lock);
@@ -2023,8 +2185,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mark_info_dirty(sb, type);
 	mark_info_dirty(sb, type);
 	/* Force write to disk */
 	/* Force write to disk */
 	sb->dq_op->write_info(sb, type);
 	sb->dq_op->write_info(sb, type);
+out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-	return 0;
+	return err;
 }
 }
 
 
 struct quotactl_ops vfs_quotactl_ops = {
 struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2349,13 @@ EXPORT_SYMBOL(register_quota_format);
 EXPORT_SYMBOL(unregister_quota_format);
 EXPORT_SYMBOL(unregister_quota_format);
 EXPORT_SYMBOL(dqstats);
 EXPORT_SYMBOL(dqstats);
 EXPORT_SYMBOL(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
+EXPORT_SYMBOL(vfs_quota_enable);
 EXPORT_SYMBOL(vfs_quota_on);
 EXPORT_SYMBOL(vfs_quota_on);
 EXPORT_SYMBOL(vfs_quota_on_path);
 EXPORT_SYMBOL(vfs_quota_on_path);
 EXPORT_SYMBOL(vfs_quota_on_mount);
 EXPORT_SYMBOL(vfs_quota_on_mount);
+EXPORT_SYMBOL(vfs_quota_disable);
 EXPORT_SYMBOL(vfs_quota_off);
 EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_scan_active);
 EXPORT_SYMBOL(vfs_quota_sync);
 EXPORT_SYMBOL(vfs_quota_sync);
 EXPORT_SYMBOL(vfs_get_dqinfo);
 EXPORT_SYMBOL(vfs_get_dqinfo);
 EXPORT_SYMBOL(vfs_set_dqinfo);
 EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2202,7 +2368,11 @@ EXPORT_SYMBOL(dquot_release);
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_drop);
 EXPORT_SYMBOL(dquot_drop);
+EXPORT_SYMBOL(dquot_drop_locked);
 EXPORT_SYMBOL(vfs_dq_drop);
 EXPORT_SYMBOL(vfs_dq_drop);
+EXPORT_SYMBOL(dqget);
+EXPORT_SYMBOL(dqput);
+EXPORT_SYMBOL(dquot_is_cached);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_free_space);
 EXPORT_SYMBOL(dquot_free_space);

+ 7 - 9
fs/ext3/super.c

@@ -713,7 +713,9 @@ static struct dquot_operations ext3_quota_operations = {
 	.acquire_dquot	= ext3_acquire_dquot,
 	.acquire_dquot	= ext3_acquire_dquot,
 	.release_dquot	= ext3_release_dquot,
 	.release_dquot	= ext3_release_dquot,
 	.mark_dirty	= ext3_mark_dquot_dirty,
 	.mark_dirty	= ext3_mark_dquot_dirty,
-	.write_info	= ext3_write_info
+	.write_info	= ext3_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 };
 
 
 static struct quotactl_ops ext3_qctl_operations = {
 static struct quotactl_ops ext3_qctl_operations = {
@@ -1035,8 +1037,7 @@ static int parse_options (char *options, struct super_block *sb,
 		case Opt_grpjquota:
 		case Opt_grpjquota:
 			qtype = GRPQUOTA;
 			qtype = GRPQUOTA;
 set_qf_name:
 set_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
 				printk(KERN_ERR
 					"EXT3-fs: Cannot change journaled "
 					"EXT3-fs: Cannot change journaled "
@@ -1075,8 +1076,7 @@ set_qf_name:
 		case Opt_offgrpjquota:
 		case Opt_offgrpjquota:
 			qtype = GRPQUOTA;
 			qtype = GRPQUOTA;
 clear_qf_name:
 clear_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
 			    sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR "EXT3-fs: Cannot change "
 				printk(KERN_ERR "EXT3-fs: Cannot change "
 					"journaled quota options when "
 					"journaled quota options when "
@@ -1095,8 +1095,7 @@ clear_qf_name:
 		case Opt_jqfmt_vfsv0:
 		case Opt_jqfmt_vfsv0:
 			qfmt = QFMT_VFS_V0;
 			qfmt = QFMT_VFS_V0;
 set_qf_format:
 set_qf_format:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
 			    sbi->s_jquota_fmt != qfmt) {
 				printk(KERN_ERR "EXT3-fs: Cannot change "
 				printk(KERN_ERR "EXT3-fs: Cannot change "
 					"journaled quota options when "
 					"journaled quota options when "
@@ -1115,8 +1114,7 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 			break;
 		case Opt_noquota:
 		case Opt_noquota:
-			if (sb_any_quota_enabled(sb) ||
-			    sb_any_quota_suspended(sb)) {
+			if (sb_any_quota_loaded(sb)) {
 				printk(KERN_ERR "EXT3-fs: Cannot change quota "
 				printk(KERN_ERR "EXT3-fs: Cannot change quota "
 					"options when quota turned on.\n");
 					"options when quota turned on.\n");
 				return 0;
 				return 0;

+ 7 - 8
fs/ext4/super.c

@@ -803,7 +803,9 @@ static struct dquot_operations ext4_quota_operations = {
 	.acquire_dquot	= ext4_acquire_dquot,
 	.acquire_dquot	= ext4_acquire_dquot,
 	.release_dquot	= ext4_release_dquot,
 	.release_dquot	= ext4_release_dquot,
 	.mark_dirty	= ext4_mark_dquot_dirty,
 	.mark_dirty	= ext4_mark_dquot_dirty,
-	.write_info	= ext4_write_info
+	.write_info	= ext4_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 };
 
 
 static struct quotactl_ops ext4_qctl_operations = {
 static struct quotactl_ops ext4_qctl_operations = {
@@ -1142,8 +1144,7 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_grpjquota:
 		case Opt_grpjquota:
 			qtype = GRPQUOTA;
 			qtype = GRPQUOTA;
 set_qf_name:
 set_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
 				printk(KERN_ERR
 				       "EXT4-fs: Cannot change journaled "
 				       "EXT4-fs: Cannot change journaled "
@@ -1182,8 +1183,7 @@ set_qf_name:
 		case Opt_offgrpjquota:
 		case Opt_offgrpjquota:
 			qtype = GRPQUOTA;
 			qtype = GRPQUOTA;
 clear_qf_name:
 clear_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
 			    sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR "EXT4-fs: Cannot change "
 				printk(KERN_ERR "EXT4-fs: Cannot change "
 					"journaled quota options when "
 					"journaled quota options when "
@@ -1202,8 +1202,7 @@ clear_qf_name:
 		case Opt_jqfmt_vfsv0:
 		case Opt_jqfmt_vfsv0:
 			qfmt = QFMT_VFS_V0;
 			qfmt = QFMT_VFS_V0;
 set_qf_format:
 set_qf_format:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
 			    sbi->s_jquota_fmt != qfmt) {
 				printk(KERN_ERR "EXT4-fs: Cannot change "
 				printk(KERN_ERR "EXT4-fs: Cannot change "
 					"journaled quota options when "
 					"journaled quota options when "
@@ -1222,7 +1221,7 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 			break;
 		case Opt_noquota:
 		case Opt_noquota:
-			if (sb_any_quota_enabled(sb)) {
+			if (sb_any_quota_loaded(sb)) {
 				printk(KERN_ERR "EXT4-fs: Cannot change quota "
 				printk(KERN_ERR "EXT4-fs: Cannot change quota "
 					"options when quota turned on.\n");
 					"options when quota turned on.\n");
 				return 0;
 				return 0;

+ 9 - 0
fs/jbd2/commit.c

@@ -509,6 +509,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (is_journal_aborted(journal)) {
 		if (is_journal_aborted(journal)) {
 			clear_buffer_jbddirty(jh2bh(jh));
 			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
+			jbd2_buffer_abort_trigger(jh,
+						  jh->b_frozen_data ?
+						  jh->b_frozen_triggers :
+						  jh->b_triggers);
 			jbd2_journal_refile_buffer(journal, jh);
 			jbd2_journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
 			/* If that was the last one, we need to clean up
 			 * any descriptor buffers which may have been
 			 * any descriptor buffers which may have been
@@ -844,6 +848,9 @@ restart_loop:
 		 * data.
 		 * data.
 		 *
 		 *
 		 * Otherwise, we can just throw away the frozen data now.
 		 * Otherwise, we can just throw away the frozen data now.
+		 *
+		 * We also know that the frozen data has already fired
+		 * its triggers if they exist, so we can clear that too.
 		 */
 		 */
 		if (jh->b_committed_data) {
 		if (jh->b_committed_data) {
 			jbd2_free(jh->b_committed_data, bh->b_size);
 			jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +858,12 @@ restart_loop:
 			if (jh->b_frozen_data) {
 			if (jh->b_frozen_data) {
 				jh->b_committed_data = jh->b_frozen_data;
 				jh->b_committed_data = jh->b_frozen_data;
 				jh->b_frozen_data = NULL;
 				jh->b_frozen_data = NULL;
+				jh->b_frozen_triggers = NULL;
 			}
 			}
 		} else if (jh->b_frozen_data) {
 		} else if (jh->b_frozen_data) {
 			jbd2_free(jh->b_frozen_data, bh->b_size);
 			jbd2_free(jh->b_frozen_data, bh->b_size);
 			jh->b_frozen_data = NULL;
 			jh->b_frozen_data = NULL;
+			jh->b_frozen_triggers = NULL;
 		}
 		}
 
 
 		spin_lock(&journal->j_list_lock);
 		spin_lock(&journal->j_list_lock);

+ 19 - 0
fs/jbd2/journal.c

@@ -50,6 +50,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_set_triggers);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
 EXPORT_SYMBOL(jbd2_journal_forget);
@@ -290,6 +291,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	struct page *new_page;
 	struct page *new_page;
 	unsigned int new_offset;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
 	struct buffer_head *bh_in = jh2bh(jh_in);
+	struct jbd2_buffer_trigger_type *triggers;
 
 
 	/*
 	/*
 	 * The buffer really shouldn't be locked: only the current committing
 	 * The buffer really shouldn't be locked: only the current committing
@@ -314,12 +316,22 @@ repeat:
 		done_copy_out = 1;
 		done_copy_out = 1;
 		new_page = virt_to_page(jh_in->b_frozen_data);
 		new_page = virt_to_page(jh_in->b_frozen_data);
 		new_offset = offset_in_page(jh_in->b_frozen_data);
 		new_offset = offset_in_page(jh_in->b_frozen_data);
+		triggers = jh_in->b_frozen_triggers;
 	} else {
 	} else {
 		new_page = jh2bh(jh_in)->b_page;
 		new_page = jh2bh(jh_in)->b_page;
 		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
 		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+		triggers = jh_in->b_triggers;
 	}
 	}
 
 
 	mapped_data = kmap_atomic(new_page, KM_USER0);
 	mapped_data = kmap_atomic(new_page, KM_USER0);
+	/*
+	 * Fire any commit trigger.  Do this before checking for escaping,
+	 * as the trigger may modify the magic offset.  If a copy-out
+	 * happens afterwards, it will have the correct data in the buffer.
+	 */
+	jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+				   triggers);
+
 	/*
 	/*
 	 * Check for escaping
 	 * Check for escaping
 	 */
 	 */
@@ -352,6 +364,13 @@ repeat:
 		new_page = virt_to_page(tmp);
 		new_page = virt_to_page(tmp);
 		new_offset = offset_in_page(tmp);
 		new_offset = offset_in_page(tmp);
 		done_copy_out = 1;
 		done_copy_out = 1;
+
+		/*
+		 * This isn't strictly necessary, as we're using frozen
+		 * data for the escaping, but it keeps consistency with
+		 * b_frozen_data usage.
+		 */
+		jh_in->b_frozen_triggers = jh_in->b_triggers;
 	}
 	}
 
 
 	/*
 	/*

+ 47 - 0
fs/jbd2/transaction.c

@@ -741,6 +741,12 @@ done:
 		source = kmap_atomic(page, KM_USER0);
 		source = kmap_atomic(page, KM_USER0);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		kunmap_atomic(source, KM_USER0);
 		kunmap_atomic(source, KM_USER0);
+
+		/*
+		 * Now that the frozen data is saved off, we need to store
+		 * any matching triggers.
+		 */
+		jh->b_frozen_triggers = jh->b_triggers;
 	}
 	}
 	jbd_unlock_bh_state(bh);
 	jbd_unlock_bh_state(bh);
 
 
@@ -943,6 +949,47 @@ out:
 	return err;
 	return err;
 }
 }
 
 
+/**
+ * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * @bh: buffer to trigger on
+ * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
+ *
+ * Set any triggers on this journal_head.  This is always safe, because
+ * triggers for a committing buffer will be saved off, and triggers for
+ * a running transaction will match the buffer in that transaction.
+ *
+ * Call with NULL to clear the triggers.
+ */
+void jbd2_journal_set_triggers(struct buffer_head *bh,
+			       struct jbd2_buffer_trigger_type *type)
+{
+	struct journal_head *jh = bh2jh(bh);
+
+	jh->b_triggers = type;
+}
+
+void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+				struct jbd2_buffer_trigger_type *triggers)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (!triggers || !triggers->t_commit)
+		return;
+
+	triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+}
+
+void jbd2_buffer_abort_trigger(struct journal_head *jh,
+			       struct jbd2_buffer_trigger_type *triggers)
+{
+	if (!triggers || !triggers->t_abort)
+		return;
+
+	triggers->t_abort(triggers, jh2bh(jh));
+}
+
+
+
 /**
 /**
  * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
  * @handle: transaction to add buffer to.

+ 7 - 0
fs/ocfs2/Makefile

@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
 ocfs2-objs := \
 	alloc.o 		\
 	alloc.o 		\
 	aops.o 			\
 	aops.o 			\
+	blockcheck.o		\
 	buffer_head_io.o	\
 	buffer_head_io.o	\
 	dcache.o 		\
 	dcache.o 		\
 	dir.o 			\
 	dir.o 			\
@@ -35,8 +36,14 @@ ocfs2-objs := \
 	sysfile.o 		\
 	sysfile.o 		\
 	uptodate.o		\
 	uptodate.o		\
 	ver.o			\
 	ver.o			\
+	quota_local.o		\
+	quota_global.o		\
 	xattr.o
 	xattr.o
 
 
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
+
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
 ocfs2_stack_user-objs := stack_user.o

+ 479 - 0
fs/ocfs2/acl.c

@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct posix_acl_entry))
+		return ERR_PTR(-EINVAL);
+
+	count = size / sizeof(struct posix_acl_entry);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		struct ocfs2_acl_entry *entry =
+			(struct ocfs2_acl_entry *)value;
+
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+		value += sizeof(struct posix_acl_entry);
+
+	}
+	return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+	struct ocfs2_acl_entry *entry = NULL;
+	char *ocfs2_acl;
+	size_t n;
+
+	*size = acl->a_count * sizeof(struct posix_acl_entry);
+
+	ocfs2_acl = kmalloc(*size, GFP_NOFS);
+	if (!ocfs2_acl)
+		return ERR_PTR(-ENOMEM);
+
+	entry = (struct ocfs2_acl_entry *)ocfs2_acl;
+	for (n = 0; n < acl->a_count; n++, entry++) {
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		entry->e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+	}
+	return ocfs2_acl;
+}
+
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+					      int type,
+					      struct buffer_head *di_bh)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+						"", value, retval);
+	}
+
+	if (retval > 0)
+		acl = ocfs2_acl_from_xattr(value, retval);
+	else if (retval == -ENODATA || retval == 0)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+
+	kfree(value);
+
+	return acl;
+}
+
+
+/*
+ * Get posix acl.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		acl = ERR_PTR(ret);
+		return acl;
+	}
+
+	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+			 struct inode *inode,
+			 struct buffer_head *di_bh,
+			 int type,
+			 struct posix_acl *acl,
+			 struct ocfs2_alloc_context *meta_ac,
+			 struct ocfs2_alloc_context *data_ac)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			ret = posix_acl_equiv_mode(acl, &mode);
+			if (ret < 0)
+				return ret;
+			else {
+				inode->i_mode = mode;
+				if (ret == 0)
+					acl = NULL;
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = ocfs2_acl_to_xattr(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	if (handle)
+		ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+					     "", value, size, 0,
+					     meta_ac, data_ac);
+	else
+		ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+	kfree(value);
+
+	return ret;
+}
+
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int ret = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return ret;
+	}
+
+	return -EAGAIN;
+}
+
+int ocfs2_acl_chmod(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl, *clone;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	ret = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!ret)
+		ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+				    clone, NULL, NULL);
+	posix_acl_release(clone);
+	return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+		   struct inode *inode,
+		   struct inode *dir,
+		   struct buffer_head *di_bh,
+		   struct buffer_head *dir_bh,
+		   struct ocfs2_alloc_context *meta_ac,
+		   struct ocfs2_alloc_context *data_ac)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl = NULL;
+	int ret = 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+						   dir_bh);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			ret = ocfs2_set_acl(handle, inode, di_bh,
+					    ACL_TYPE_DEFAULT, acl,
+					    meta_ac, data_ac);
+			if (ret)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		ret = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		mode = inode->i_mode;
+		ret = posix_acl_create_masq(clone, &mode);
+		if (ret >= 0) {
+			inode->i_mode = mode;
+			if (ret > 0) {
+				ret = ocfs2_set_acl(handle, inode,
+						    di_bh, ACL_TYPE_ACCESS,
+						    clone, meta_ac, data_ac);
+			}
+		}
+		posix_acl_release(clone);
+	}
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+					  char *list,
+					  size_t list_len,
+					  const char *name,
+					  size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+	return size;
+}
+
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+					   char *list,
+					   size_t list_len,
+					   const char *name,
+					   size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+	return size;
+}
+
+static int ocfs2_xattr_get_acl(struct inode *inode,
+			       int type,
+			       void *buffer,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ocfs2_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	ret = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+				      const char *name,
+				      void *buffer,
+				      size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+				       const char *name,
+				       void *buffer,
+				       size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int ocfs2_xattr_set_acl(struct inode *inode,
+			       int type,
+			       const void *value,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret = 0;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			ret = posix_acl_valid(acl);
+			if (ret)
+				goto cleanup;
+		}
+	} else
+		acl = NULL;
+
+	ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+				      const char *name,
+				      const void *value,
+				      size_t size,
+				      int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+				       const char *name,
+				       const void *value,
+				       size_t size,
+				       int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= ocfs2_xattr_list_acl_access,
+	.get	= ocfs2_xattr_get_acl_access,
+	.set	= ocfs2_xattr_set_acl_access,
+};
+
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= ocfs2_xattr_list_acl_default,
+	.get	= ocfs2_xattr_get_acl_default,
+	.set	= ocfs2_xattr_set_acl_default,
+};

+ 58 - 0
fs/ocfs2/acl.h

@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+struct ocfs2_acl_entry {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+};
+
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+			  struct buffer_head *, struct buffer_head *,
+			  struct ocfs2_alloc_context *,
+			  struct ocfs2_alloc_context *);
+
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#define ocfs2_check_acl NULL
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+static inline int ocfs2_init_acl(handle_t *handle,
+				 struct inode *inode,
+				 struct inode *dir,
+				 struct buffer_head *di_bh,
+				 struct buffer_head *dir_bh,
+				 struct ocfs2_alloc_context *meta_ac,
+				 struct ocfs2_alloc_context *data_ac)
+{
+	return 0;
+}
+
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#endif /* OCFS2_ACL_H */

Plik diff jest za duży
+ 332 - 167
fs/ocfs2/alloc.c


+ 27 - 3
fs/ocfs2/alloc.h

@@ -45,7 +45,9 @@
  *
  *
  * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
  * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
  * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
  * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
+ * functions.  With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
  * ocfs2_extent_tree_operations abstract the normal operations we do for
  * ocfs2_extent_tree_operations abstract the normal operations we do for
  * the root of extent b-tree.
  * the root of extent b-tree.
  */
  */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
 	struct ocfs2_extent_tree_operations	*et_ops;
 	struct ocfs2_extent_tree_operations	*et_ops;
 	struct buffer_head			*et_root_bh;
 	struct buffer_head			*et_root_bh;
 	struct ocfs2_extent_list		*et_root_el;
 	struct ocfs2_extent_list		*et_root_el;
+	ocfs2_journal_access_func		et_root_journal_access;
 	void					*et_object;
 	void					*et_object;
 	unsigned int				et_max_leaf_clusters;
 	unsigned int				et_max_leaf_clusters;
 };
 };
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 				       struct inode *inode,
 				       struct inode *inode,
 				       struct buffer_head *bh);
 				       struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct inode *inode,
 					struct inode *inode,
-					struct buffer_head *bh,
-					struct ocfs2_xattr_value_root *xv);
+					struct ocfs2_xattr_value_buf *vb);
+
+/*
+ * Read an extent block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+			    struct buffer_head **bh);
 
 
 struct ocfs2_alloc_context;
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
 			u32 cpos, u32 len, handle_t *handle,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc);
+
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct inode *inode,
 			   struct ocfs2_extent_tree *et);
 			   struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
  */
  */
 struct ocfs2_cached_dealloc_ctxt {
 struct ocfs2_cached_dealloc_ctxt {
 	struct ocfs2_per_slot_free_list		*c_first_suballocator;
 	struct ocfs2_per_slot_free_list		*c_first_suballocator;
+	struct ocfs2_cached_block_free 		*c_global_allocator;
 };
 };
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 {
 {
 	c->c_first_suballocator = NULL;
 	c->c_first_suballocator = NULL;
+	c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+				u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+	return c->c_global_allocator != NULL;
 }
 }
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		       struct ocfs2_cached_dealloc_ctxt *ctxt);
 		       struct ocfs2_cached_dealloc_ctxt *ctxt);

+ 21 - 38
fs/ocfs2/aops.c

@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
 #include <linux/mpage.h>
+#include <linux/quotaops.h>
 
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
 #include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 		goto bail;
 	}
 	}
 
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+	status = ocfs2_read_inode_block(inode, &bh);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
 	}
 	}
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	fe = (struct ocfs2_dinode *) bh->b_data;
 
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature);
-		goto bail;
-	}
-
 	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 						    le32_to_cpu(fe->i_clusters))) {
 						    le32_to_cpu(fe->i_clusters))) {
 		mlog(ML_ERROR, "block offset is outside the allocated size: "
 		mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 
 
 	if (ocfs2_should_order_data(inode)) {
 	if (ocfs2_should_order_data(inode)) {
 		ret = ocfs2_jbd2_file_inode(handle, inode);
 		ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-		ret = walk_page_buffers(handle,
-					page_buffers(page),
-					from, to, NULL,
-					ocfs2_journal_dirty_data);
-#endif
 		if (ret < 0)
 		if (ret < 0)
 			mlog_errno(ret);
 			mlog_errno(ret);
 	}
 	}
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
 		tmppage = wc->w_pages[i];
 		tmppage = wc->w_pages[i];
 
 
 		if (page_has_buffers(tmppage)) {
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode)) {
+			if (ocfs2_should_order_data(inode))
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-				walk_page_buffers(wc->w_handle,
-						  page_buffers(tmppage),
-						  from, to, NULL,
-						  ocfs2_journal_dirty_data);
-#endif
-			}
 
 
 			block_commit_write(tmppage, from, to);
 			block_commit_write(tmppage, from, to);
 		}
 		}
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 		goto out;
 		goto out;
 	}
 	}
 
 
-	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 	if (ret) {
 		ocfs2_commit_trans(osb, handle);
 		ocfs2_commit_trans(osb, handle);
 
 
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 
 	wc->w_handle = handle;
 	wc->w_handle = handle;
 
 
+	if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+		ret = -EDQUOT;
+		goto out_commit;
+	}
 	/*
 	/*
 	 * We don't want this to fail in ocfs2_write_end(), so do it
 	 * We don't want this to fail in ocfs2_write_end(), so do it
 	 * here.
 	 * here.
 	 */
 	 */
-	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 	}
 
 
 	/*
 	/*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 					 mmap_page);
 					 mmap_page);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 	}
 
 
 	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
 	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
 					  len);
 					  len);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 	}
 
 
 	if (data_ac)
 	if (data_ac)
@@ -1790,6 +1776,10 @@ success:
 	*pagep = wc->w_target_page;
 	*pagep = wc->w_target_page;
 	*fsdata = wc;
 	*fsdata = wc;
 	return 0;
 	return 0;
+out_quota:
+	if (clusters_to_alloc)
+		vfs_dq_free_space(inode,
+			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 	ocfs2_commit_trans(osb, handle);
 
 
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		}
 		}
 
 
 		if (page_has_buffers(tmppage)) {
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode)) {
+			if (ocfs2_should_order_data(inode))
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-				walk_page_buffers(wc->w_handle,
-						  page_buffers(tmppage),
-						  from, to, NULL,
-						  ocfs2_journal_dirty_data);
-#endif
-			}
 			block_commit_write(tmppage, from, to);
 			block_commit_write(tmppage, from, to);
 		}
 		}
 	}
 	}

+ 477 - 0
fs/ocfs2/blockcheck.c

@@ -0,0 +1,477 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "blockcheck.h"
+
+
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+
+
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer.  Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offest by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based.  This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example.  Take bit 1 of the data buffer.  1 is a power of two (2^0),
+ * so it's a parity bit.  2 is a power of two (2^1), so it's a parity bit.
+ * 3 is not a power of two.  So bit 1 of the data buffer ends up as bit 3
+ * in the code buffer.
+ *
+ * The caller can pass in *p if it wants to keep track of the most recent
+ * number of parity bits added.  This allows the function to start the
+ * calculation at the last place.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+	unsigned int b, p = 0;
+
+	/*
+	 * Data bits are 0-based, but we're talking code bits, which
+	 * are 1-based.
+	 */
+	b = i + 1;
+
+	/* Use the cache if it is there */
+	if (p_cache)
+		p = *p_cache;
+        b += p;
+
+	/*
+	 * For every power of two below our bit number, bump our bit.
+	 *
+	 * We compare with (b + 1) because we have to compare with what b
+	 * would be _if_ it were bumped up by the parity bit.  Capice?
+	 *
+	 * p is set above.
+	 */
+	for (; (1 << p) < (b + 1); p++)
+		b++;
+
+	if (p_cache)
+		*p_cache = p;
+
+	return b;
+}
+
+/*
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+	unsigned int i, b, p = 0;
+
+	BUG_ON(!d);
+
+	/*
+	 * b is the hamming code bit number.  Hamming code specifies a
+	 * 1-based array, but C uses 0-based.  So 'i' is for C, and 'b' is
+	 * for the algorithm.
+	 *
+	 * The i++ in the for loop is so that the start offset passed
+	 * to ocfs2_find_next_bit_set() is one greater than the previously
+	 * found bit.
+	 */
+	for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+	{
+		/*
+		 * i is the offset in this hunk, nr + i is the total bit
+		 * offset.
+		 */
+		b = calc_code_bit(nr + i, &p);
+
+		/*
+		 * Data bits in the resultant code are checked by
+		 * parity bits that are part of the bit number
+		 * representation.  Huh?
+		 *
+		 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+		 * In other words, the parity bit at position 2^k
+		 * checks bits in positions having bit k set in
+		 * their binary representation.  Conversely, for
+		 * instance, bit 13, i.e. 1101(2), is checked by
+		 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+		 * </wikipedia>
+		 *
+		 * Note that 'k' is the _code_ bit number.  'b' in
+		 * our loop.
+		 */
+		parity ^= b;
+	}
+
+	/* While the data buffer was treated as little endian, the
+	 * return value is in host endian. */
+	return parity;
+}
+
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+	return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+		       unsigned int fix)
+{
+	unsigned int i, b;
+
+	BUG_ON(!d);
+
+	/*
+	 * If the bit to fix has an hweight of 1, it's a parity bit.  One
+	 * busted parity bit is its own error.  Nothing to do here.
+	 */
+	if (hweight32(fix) == 1)
+		return;
+
+	/*
+	 * nr + d is the bit right past the data hunk we're looking at.
+	 * If fix after that, nothing to do
+	 */
+	if (fix >= calc_code_bit(nr + d, NULL))
+		return;
+
+	/*
+	 * nr is the offset in the data hunk we're starting at.  Let's
+	 * start b at the offset in the code buffer.  See hamming_encode()
+	 * for a more detailed description of 'b'.
+	 */
+	b = calc_code_bit(nr, NULL);
+	/* If the fix is before this hunk, nothing to do */
+	if (fix < b)
+		return;
+
+	for (i = 0; i < d; i++, b++)
+	{
+		/* Skip past parity bits */
+		while (hweight32(b) == 1)
+			b++;
+
+		/*
+		 * i is the offset in this data hunk.
+		 * nr + i is the offset in the total data buffer.
+		 * b is the offset in the total code buffer.
+		 *
+		 * Thus, when b == fix, bit i in the current hunk needs
+		 * fixing.
+		 */
+		if (b == fix)
+		{
+			if (ocfs2_test_bit(i, data))
+				ocfs2_clear_bit(i, data);
+			else
+				ocfs2_set_bit(i, data);
+			break;
+		}
+	}
+}
+
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+			     unsigned int fix)
+{
+	ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc)
+{
+	u32 crc;
+	u32 ecc;
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	crc = crc32_le(~0, data, blocksize);
+	ecc = ocfs2_hamming_encode_block(data, blocksize);
+
+	/*
+	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+	 * larger than 16 bits.
+	 */
+	BUG_ON(ecc > USHORT_MAX);
+
+	bc->bc_crc32e = cpu_to_le32(crc);
+	bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information.  Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+	struct ocfs2_block_check check;
+	u32 crc, ecc;
+
+	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	/* Fast path - if the crc32 validates, we're good to go */
+	crc = crc32_le(~0, data, blocksize);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR,
+	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	/* Ok, try ECC fixups */
+	ecc = ocfs2_hamming_encode_block(data, blocksize);
+	ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+
+	/* And check the crc32 again */
+	crc = crc32_le(~0, data, blocksize);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	rc = -EIO;
+
+out:
+	bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+	bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+	return rc;
+}
+
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc)
+{
+	int i;
+	u32 crc, ecc;
+
+	BUG_ON(nr < 0);
+
+	if (!nr)
+		return;
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+		/*
+		 * The number of bits in a buffer is obviously b_size*8.
+		 * The offset of this buffer is b_size*i, so the bit offset
+		 * of this buffer is b_size*8*i.
+		 */
+		ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+						bhs[i]->b_size * 8,
+						bhs[i]->b_size * 8 * i);
+	}
+
+	/*
+	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+	 * larger than 16 bits.
+	 */
+	BUG_ON(ecc > USHORT_MAX);
+
+	bc->bc_crc32e = cpu_to_le32(crc);
+	bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads.  Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes.  If bc is not a pointer
+ * inside data, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc)
+{
+	int i, rc = 0;
+	struct ocfs2_block_check check;
+	u32 crc, ecc, fix;
+
+	BUG_ON(nr < 0);
+
+	if (!nr)
+		return 0;
+
+	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	/* Fast path - if the crc32 validates, we're good to go */
+	for (i = 0, crc = ~0; i < nr; i++)
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR,
+	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	/* Ok, try ECC fixups */
+	for (i = 0, ecc = 0; i < nr; i++) {
+		/*
+		 * The number of bits in a buffer is obviously b_size*8.
+		 * The offset of this buffer is b_size*i, so the bit offset
+		 * of this buffer is b_size*8*i.
+		 */
+		ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+						bhs[i]->b_size * 8,
+						bhs[i]->b_size * 8 * i);
+	}
+	fix = ecc ^ check.bc_ecc;
+	for (i = 0; i < nr; i++) {
+		/*
+		 * Try the fix against each buffer.  It will only affect
+		 * one of them.
+		 */
+		ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+				  bhs[i]->b_size * 8 * i, fix);
+	}
+
+	/* And check the crc32 again */
+	for (i = 0, crc = ~0; i < nr; i++)
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	rc = -EIO;
+
+out:
+	bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+	bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+	return rc;
+}
+
+/*
+ * These are the main API.  They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc)
+{
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+
+	return rc;
+}
+
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc)
+{
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+
+	return rc;
+}
+

+ 82 - 0
fs/ocfs2/blockcheck.h

@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+
+
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc);
+
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc);
+
+/*
+ * Hamming code functions
+ */
+
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+			 unsigned int nr);
+/*
+ * Fix a buffer with a bit error.  The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+		       unsigned int fix);
+
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+				    unsigned int fix);
+#endif

+ 31 - 1
fs/ocfs2/buffer_head_io.c

@@ -39,6 +39,18 @@
 
 
 #include "buffer_head_io.h"
 #include "buffer_head_io.h"
 
 
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+	BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		      struct inode *inode)
 		      struct inode *inode)
 {
 {
@@ -166,7 +178,9 @@ bail:
 }
 }
 
 
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
-		      struct buffer_head *bhs[], int flags)
+		      struct buffer_head *bhs[], int flags,
+		      int (*validate)(struct super_block *sb,
+				      struct buffer_head *bh))
 {
 {
 	int status = 0;
 	int status = 0;
 	int i, ignore_cache = 0;
 	int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 
 
 			clear_buffer_uptodate(bh);
 			clear_buffer_uptodate(bh);
 			get_bh(bh); /* for end_buffer_read_sync() */
 			get_bh(bh); /* for end_buffer_read_sync() */
+			if (validate)
+				set_buffer_needs_validate(bh);
 			bh->b_end_io = end_buffer_read_sync;
 			bh->b_end_io = end_buffer_read_sync;
 			submit_bh(READ, bh);
 			submit_bh(READ, bh);
 			continue;
 			continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 				bhs[i] = NULL;
 				bhs[i] = NULL;
 				continue;
 				continue;
 			}
 			}
+
+			if (buffer_needs_validate(bh)) {
+				/* We never set NeedsValidate if the
+				 * buffer was held by the journal, so
+				 * that better not have changed */
+				BUG_ON(buffer_jbd(bh));
+				clear_buffer_needs_validate(bh);
+				status = validate(inode->i_sb, bh);
+				if (status) {
+					put_bh(bh);
+					bhs[i] = NULL;
+					continue;
+				}
+			}
 		}
 		}
 
 
 		/* Always set the buffer in the cache, even if it was
 		/* Always set the buffer in the cache, even if it was

+ 16 - 11
fs/ocfs2/buffer_head_io.h

@@ -31,21 +31,24 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
 			     int uptodate);
 			     int uptodate);
 
 
-static inline int ocfs2_read_block(struct inode	       *inode,
-				   u64                  off,
-				   struct buffer_head **bh);
-
 int ocfs2_write_block(struct ocfs2_super          *osb,
 int ocfs2_write_block(struct ocfs2_super          *osb,
 		      struct buffer_head  *bh,
 		      struct buffer_head  *bh,
 		      struct inode        *inode);
 		      struct inode        *inode);
-int ocfs2_read_blocks(struct inode	  *inode,
-		      u64                  block,
-		      int                  nr,
-		      struct buffer_head  *bhs[],
-		      int                  flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 			   unsigned int nr, struct buffer_head *bhs[]);
 			   unsigned int nr, struct buffer_head *bhs[]);
 
 
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      int (*validate)(struct super_block *sb,
+				      struct buffer_head *bh));
+
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh);
 				struct buffer_head *bh);
 
 
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD         8
 #define OCFS2_BH_READAHEAD         8
 
 
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-				   struct buffer_head **bh)
+				   struct buffer_head **bh,
+				   int (*validate)(struct super_block *sb,
+						   struct buffer_head *bh))
 {
 {
 	int status = 0;
 	int status = 0;
 
 
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
 		goto bail;
 		goto bail;
 	}
 	}
 
 
-	status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+	status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
 
 
 bail:
 bail:
 	return status;
 	return status;

+ 1 - 0
fs/ocfs2/cluster/masklog.c

@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(QUORUM),
 	define_mask(QUORUM),
 	define_mask(EXPORT),
 	define_mask(EXPORT),
 	define_mask(XATTR),
 	define_mask(XATTR),
+	define_mask(QUOTA),
 	define_mask(ERROR),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
 	define_mask(KTHREAD),

+ 1 - 0
fs/ocfs2/cluster/masklog.h

@@ -113,6 +113,7 @@
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
 /* bits that are infrequently given and frequently matched in the high word */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */

+ 314 - 85
fs/ocfs2/dir.c

@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 #include <cluster/masklog.h>
@@ -47,6 +48,7 @@
 #include "ocfs2.h"
 #include "ocfs2.h"
 
 
 #include "alloc.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 			       struct buffer_head **new_bh);
 
 
-static struct buffer_head *ocfs2_bread(struct inode *inode,
-				       int block, int *err, int reada)
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
 {
 {
-	struct buffer_head *bh = NULL;
-	int tmperr;
-	u64 p_blkno;
-	int readflags = 0;
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
 
 
-	if (reada)
-		readflags |= OCFS2_BH_READAHEAD;
+	return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+}
 
 
-	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-	    i_size_read(inode)) {
-		BUG_ON(!reada);
-		return NULL;
-	}
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+{
+	return ocfs2_meta_ecc(osb);
+}
 
 
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-					     NULL);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (tmperr < 0) {
-		mlog_errno(tmperr);
-		goto fail;
-	}
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+	return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
 
 
-	tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
-	if (tmperr < 0)
-		goto fail;
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
 
 
-	tmperr = 0;
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+							    void *data)
+{
+	char *p = data;
 
 
-	*err = 0;
-	return bh;
+	p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+	return (struct ocfs2_dir_block_trailer *)p;
+}
 
 
-fail:
-	brelse(bh);
-	bh = NULL;
+/*
+ * XXX: This is executed once on every dirent. We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+				  struct ocfs2_dir_entry *de,
+				  unsigned long offset,
+				  unsigned long blklen)
+{
+	unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
 
 
-	*err = -EIO;
-	return NULL;
+	if (!ocfs2_dir_has_trailer(dir))
+		return 0;
+
+	if (offset != toff)
+		return 0;
+
+	return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+				   struct buffer_head *bh)
+{
+	struct ocfs2_dir_block_trailer *trailer;
+
+	trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+	strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+	trailer->db_compat_rec_len =
+			cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+	trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
 }
 }
 
 
 /*
 /*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
 	struct ocfs2_dinode *di;
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 	struct ocfs2_inline_data *data;
 
 
-	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(dir, &di_bh);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -250,6 +277,108 @@ out:
 	return NULL;
 	return NULL;
 }
 }
 
 
+static int ocfs2_validate_dir_block(struct super_block *sb,
+				    struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(bh, sb);
+
+
+	/*
+	 * We don't validate dirents here, that's handled
+	 * in-place when the code walks them.
+	 */
+	mlog(0, "Validating dirblock %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 *
+	 * Note that we are safe to call this even if the directory
+	 * doesn't have a trailer.  Filesystems without metaecc will do
+	 * nothing, and filesystems with it will have one.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+	if (rc)
+		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	return rc;
+}
+
+/*
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+				struct buffer_head **bh, int flags)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+				    ocfs2_validate_dir_block);
+	if (rc) {
+		mlog_errno(rc);
+		goto out;
+	}
+
+	/*
+	 * We check the trailer here rather than in
+	 * ocfs2_validate_dir_block() because that function doesn't have
+	 * the inode to test.
+	 */
+	if (!(flags & OCFS2_BH_READAHEAD) &&
+	    ocfs2_dir_has_trailer(inode)) {
+		trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+		if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Invalid dirblock #%llu: "
+				    "signature = %.*s\n",
+				    (unsigned long long)tmp->b_blocknr, 7,
+				    trailer->db_signature);
+			goto out;
+		}
+		if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Directory block #%llu has an invalid "
+				    "db_blkno of %llu",
+				    (unsigned long long)tmp->b_blocknr,
+				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+			goto out;
+		}
+		if (le64_to_cpu(trailer->db_parent_dinode) !=
+		    OCFS2_I(inode)->ip_blkno) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Directory block #%llu on dinode "
+				    "#%llu has an invalid parent_dinode "
+				    "of %llu",
+				    (unsigned long long)tmp->b_blocknr,
+				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+			goto out;
+		}
+	}
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc ? -EIO : 0;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
 					       struct inode *dir,
 					       struct inode *dir,
 					       struct ocfs2_dir_entry **res_dir)
 					       struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
 				}
 				}
 				num++;
 				num++;
 
 
-				bh = ocfs2_bread(dir, b++, &err, 1);
+				bh = NULL;
+				err = ocfs2_read_dir_block(dir, b++, &bh,
+							   OCFS2_BH_READAHEAD);
 				bh_use[ra_max] = bh;
 				bh_use[ra_max] = bh;
 			}
 			}
 		}
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 			goto next;
 			goto next;
-		if (ocfs2_read_block(dir, block, &bh)) {
+		if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
 			/* read error, skip block & hope for the best.
 			/* read error, skip block & hope for the best.
-			 * ocfs2_read_block() has released the bh. */
+			 * ocfs2_read_dir_block() has released the bh. */
 			ocfs2_error(dir->i_sb, "reading directory %llu, "
 			ocfs2_error(dir->i_sb, "reading directory %llu, "
 				    "offset %lu\n",
 				    "offset %lu\n",
 				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
 				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
 		       struct inode *new_entry_inode)
 		       struct inode *new_entry_inode)
 {
 {
 	int ret;
 	int ret;
+	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
 
 	/*
 	/*
 	 * The same code works fine for both inline-data and extent
 	 * The same code works fine for both inline-data and extent
-	 * based directories, so no need to split this up.
+	 * based directories, so no need to split this up.  The only
+	 * difference is the journal_access function.
 	 */
 	 */
 
 
-	ret = ocfs2_journal_access(handle, dir, de_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		access = ocfs2_journal_access_di;
+
+	ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 {
 {
 	struct ocfs2_dir_entry *de, *pde;
 	struct ocfs2_dir_entry *de, *pde;
 	int i, status = -ENOENT;
 	int i, status = -ENOENT;
+	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
 
 	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
 	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
 
 
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		access = ocfs2_journal_access_di;
+
 	i = 0;
 	i = 0;
 	pde = NULL;
 	pde = NULL;
 	de = (struct ocfs2_dir_entry *) first_de;
 	de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 			goto bail;
 			goto bail;
 		}
 		}
 		if (de == de_del)  {
 		if (de == de_del)  {
-			status = ocfs2_journal_access(handle, dir, bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			status = access(handle, dir, bh,
+					OCFS2_JOURNAL_ACCESS_WRITE);
 			if (status < 0) {
 			if (status < 0) {
 				status = -EIO;
 				status = -EIO;
 				mlog_errno(status);
 				mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
 	struct ocfs2_dinode *di;
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 	struct ocfs2_inline_data *data;
 
 
-	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(dir, &di_bh);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
 			goto bail;
 			goto bail;
 		}
 		}
 
 
+		/* We're guaranteed that we should have space, so we
+		 * can't possibly have hit the trailer...right? */
+		mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+				"Hit dir trailer trying to insert %.*s "
+			        "(namelen %d) into directory %llu.  "
+				"offset is %lu, trailer offset is %d\n",
+				namelen, name, namelen,
+				(unsigned long long)parent_fe_bh->b_blocknr,
+				offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
 			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
 				goto bail;
 				goto bail;
 			}
 			}
 
 
-			status = ocfs2_journal_access(handle, dir, insert_bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (insert_bh == parent_fe_bh)
+				status = ocfs2_journal_access_di(handle, dir,
+								 insert_bh,
+								 OCFS2_JOURNAL_ACCESS_WRITE);
+			else
+				status = ocfs2_journal_access_db(handle, dir,
+								 insert_bh,
+								 OCFS2_JOURNAL_ACCESS_WRITE);
 			/* By now the buffer is marked for journaling */
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
 			if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
 			retval = 0;
 			retval = 0;
 			goto bail;
 			goto bail;
 		}
 		}
+
 		offset += le16_to_cpu(de->rec_len);
 		offset += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
 		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
 	}
 	}
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 	struct ocfs2_inline_data *data;
 	struct ocfs2_inline_data *data;
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *de;
 
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 	if (ret) {
 		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
 		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 	int i, stored;
 	int i, stored;
 	struct buffer_head * bh, * tmp;
 	struct buffer_head * bh, * tmp;
 	struct ocfs2_dir_entry * de;
 	struct ocfs2_dir_entry * de;
-	int err;
 	struct super_block * sb = inode->i_sb;
 	struct super_block * sb = inode->i_sb;
 	unsigned int ra_sectors = 16;
 	unsigned int ra_sectors = 16;
 
 
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 
 
 	while (!error && !stored && *f_pos < i_size_read(inode)) {
 	while (!error && !stored && *f_pos < i_size_read(inode)) {
 		blk = (*f_pos) >> sb->s_blocksize_bits;
 		blk = (*f_pos) >> sb->s_blocksize_bits;
-		bh = ocfs2_bread(inode, blk, &err, 0);
-		if (!bh) {
-			mlog(ML_ERROR,
-			     "directory #%llu contains a hole at offset %lld\n",
-			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			     *f_pos);
+		if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+			/* Skip the corrupt dirblock and keep trying */
 			*f_pos += sb->s_blocksize - offset;
 			*f_pos += sb->s_blocksize - offset;
 			continue;
 			continue;
 		}
 		}
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 		    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
 		    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
 			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
 			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
 			     i > 0; i--) {
 			     i > 0; i--) {
-				tmp = ocfs2_bread(inode, ++blk, &err, 1);
-				brelse(tmp);
+				tmp = NULL;
+				if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+							  OCFS2_BH_READAHEAD))
+					brelse(tmp);
 			}
 			}
 			last_ra_blk = blk;
 			last_ra_blk = blk;
 			ra_sectors = 8;
 			ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
 		}
 		}
 		offset = 0;
 		offset = 0;
 		brelse(bh);
 		brelse(bh);
+		bh = NULL;
 	}
 	}
 
 
 	stored = 0;
 	stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
 	return !priv.seen_other;
 	return !priv.seen_other;
 }
 }
 
 
-static void ocfs2_fill_initial_dirents(struct inode *inode,
-				       struct inode *parent,
-				       char *start, unsigned int size)
+/*
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+							  struct inode *parent,
+							  char *start,
+							  unsigned int size)
 {
 {
 	struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
 	struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
 
 
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
 	de->name_len = 2;
 	de->name_len = 2;
 	strcpy(de->name, "..");
 	strcpy(de->name, "..");
 	ocfs2_set_de_type(de, S_IFDIR);
 	ocfs2_set_de_type(de, S_IFDIR);
+
+	return de;
 }
 }
 
 
 /*
 /*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
 	struct ocfs2_inline_data *data = &di->id2.i_data;
 	struct ocfs2_inline_data *data = &di->id2.i_data;
 	unsigned int size = le16_to_cpu(data->id_count);
 	unsigned int size = le16_to_cpu(data->id_count);
 
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 				 struct ocfs2_alloc_context *data_ac)
 				 struct ocfs2_alloc_context *data_ac)
 {
 {
 	int status;
 	int status;
+	unsigned int size = osb->sb->s_blocksize;
 	struct buffer_head *new_bh = NULL;
 	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de;
 
 
 	mlog_entry_void();
 	mlog_entry_void();
 
 
+	if (ocfs2_supports_dir_trailer(osb))
+		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
 	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
 	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
 				     data_ac, NULL, &new_bh);
 				     data_ac, NULL, &new_bh);
 	if (status < 0) {
 	if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 
 	ocfs2_set_new_buffer_uptodate(inode, new_bh);
 	ocfs2_set_new_buffer_uptodate(inode, new_bh);
 
 
-	status = ocfs2_journal_access(handle, inode, new_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_db(handle, inode, new_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
 	}
 	}
 	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
 
-	ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
-				   osb->sb->s_blocksize);
+	de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+	if (ocfs2_supports_dir_trailer(osb))
+		ocfs2_init_dir_trailer(inode, new_bh);
 
 
 	status = ocfs2_journal_dirty(handle, new_bh);
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
 	if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 				     data_ac);
 				     data_ac);
 }
 }
 
 
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-				     unsigned int new_size)
+				     struct super_block *sb)
 {
 {
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *prev_de;
 	struct ocfs2_dir_entry *prev_de;
 	char *de_buf, *limit;
 	char *de_buf, *limit;
-	unsigned int bytes = new_size - old_size;
+	unsigned int new_size = sb->s_blocksize;
+	unsigned int bytes;
+
+	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+		new_size = ocfs2_dir_trailer_blk_off(sb);
+
+	bytes = new_size - old_size;
 
 
 	limit = start + old_size;
 	limit = start + old_size;
 	de_buf = start;
 	de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int blocks_wanted,
 				   unsigned int blocks_wanted,
 				   struct buffer_head **first_block_bh)
 				   struct buffer_head **first_block_bh)
 {
 {
-	int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
 	u32 alloc, bit_off, len;
 	u32 alloc, bit_off, len;
 	struct super_block *sb = dir->i_sb;
 	struct super_block *sb = dir->i_sb;
+	int ret, credits = ocfs2_inline_to_extents_credits(sb);
 	u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
 	u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(dir);
 	struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
 	struct ocfs2_extent_tree et;
+	int did_quota = 0;
 
 
 	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
 
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_sem;
 		goto out_sem;
 	}
 	}
 
 
+	if (vfs_dq_alloc_space_nodirty(dir,
+				ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+		ret = -EDQUOT;
+		goto out_commit;
+	}
+	did_quota = 1;
 	/*
 	/*
 	 * Try to claim as many clusters as the bitmap can give though
 	 * Try to claim as many clusters as the bitmap can give though
 	 * if we only get one now, that's enough to continue. The rest
 	 * if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
 
 	ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
 	ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
 
 
-	ret = ocfs2_journal_access(handle, dir, dirdata_bh,
-				   OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out_commit;
 		goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
 	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
 	memset(dirdata_bh->b_data + i_size_read(dir), 0,
 	memset(dirdata_bh->b_data + i_size_read(dir), 0,
 	       sb->s_blocksize - i_size_read(dir));
 	       sb->s_blocksize - i_size_read(dir));
-	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
-				 sb->s_blocksize);
+	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
+	if (ocfs2_supports_dir_trailer(osb))
+		ocfs2_init_dir_trailer(dir, dirdata_bh);
 
 
 	ret = ocfs2_journal_dirty(handle, dirdata_bh);
 	ret = ocfs2_journal_dirty(handle, dirdata_bh);
 	if (ret) {
 	if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * We let the later dirent insert modify c/mtime - to the user
 	 * We let the later dirent insert modify c/mtime - to the user
 	 * the data hasn't changed.
 	 * the data hasn't changed.
 	 */
 	 */
-	ret = ocfs2_journal_access(handle, dir, di_bh,
-				   OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out_commit;
 		goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	dirdata_bh = NULL;
 	dirdata_bh = NULL;
 
 
 out_commit:
 out_commit:
+	if (ret < 0 && did_quota)
+		vfs_dq_free_space_nodirty(dir,
+			ocfs2_clusters_to_bytes(osb->sb, 2));
 	ocfs2_commit_trans(osb, handle);
 	ocfs2_commit_trans(osb, handle);
 
 
 out_sem:
 out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct buffer_head **new_bh)
 			       struct buffer_head **new_bh)
 {
 {
 	int status;
 	int status;
-	int extend;
+	int extend, did_quota = 0;
 	u64 p_blkno, v_blkno;
 	u64 p_blkno, v_blkno;
 
 
 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	if (extend) {
 	if (extend) {
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
 
+		if (vfs_dq_alloc_space_nodirty(dir,
+					ocfs2_clusters_to_bytes(sb, 1))) {
+			status = -EDQUOT;
+			goto bail;
+		}
+		did_quota = 1;
+
 		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
 		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
 					      1, 0, parent_fe_bh, handle,
 					      1, 0, parent_fe_bh, handle,
 					      data_ac, meta_ac, NULL);
 					      data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	}
 	}
 	status = 0;
 	status = 0;
 bail:
 bail:
+	if (did_quota && status < 0)
+		vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
 	mlog_exit(status);
 	mlog_exit(status);
 	return status;
 	return status;
 }
 }
@@ -1569,16 +1771,22 @@ do_extend:
 
 
 	ocfs2_set_new_buffer_uptodate(dir, new_bh);
 	ocfs2_set_new_buffer_uptodate(dir, new_bh);
 
 
-	status = ocfs2_journal_access(handle, dir, new_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_db(handle, dir, new_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
 	}
 	}
 	memset(new_bh->b_data, 0, sb->s_blocksize);
 	memset(new_bh->b_data, 0, sb->s_blocksize);
+
 	de = (struct ocfs2_dir_entry *) new_bh->b_data;
 	de = (struct ocfs2_dir_entry *) new_bh->b_data;
 	de->inode = 0;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(sb->s_blocksize);
+	if (ocfs2_dir_has_trailer(dir)) {
+		de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+		ocfs2_init_dir_trailer(dir, new_bh);
+	} else {
+		de->rec_len = cpu_to_le16(sb->s_blocksize);
+	}
 	status = ocfs2_journal_dirty(handle, new_bh);
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int *blocks_wanted)
 				   unsigned int *blocks_wanted)
 {
 {
 	int ret;
 	int ret;
+	struct super_block *sb = dir->i_sb;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_dir_entry *de, *last_de = NULL;
 	struct ocfs2_dir_entry *de, *last_de = NULL;
 	char *de_buf, *limit;
 	char *de_buf, *limit;
 	unsigned long offset = 0;
 	unsigned long offset = 0;
-	unsigned int rec_len, new_rec_len;
+	unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+
+	/*
+	 * This calculates how many free bytes we'd have in block zero, should
+	 * this function force expansion to an extent tree.
+	 */
+	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+		free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+	else
+		free_space = dir->i_sb->s_blocksize - i_size_read(dir);
 
 
 	de_buf = di->id2.i_data.id_data;
 	de_buf = di->id2.i_data.id_data;
 	limit = de_buf + i_size_read(dir);
 	limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 			ret = -EEXIST;
 			ret = -EEXIST;
 			goto out;
 			goto out;
 		}
 		}
+		/*
+		 * No need to check for a trailing dirent record here as
+		 * they're not used for inline dirs.
+		 */
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			/* Ok, we found a spot. Return this bh and let
 			/* Ok, we found a spot. Return this bh and let
 			 * the caller actually fill it in. */
 			 * the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 	 * dirent can be found.
 	 * dirent can be found.
 	 */
 	 */
 	*blocks_wanted = 1;
 	*blocks_wanted = 1;
-	new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
+	new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
 	if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
 	if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
 		*blocks_wanted = 2;
 		*blocks_wanted = 2;
 
 
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *de;
 	struct super_block *sb = dir->i_sb;
 	struct super_block *sb = dir->i_sb;
 	int status;
 	int status;
+	int blocksize = dir->i_sb->s_blocksize;
 
 
-	bh = ocfs2_bread(dir, 0, &status, 0);
-	if (!bh) {
+	status = ocfs2_read_dir_block(dir, 0, &bh, 0);
+	if (status) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
 	}
 	}
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 				status = -ENOSPC;
 				status = -ENOSPC;
 				goto bail;
 				goto bail;
 			}
 			}
-			bh = ocfs2_bread(dir,
-					 offset >> sb->s_blocksize_bits,
-					 &status,
-					 0);
-			if (!bh) {
+			status = ocfs2_read_dir_block(dir,
+					     offset >> sb->s_blocksize_bits,
+					     &bh, 0);
+			if (status) {
 				mlog_errno(status);
 				mlog_errno(status);
 				goto bail;
 				goto bail;
 			}
 			}
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 			status = -EEXIST;
 			status = -EEXIST;
 			goto bail;
 			goto bail;
 		}
 		}
+
+		if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+					   blocksize))
+			goto next;
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			/* Ok, we found a spot. Return this bh and let
 			/* Ok, we found a spot. Return this bh and let
 			 * the caller actually fill it in. */
 			 * the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 			status = 0;
 			status = 0;
 			goto bail;
 			goto bail;
 		}
 		}
+next:
 		offset += le16_to_cpu(de->rec_len);
 		offset += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
 		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
 	}
 	}

+ 2 - 0
fs/ocfs2/dir.h

@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       struct buffer_head *fe_bh,
 		       struct buffer_head *fe_bh,
 		       struct ocfs2_alloc_context *data_ac);
 		       struct ocfs2_alloc_context *data_ac);
 
 
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+							    void *data);
 #endif /* OCFS2_DIR_H */
 #endif /* OCFS2_DIR_H */

+ 27 - 25
fs/ocfs2/dlm/dlmast.c

@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	struct list_head *iter, *head=NULL;
 	struct list_head *iter, *head=NULL;
 	u64 cookie;
 	u64 cookie;
 	u32 flags;
 	u32 flags;
+	u8 node;
 
 
 	if (!dlm_grab(dlm)) {
 	if (!dlm_grab(dlm)) {
 		dlm_error(DLM_REJECTED);
 		dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 
 	name = past->name;
 	name = past->name;
 	locklen = past->namelen;
 	locklen = past->namelen;
-	cookie = be64_to_cpu(past->cookie);
+	cookie = past->cookie;
 	flags = be32_to_cpu(past->flags);
 	flags = be32_to_cpu(past->flags);
+	node = past->node_idx;
 
 
 	if (locklen > DLM_LOCKID_NAME_MAX) {
 	if (locklen > DLM_LOCKID_NAME_MAX) {
 		ret = DLM_IVBUFLEN;
 		ret = DLM_IVBUFLEN;
-		mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+		mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+		     "handler!\n", locklen);
 		goto leave;
 		goto leave;
 	}
 	}
 
 
 	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
 	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
 	     (LKM_PUT_LVB|LKM_GET_LVB)) {
 	     (LKM_PUT_LVB|LKM_GET_LVB)) {
-		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+		     flags);
 		ret = DLM_BADARGS;
 		ret = DLM_BADARGS;
 		goto leave;
 		goto leave;
 	}
 	}
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	if (past->type != DLM_AST &&
 	if (past->type != DLM_AST &&
 	    past->type != DLM_BAST) {
 	    past->type != DLM_BAST) {
 		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
 		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
-		     "name=%.*s\n", past->type, 
-		     dlm_get_lock_cookie_node(cookie),
-		     dlm_get_lock_cookie_seq(cookie),
-		     locklen, name);
+		     "name=%.*s, node=%u\n", past->type,
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     locklen, name, node);
 		ret = DLM_IVLOCKID;
 		ret = DLM_IVLOCKID;
 		goto leave;
 		goto leave;
 	}
 	}
 
 
 	res = dlm_lookup_lockres(dlm, name, locklen);
 	res = dlm_lookup_lockres(dlm, name, locklen);
 	if (!res) {
 	if (!res) {
-		mlog(0, "got %sast for unknown lockres! "
-		     "cookie=%u:%llu, name=%.*s, namelen=%u\n",
-		     past->type == DLM_AST ? "" : "b",
-		     dlm_get_lock_cookie_node(cookie),
-		     dlm_get_lock_cookie_seq(cookie),
-		     locklen, name, locklen);
+		mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
+		     "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     locklen, name, node);
 		ret = DLM_IVLOCKID;
 		ret = DLM_IVLOCKID;
 		goto leave;
 		goto leave;
 	}
 	}
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 
 	spin_lock(&res->spinlock);
 	spin_lock(&res->spinlock);
 	if (res->state & DLM_LOCK_RES_RECOVERING) {
 	if (res->state & DLM_LOCK_RES_RECOVERING) {
-		mlog(0, "responding with DLM_RECOVERING!\n");
+		mlog(0, "Responding with DLM_RECOVERING!\n");
 		ret = DLM_RECOVERING;
 		ret = DLM_RECOVERING;
 		goto unlock_out;
 		goto unlock_out;
 	}
 	}
 	if (res->state & DLM_LOCK_RES_MIGRATING) {
 	if (res->state & DLM_LOCK_RES_MIGRATING) {
-		mlog(0, "responding with DLM_MIGRATING!\n");
+		mlog(0, "Responding with DLM_MIGRATING!\n");
 		ret = DLM_MIGRATING;
 		ret = DLM_MIGRATING;
 		goto unlock_out;
 		goto unlock_out;
 	}
 	}
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	lock = NULL;
 	lock = NULL;
 	list_for_each(iter, head) {
 	list_for_each(iter, head) {
 		lock = list_entry (iter, struct dlm_lock, list);
 		lock = list_entry (iter, struct dlm_lock, list);
-		if (be64_to_cpu(lock->ml.cookie) == cookie)
+		if (lock->ml.cookie == cookie)
 			goto do_ast;
 			goto do_ast;
 	}
 	}
 
 
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 
 	list_for_each(iter, head) {
 	list_for_each(iter, head) {
 		lock = list_entry (iter, struct dlm_lock, list);
 		lock = list_entry (iter, struct dlm_lock, list);
-		if (be64_to_cpu(lock->ml.cookie) == cookie)
+		if (lock->ml.cookie == cookie)
 			goto do_ast;
 			goto do_ast;
 	}
 	}
 
 
-	mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
-	     "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
-	     dlm_get_lock_cookie_node(cookie),
-	     dlm_get_lock_cookie_seq(cookie),
-	     locklen, name, locklen);
+	mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
+	     "node=%u\n", past->type == DLM_AST ? "" : "b",
+	     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+	     locklen, name, node);
 
 
 	ret = DLM_NORMAL;
 	ret = DLM_NORMAL;
 unlock_out:
 unlock_out:
@@ -383,8 +386,8 @@ do_ast:
 	if (past->type == DLM_AST) {
 	if (past->type == DLM_AST) {
 		/* do not alter lock refcount.  switching lists. */
 		/* do not alter lock refcount.  switching lists. */
 		list_move_tail(&lock->list, &res->granted);
 		list_move_tail(&lock->list, &res->granted);
-		mlog(0, "ast: adding to granted list... type=%d, "
-			  "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+		mlog(0, "ast: Adding to granted list... type=%d, "
+		     "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
 		if (lock->ml.convert_type != LKM_IVMODE) {
 		if (lock->ml.convert_type != LKM_IVMODE) {
 			lock->ml.type = lock->ml.convert_type;
 			lock->ml.type = lock->ml.convert_type;
 			lock->ml.convert_type = LKM_IVMODE;
 			lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
 		dlm_do_local_bast(dlm, res, lock, past->blocked_type);
 		dlm_do_local_bast(dlm, res, lock, past->blocked_type);
 
 
 leave:
 leave:
-
 	if (res)
 	if (res)
 		dlm_lockres_put(res);
 		dlm_lockres_put(res);
 
 

+ 3 - 0
fs/ocfs2/dlm/dlmcommon.h

@@ -140,6 +140,7 @@ struct dlm_ctxt
 	unsigned int purge_count;
 	unsigned int purge_count;
 	spinlock_t spinlock;
 	spinlock_t spinlock;
 	spinlock_t ast_lock;
 	spinlock_t ast_lock;
+	spinlock_t track_lock;
 	char *name;
 	char *name;
 	u8 node_num;
 	u8 node_num;
 	u32 key;
 	u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
 	 * put on a list for the dlm thread to run. */
 	 * put on a list for the dlm thread to run. */
 	unsigned long    last_used;
 	unsigned long    last_used;
 
 
+	struct dlm_ctxt *dlm;
+
 	unsigned migration_pending:1;
 	unsigned migration_pending:1;
 	atomic_t asts_reserved;
 	atomic_t asts_reserved;
 	spinlock_t spinlock;
 	spinlock_t spinlock;

+ 24 - 29
fs/ocfs2/dlm/dlmdebug.c

@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 {
 {
 	struct debug_lockres *dl = m->private;
 	struct debug_lockres *dl = m->private;
 	struct dlm_ctxt *dlm = dl->dl_ctxt;
 	struct dlm_ctxt *dlm = dl->dl_ctxt;
+	struct dlm_lock_resource *oldres = dl->dl_res;
 	struct dlm_lock_resource *res = NULL;
 	struct dlm_lock_resource *res = NULL;
+	struct list_head *track_list;
 
 
-	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->track_lock);
+	if (oldres)
+		track_list = &oldres->tracking;
+	else
+		track_list = &dlm->tracking_list;
 
 
-	if (dl->dl_res) {
-		list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
-			if (dl->dl_res) {
-				dlm_lockres_put(dl->dl_res);
-				dl->dl_res = NULL;
-			}
-			if (&res->tracking == &dlm->tracking_list) {
-				mlog(0, "End of list found, %p\n", res);
-				dl = NULL;
-				break;
-			}
+	list_for_each_entry(res, track_list, tracking) {
+		if (&res->tracking == &dlm->tracking_list)
+			res = NULL;
+		else
 			dlm_lockres_get(res);
 			dlm_lockres_get(res);
-			dl->dl_res = res;
-			break;
-		}
-	} else {
-		if (!list_empty(&dlm->tracking_list)) {
-			list_for_each_entry(res, &dlm->tracking_list, tracking)
-				break;
-			dlm_lockres_get(res);
-			dl->dl_res = res;
-		} else
-			dl = NULL;
+		break;
 	}
 	}
+	spin_unlock(&dlm->track_lock);
 
 
-	if (dl) {
-		spin_lock(&dl->dl_res->spinlock);
-		dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
-		spin_unlock(&dl->dl_res->spinlock);
-	}
+	if (oldres)
+		dlm_lockres_put(oldres);
 
 
-	spin_unlock(&dlm->spinlock);
+	dl->dl_res = res;
+
+	if (res) {
+		spin_lock(&res->spinlock);
+		dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+		spin_unlock(&res->spinlock);
+	} else
+		dl = NULL;
 
 
+	/* passed to seq_show */
 	return dl;
 	return dl;
 }
 }
 
 

+ 1 - 0
fs/ocfs2/dlm/dlmdomain.c

@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
 	spin_lock_init(&dlm->master_lock);
 	spin_lock_init(&dlm->ast_lock);
 	spin_lock_init(&dlm->ast_lock);
+	spin_lock_init(&dlm->track_lock);
 	INIT_LIST_HEAD(&dlm->list);
 	INIT_LIST_HEAD(&dlm->list);
 	INIT_LIST_HEAD(&dlm->dirty_list);
 	INIT_LIST_HEAD(&dlm->dirty_list);
 	INIT_LIST_HEAD(&dlm->reco.resources);
 	INIT_LIST_HEAD(&dlm->reco.resources);

+ 37 - 5
fs/ocfs2/dlm/dlmmaster.c

@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
 static void dlm_lockres_release(struct kref *kref)
 static void dlm_lockres_release(struct kref *kref)
 {
 {
 	struct dlm_lock_resource *res;
 	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm;
 
 
 	res = container_of(kref, struct dlm_lock_resource, refs);
 	res = container_of(kref, struct dlm_lock_resource, refs);
+	dlm = res->dlm;
 
 
 	/* This should not happen -- all lockres' have a name
 	/* This should not happen -- all lockres' have a name
 	 * associated with them at init time. */
 	 * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
 	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
 	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
 	     res->lockname.name);
 	     res->lockname.name);
 
 
+	spin_lock(&dlm->track_lock);
 	if (!list_empty(&res->tracking))
 	if (!list_empty(&res->tracking))
 		list_del_init(&res->tracking);
 		list_del_init(&res->tracking);
 	else {
 	else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
 		     res->lockname.len, res->lockname.name);
 		     res->lockname.len, res->lockname.name);
 		dlm_print_one_lock_resource(res);
 		dlm_print_one_lock_resource(res);
 	}
 	}
+	spin_unlock(&dlm->track_lock);
+
+	dlm_put(dlm);
 
 
 	if (!hlist_unhashed(&res->hash_node) ||
 	if (!hlist_unhashed(&res->hash_node) ||
 	    !list_empty(&res->granted) ||
 	    !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 	res->migration_pending = 0;
 	res->migration_pending = 0;
 	res->inflight_locks = 0;
 	res->inflight_locks = 0;
 
 
+	/* put in dlm_lockres_release */
+	dlm_grab(dlm);
+	res->dlm = dlm;
+
 	kref_init(&res->refs);
 	kref_init(&res->refs);
 
 
 	/* just for consistency */
 	/* just for consistency */
@@ -722,14 +732,21 @@ lookup:
 	if (tmpres) {
 	if (tmpres) {
 		int dropping_ref = 0;
 		int dropping_ref = 0;
 
 
+		spin_unlock(&dlm->spinlock);
+
 		spin_lock(&tmpres->spinlock);
 		spin_lock(&tmpres->spinlock);
+		/* We wait for the other thread that is mastering the resource */
+		if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			__dlm_wait_on_lockres(tmpres);
+			BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+		}
+
 		if (tmpres->owner == dlm->node_num) {
 		if (tmpres->owner == dlm->node_num) {
 			BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
 			BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
 			dlm_lockres_grab_inflight_ref(dlm, tmpres);
 			dlm_lockres_grab_inflight_ref(dlm, tmpres);
 		} else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
 		} else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
 			dropping_ref = 1;
 			dropping_ref = 1;
 		spin_unlock(&tmpres->spinlock);
 		spin_unlock(&tmpres->spinlock);
-		spin_unlock(&dlm->spinlock);
 
 
 		/* wait until done messaging the master, drop our ref to allow
 		/* wait until done messaging the master, drop our ref to allow
 		 * the lockres to be purged, start over. */
 		 * the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 				  struct dlm_node_iter *iter)
 				  struct dlm_node_iter *iter)
 {
 {
 	struct dlm_migrate_request migrate;
 	struct dlm_migrate_request migrate;
-	int ret, status = 0;
+	int ret, skip, status = 0;
 	int nodenum;
 	int nodenum;
 
 
 	memset(&migrate, 0, sizeof(migrate));
 	memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 		    nodenum == new_master)
 		    nodenum == new_master)
 			continue;
 			continue;
 
 
+		/* We could race exit domain. If exited, skip. */
+		spin_lock(&dlm->spinlock);
+		skip = (!test_bit(nodenum, dlm->domain_map));
+		spin_unlock(&dlm->spinlock);
+		if (skip) {
+			clear_bit(nodenum, iter->node_map);
+			continue;
+		}
+
 		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
 		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
 					 &migrate, sizeof(migrate), nodenum,
 					 &migrate, sizeof(migrate), nodenum,
 					 &status);
 					 &status);
-		if (ret < 0)
-			mlog_errno(ret);
-		else if (status < 0) {
+		if (ret < 0) {
+			mlog(0, "migrate_request returned %d!\n", ret);
+			if (!dlm_is_host_down(ret)) {
+				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+				BUG();
+			}
+			clear_bit(nodenum, iter->node_map);
+			ret = 0;
+		} else if (status < 0) {
 			mlog(0, "migrate request (node %u) returned %d!\n",
 			mlog(0, "migrate request (node %u) returned %d!\n",
 			     nodenum, status);
 			     nodenum, status);
 			ret = status;
 			ret = status;

+ 2 - 1
fs/ocfs2/dlm/dlmthread.c

@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 
 		spin_lock(&res->spinlock);
 		spin_lock(&res->spinlock);
 		/* This ensures that clear refmap is sent after the set */
 		/* This ensures that clear refmap is sent after the set */
-		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+		__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
+						  DLM_LOCK_RES_MIGRATING));
 		spin_unlock(&res->spinlock);
 		spin_unlock(&res->spinlock);
 
 
 		/* clear our bit from the master's refmap, ignore errors */
 		/* clear our bit from the master's refmap, ignore errors */

+ 154 - 14
fs/ocfs2/dlmglue.c

@@ -32,6 +32,7 @@
 #include <linux/debugfs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
 #include <linux/time.h>
+#include <linux/quotaops.h>
 
 
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "slot_map.h"
 #include "slot_map.h"
 #include "super.h"
 #include "super.h"
 #include "uptodate.h"
 #include "uptodate.h"
+#include "quota.h"
 
 
 #include "buffer_head_io.h"
 #include "buffer_head_io.h"
 
 
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
 
 
 /*
 /*
  * Return value from ->downconvert_worker functions.
  * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
 				     struct ocfs2_lock_res *lockres);
 				     struct ocfs2_lock_res *lockres);
 
 
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
 
 
 #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
 #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
 
 
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
 				     unsigned int line,
 				     unsigned int line,
 				     struct ocfs2_lock_res *lockres)
 				     struct ocfs2_lock_res *lockres)
 {
 {
-	struct ocfs2_meta_lvb *lvb =
-		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 
 	mlog(level, "LVB information for %s (called from %s:%u):\n",
 	mlog(level, "LVB information for %s (called from %s:%u):\n",
 	     lockres->l_name, function, line);
 	     lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
 	.flags		= 0,
 	.flags		= 0,
 };
 };
 
 
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+	.set_lvb	= ocfs2_set_qinfo_lvb,
+	.get_osb	= ocfs2_get_qinfo_osb,
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
 	return (struct ocfs2_dentry_lock *)lockres->l_priv;
 	return (struct ocfs2_dentry_lock *)lockres->l_priv;
 }
 }
 
 
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
+
+	return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
+
 static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
 static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
 {
 {
 	if (lockres->l_ops->get_osb)
 	if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
 	return OCFS2_SB(inode->i_sb);
 	return OCFS2_SB(inode->i_sb);
 }
 }
 
 
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_mem_dqinfo *info = lockres->l_priv;
+
+	return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
+
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
 {
 {
 	struct ocfs2_file_private *fp = lockres->l_priv;
 	struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 	lockres->l_flags |= OCFS2_LOCK_NOCACHE;
 	lockres->l_flags |= OCFS2_LOCK_NOCACHE;
 }
 }
 
 
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+			       struct ocfs2_mem_dqinfo *info)
+{
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
+			      0, lockres->l_name);
+	ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+				   OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+				   info);
+}
+
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
 {
 	mlog_entry_void();
 	mlog_entry_void();
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 
 
 	mlog_entry_void();
 	mlog_entry_void();
 
 
-	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 
 	/*
 	/*
 	 * Invalidate the LVB of a deleted inode - this way other
 	 * Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 
 
 	mlog_meta_lvb(0, lockres);
 	mlog_meta_lvb(0, lockres);
 
 
-	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 
 	/* We're safe here without the lockres lock... */
 	/* We're safe here without the lockres lock... */
 	spin_lock(&oi->ip_lock);
 	spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
 					      struct ocfs2_lock_res *lockres)
 					      struct ocfs2_lock_res *lockres)
 {
 {
-	struct ocfs2_meta_lvb *lvb =
-		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 
 	if (lvb->lvb_version == OCFS2_LVB_VERSION
 	if (lvb->lvb_version == OCFS2_LVB_VERSION
 	    && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
 	    && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 	} else {
 	} else {
 		/* Boo, we have to go to disk. */
 		/* Boo, we have to go to disk. */
 		/* read bh, cast, ocfs2_refresh_inode */
 		/* read bh, cast, ocfs2_refresh_inode */
-		status = ocfs2_read_block(inode, oi->ip_blkno, bh);
+		status = ocfs2_read_inode_block(inode, bh);
 		if (status < 0) {
 		if (status < 0) {
 			mlog_errno(status);
 			mlog_errno(status);
 			goto bail_refresh;
 			goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 		fe = (struct ocfs2_dinode *) (*bh)->b_data;
 		fe = (struct ocfs2_dinode *) (*bh)->b_data;
 
 
 		/* This is a good chance to make sure we're not
 		/* This is a good chance to make sure we're not
-		 * locking an invalid object.
+		 * locking an invalid object.  ocfs2_read_inode_block()
+		 * already checked that the inode block is sane.
 		 *
 		 *
 		 * We bug on a stale inode here because we checked
 		 * We bug on a stale inode here because we checked
 		 * above whether it was wiped from disk. The wiping
 		 * above whether it was wiped from disk. The wiping
 		 * node provides a guarantee that we receive that
 		 * node provides a guarantee that we receive that
 		 * message and can mark the inode before dropping any
 		 * message and can mark the inode before dropping any
 		 * locks associated with it. */
 		 * locks associated with it. */
-		if (!OCFS2_IS_VALID_DINODE(fe)) {
-			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-			status = -EIO;
-			goto bail_refresh;
-		}
 		mlog_bug_on_msg(inode->i_generation !=
 		mlog_bug_on_msg(inode->i_generation !=
 				le32_to_cpu(fe->i_generation),
 				le32_to_cpu(fe->i_generation),
 				"Invalid dinode %llu disk generation: %u "
 				"Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
 		return 0;
 		return 0;
 	}
 	}
 
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
+	status = ocfs2_read_inode_block(inode, ret_bh);
 	if (status < 0)
 	if (status < 0)
 		mlog_errno(status);
 		mlog_errno(status);
 
 
@@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 	return UNBLOCK_CONTINUE_POST;
 	return UNBLOCK_CONTINUE_POST;
 }
 }
 
 
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_qinfo_lvb *lvb;
+	struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+					    oinfo->dqi_gi.dqi_type);
+
+	mlog_entry_void();
+
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
+	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
+	lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+	lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+	lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+	lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+	lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
+
+	mlog_exit_void();
+}
+
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+	mlog_entry_void();
+	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+	mlog_exit_void();
+}
+
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{
+	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+					    oinfo->dqi_gi.dqi_type);
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_global_disk_dqinfo *gdinfo;
+	int status = 0;
+
+	if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+		info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+		info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+		oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+		oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+		oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+		oinfo->dqi_gi.dqi_free_entry =
+					be32_to_cpu(lvb->lvb_free_entry);
+	} else {
+		status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+		gdinfo = (struct ocfs2_global_disk_dqinfo *)
+					(bh->b_data + OCFS2_GLOBAL_INFO_OFF);
+		info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+		info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+		oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+		oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+		oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+		oinfo->dqi_gi.dqi_free_entry =
+					le32_to_cpu(gdinfo->dqi_free_entry);
+		brelse(bh);
+		ocfs2_track_lock_refresh(lockres);
+	}
+
+bail:
+	return status;
+}
+
+/* Lock quota info, this function expects at least shared lock on the quota file
+ * so that we can safely refresh quota info from disk. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	int status = 0;
+
+	mlog_entry_void();
+
+	/* On RO devices, locking really isn't needed... */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+	/* OK, we have the lock but we need to refresh the quota info */
+	status = ocfs2_refresh_qinfo(oinfo);
+	if (status)
+		ocfs2_qinfo_unlock(oinfo, ex);
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	mlog_exit(status);
+	return status;
+}
+
 /*
 /*
  * This is the filesystem locking protocol.  It provides the lock handling
  * This is the filesystem locking protocol.  It provides the lock handling
  * hooks for the underlying DLM.  It has a maximum version number.
  * hooks for the underlying DLM.  It has a maximum version number.

+ 19 - 0
fs/ocfs2/dlmglue.h

@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
 	__be32       lvb_reserved2;
 	__be32       lvb_reserved2;
 };
 };
 
 
+#define OCFS2_QINFO_LVB_VERSION 1
+
+struct ocfs2_qinfo_lvb {
+	__u8	lvb_version;
+	__u8	lvb_reserved[3];
+	__be32	lvb_bgrace;
+	__be32	lvb_igrace;
+	__be32	lvb_syncms;
+	__be32	lvb_blocks;
+	__be32	lvb_free_blk;
+	__be32	lvb_free_entry;
+};
+
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
 struct ocfs2_file_private;
 struct ocfs2_file_private;
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp);
 			      struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+                               struct ocfs2_mem_dqinfo *info);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
 void ocfs2_file_unlock(struct file *file);
 void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+
 
 
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,

+ 77 - 19
fs/ocfs2/extent_map.c

@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_list *el;
 
 
-	ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+	ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 	el = &eb->h_list;
 	el = &eb->h_list;
 
 
-	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-		ret = -EROFS;
-		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-		goto out;
-	}
-
 	if (el->l_tree_depth) {
 	if (el->l_tree_depth) {
 		ocfs2_error(inode->i_sb,
 		ocfs2_error(inode->i_sb,
 			    "Inode %lu has non zero tree depth in "
 			    "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 			goto no_more_extents;
 
 
-		ret = ocfs2_read_block(inode,
-				       le64_to_cpu(eb->h_next_leaf_blk),
-				       &next_eb_bh);
+		ret = ocfs2_read_extent_block(inode,
+					      le64_to_cpu(eb->h_next_leaf_blk),
+					      &next_eb_bh);
 		if (ret) {
 		if (ret) {
 			mlog_errno(ret);
 			mlog_errno(ret);
 			goto out;
 			goto out;
 		}
 		}
-		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
-			ret = -EROFS;
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
-			goto out;
-		}
 
 
+		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
 		el = &next_eb->h_list;
 		el = &next_eb->h_list;
-
 		i = ocfs2_search_for_hole_index(el, v_cluster);
 		i = ocfs2_search_for_hole_index(el, v_cluster);
 	}
 	}
 
 
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	if (ret == 0)
 	if (ret == 0)
 		goto out;
 		goto out;
 
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -819,3 +806,74 @@ out:
 
 
 	return ret;
 	return ret;
 }
 }
+
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+			   struct buffer_head *bhs[], int flags,
+			   int (*validate)(struct super_block *sb,
+					   struct buffer_head *bh))
+{
+	int rc = 0;
+	u64 p_block, p_count;
+	int i, count, done = 0;
+
+	mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
+		   "flags = %x, validate = %p)\n",
+		   inode, (unsigned long long)v_block, nr, bhs, flags,
+		   validate);
+
+	if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+		goto out;
+	}
+
+	while (done < nr) {
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+		rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+						 &p_block, &p_count, NULL);
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+
+		if (!p_block) {
+			rc = -EIO;
+			mlog(ML_ERROR,
+			     "Inode #%llu contains a hole at offset %llu\n",
+			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			     (unsigned long long)(v_block + done) <<
+			     inode->i_sb->s_blocksize_bits);
+			break;
+		}
+
+		count = nr - done;
+		if (p_count < count)
+			count = p_count;
+
+		/*
+		 * If the caller passed us bhs, they should have come
+		 * from a previous readahead call to this function.  Thus,
+		 * they should have the right b_blocknr.
+		 */
+		for (i = 0; i < count; i++) {
+			if (!bhs[done + i])
+				continue;
+			BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+		}
+
+		rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
+				       flags, validate);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+		done += count;
+	}
+
+out:
+	mlog_exit(rc);
+	return rc;
+}
+
+

+ 24 - 0
fs/ocfs2/extent_map.h

@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
 			     u32 *p_cluster, u32 *num_clusters,
 			     struct ocfs2_extent_list *el);
 			     struct ocfs2_extent_list *el);
 
 
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+			   struct buffer_head *bhs[], int flags,
+			   int (*validate)(struct super_block *sb,
+					   struct buffer_head *bh));
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+					struct buffer_head **bh,
+					int (*validate)(struct super_block *sb,
+							struct buffer_head *bh))
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
+
+bail:
+	return status;
+}
+
+
 #endif  /* _EXTENT_MAP_H */
 #endif  /* _EXTENT_MAP_H */

+ 96 - 113
fs/ocfs2/file.c

@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/mount.h>
 #include <linux/writeback.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
 #include <linux/falloc.h>
+#include <linux/quotaops.h>
 
 
 #define MLOG_MASK_PREFIX ML_INODE
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 #include <cluster/masklog.h>
@@ -56,6 +57,8 @@
 #include "suballoc.h"
 #include "suballoc.h"
 #include "super.h"
 #include "super.h"
 #include "xattr.h"
 #include "xattr.h"
+#include "acl.h"
+#include "quota.h"
 
 
 #include "buffer_head_io.h"
 #include "buffer_head_io.h"
 
 
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
 		goto out;
 		goto out;
 	}
 	}
 
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 	if (ret) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out_commit;
 		goto out_commit;
@@ -303,9 +306,9 @@ bail:
 	return status;
 	return status;
 }
 }
 
 
-static int ocfs2_simple_size_update(struct inode *inode,
-				    struct buffer_head *di_bh,
-				    u64 new_i_size)
+int ocfs2_simple_size_update(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
 {
 {
 	int ret;
 	int ret;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 		goto out;
 		goto out;
 	}
 	}
 
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto out_commit;
 		goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		   (unsigned long long)new_i_size);
 		   (unsigned long long)new_i_size);
 
 
+	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
+	 * already validated it */
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
 
 
 	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 			"Inode %llu, inode i_size = %lld != di "
 			"Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	enum ocfs2_alloc_restarted why;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
 	struct ocfs2_extent_tree et;
+	int did_quota = 0;
 
 
 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
 
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	 */
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+	status = ocfs2_read_inode_block(inode, &bh);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;
 	}
 	}
-
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		status = -EIO;
-		goto leave;
-	}
 
 
 restart_all:
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
 	}
 	}
 
 
 restarted_transaction:
 restarted_transaction:
+	if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
+	    clusters_to_add))) {
+		status = -EDQUOT;
+		goto leave;
+	}
+	did_quota = 1;
+
 	/* reserve a write to the file entry early on - that we if we
 	/* reserve a write to the file entry early on - that we if we
 	 * run out of credits in the allocation path, we can still
 	 * run out of credits in the allocation path, we can still
 	 * update i_size. */
 	 * update i_size. */
-	status = ocfs2_journal_access(handle, inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	/* Release unused quota reservation */
+	vfs_dq_free_space(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+	did_quota = 0;
 
 
 	if (why != RESTART_NONE && clusters_to_add) {
 	if (why != RESTART_NONE && clusters_to_add) {
 		if (why == RESTART_META) {
 		if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
 	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 
 
 leave:
 leave:
+	if (status < 0 && did_quota)
+		vfs_dq_free_space(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 	if (handle) {
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
 		handle = NULL;
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	struct buffer_head *bh = NULL;
 	struct buffer_head *bh = NULL;
 	handle_t *handle = NULL;
 	handle_t *handle = NULL;
+	int locked[MAXQUOTAS] = {0, 0};
+	int credits, qtype;
+	struct ocfs2_mem_dqinfo *oinfo;
 
 
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 	           dentry->d_name.len, dentry->d_name.name);
 	           dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 		}
 	}
 	}
 
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto bail_unlock;
+	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+		credits = OCFS2_INODE_UPDATE_CREDITS;
+		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+			oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0)
+				goto bail_unlock;
+			credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+				ocfs2_calc_qdel_credits(sb, USRQUOTA);
+			locked[USRQUOTA] = 1;
+		}
+		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+			oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0)
+				goto bail_unlock;
+			credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
+				   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
+			locked[GRPQUOTA] = 1;
+		}
+		handle = ocfs2_start_trans(osb, credits);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_unlock;
+		}
+		status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+		if (status < 0)
+			goto bail_commit;
+	} else {
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_unlock;
+		}
 	}
 	}
 
 
 	/*
 	/*
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
 bail_commit:
 	ocfs2_commit_trans(osb, handle);
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
 bail_unlock:
+	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+		if (!locked[qtype])
+			continue;
+		oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
+		ocfs2_unlock_global_qf(oinfo, 1);
+	}
 	ocfs2_inode_unlock(inode, 1);
 	ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
 bail_unlock_rw:
 	if (size_change)
 	if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
 bail:
 bail:
 	brelse(bh);
 	brelse(bh);
 
 
+	if (!status && attr->ia_valid & ATTR_MODE) {
+		status = ocfs2_acl_chmod(inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
 	mlog_exit(status);
 	mlog_exit(status);
 	return status;
 	return status;
 }
 }
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
 		goto out;
 		goto out;
 	}
 	}
 
 
-	ret = generic_permission(inode, mask, NULL);
+	ret = generic_permission(inode, mask, ocfs2_check_acl);
 
 
 	ocfs2_inode_unlock(inode, 0);
 	ocfs2_inode_unlock(inode, 0);
 out:
 out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 		goto out;
 		goto out;
 	}
 	}
 
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out_trans;
 		goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 {
 {
 	int ret;
 	int ret;
 	struct buffer_head *bh = NULL;
 	struct buffer_head *bh = NULL;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 
-	ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
+	ret = ocfs2_read_inode_block(inode, &bh);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 	struct buffer_head *di_bh = NULL;
 
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-				       &di_bh);
+		ret = ocfs2_read_inode_block(inode, &di_bh);
 		if (ret) {
 		if (ret) {
 			mlog_errno(ret);
 			mlog_errno(ret);
 			goto out;
 			goto out;
@@ -1226,83 +1284,6 @@ out:
 	return ret;
 	return ret;
 }
 }
 
 
-static int __ocfs2_remove_inode_range(struct inode *inode,
-				      struct buffer_head *di_bh,
-				      u32 cpos, u32 phys_cpos, u32 len,
-				      struct ocfs2_cached_dealloc_ctxt *dealloc)
-{
-	int ret;
-	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct inode *tl_inode = osb->osb_tl_inode;
-	handle_t *handle;
-	struct ocfs2_alloc_context *meta_ac = NULL;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	struct ocfs2_extent_tree et;
-
-	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
-
-	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
-	if (ret) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	mutex_lock(&tl_inode->i_mutex);
-
-	if (ocfs2_truncate_log_needs_flush(osb)) {
-		ret = __ocfs2_flush_truncate_log(osb);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-				  dealloc);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	OCFS2_I(inode)->ip_clusters -= len;
-	di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
-
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
-	if (ret)
-		mlog_errno(ret);
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-out:
-	mutex_unlock(&tl_inode->i_mutex);
-
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
-	return ret;
-}
-
 /*
 /*
  * Truncate a byte range, avoiding pages within partial clusters. This
  * Truncate a byte range, avoiding pages within partial clusters. This
  * preserves those pages for the zeroing code to write to.
  * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct address_space *mapping = inode->i_mapping;
 	struct address_space *mapping = inode->i_mapping;
+	struct ocfs2_extent_tree et;
 
 
+	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
 
 	if (byte_len == 0)
 	if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 
 
 		/* Only do work for non-holes */
 		/* Only do work for non-holes */
 		if (phys_cpos != 0) {
 		if (phys_cpos != 0) {
-			ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
-							 phys_cpos, alloc_size,
-							 &dealloc);
+			ret = ocfs2_remove_btree_range(inode, &et, cpos,
+						       phys_cpos, alloc_size,
+						       &dealloc);
 			if (ret) {
 			if (ret) {
 				mlog_errno(ret);
 				mlog_errno(ret);
 				goto out;
 				goto out;

+ 3 - 0
fs/ocfs2/file.h

@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 			 struct ocfs2_alloc_context *data_ac,
 			 struct ocfs2_alloc_context *data_ac,
 			 struct ocfs2_alloc_context *meta_ac,
 			 struct ocfs2_alloc_context *meta_ac,
 			 enum ocfs2_alloc_restarted *reason_ret);
 			 enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_simple_size_update(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
 			  u64 zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);

+ 128 - 47
fs/ocfs2/inode.c

@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 
 
 #include <asm/byteorder.h>
 #include <asm/byteorder.h>
 
 
@@ -37,6 +38,7 @@
 #include "ocfs2.h"
 #include "ocfs2.h"
 
 
 #include "alloc.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "extent_map.h"
 #include "file.h"
 #include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 	return 0;
 	return 0;
 }
 }
 
 
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-		     	 int create_ino)
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			  int create_ino)
 {
 {
 	struct super_block *sb;
 	struct super_block *sb;
 	struct ocfs2_super *osb;
 	struct ocfs2_super *osb;
-	int status = -EINVAL;
 	int use_plocks = 1;
 	int use_plocks = 1;
 
 
 	mlog_entry("(0x%p, size:%llu)\n", inode,
 	mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	    ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
 	    ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
 		use_plocks = 0;
 		use_plocks = 0;
 
 
-	/* this means that read_inode cannot create a superblock inode
-	 * today.  change if needed. */
-	if (!OCFS2_IS_VALID_DINODE(fe) ||
-	    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
-		mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
-		     "signature = %.*s, flags = 0x%x\n",
-		     inode->i_ino,
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature, le32_to_cpu(fe->i_flags));
-		goto bail;
-	}
+	/*
+	 * These have all been checked by ocfs2_read_inode_block() or set
+	 * by ocfs2_mknod_locked(), so a failure is a code bug.
+	 */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));  /* This means that read_inode
+						cannot create a superblock
+						inode today.  change if
+						that is needed. */
+	BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
+	BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
 
 
-	if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
-		mlog(ML_ERROR, "file entry generation does not match "
-		     "superblock! osb->fs_generation=%x, "
-		     "fe->i_fs_generation=%x\n",
-		     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
-		goto bail;
-	}
 
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
 	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 
 
-	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+		inode->i_flags |= S_NOQUOTA;
+	}
 
 
 	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
 	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
 		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
 		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+		inode->i_flags |= S_NOQUOTA;
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
 		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
 		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
 		/* we can't actually hit this as read_inode can't
 		/* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 
 	ocfs2_set_inode_flags(inode);
 	ocfs2_set_inode_flags(inode);
 
 
-	status = 0;
-bail:
-	mlog_exit(status);
-	return status;
+	mlog_exit_void();
 }
 }
 
 
 static int ocfs2_read_locked_inode(struct inode *inode,
 static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		}
 		}
 	}
 	}
 
 
-	if (can_lock)
-		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE);
-	else
+	if (can_lock) {
+		status = ocfs2_read_inode_block_full(inode, &bh,
+						     OCFS2_BH_IGNORE_CACHE);
+	} else {
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+		if (!status)
+			status = ocfs2_validate_inode_block(osb->sb, bh);
+	}
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 
 
 	status = -EINVAL;
 	status = -EINVAL;
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)args->fi_blkno, 7,
-		     fe->i_signature);
-		goto bail;
-	}
 
 
 	/*
 	/*
 	 * This is a code bug. Right now the caller needs to
 	 * This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 
 
 	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
 	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
 	    S_ISBLK(le16_to_cpu(fe->i_mode)))
 	    S_ISBLK(le16_to_cpu(fe->i_mode)))
-    		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
 
 
-	if (ocfs2_populate_inode(inode, fe, 0) < 0)
-		goto bail;
+	ocfs2_populate_inode(inode, fe, 0);
 
 
 	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
 	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
 
 
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 			goto out;
 			goto out;
 		}
 		}
 
 
-		status = ocfs2_journal_access(handle, inode, fe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, inode, fe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 		if (status < 0) {
 			mlog_errno(status);
 			mlog_errno(status);
 			goto out;
 			goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail;
 		goto bail;
 	}
 	}
 
 
-	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS);
+	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+					ocfs2_quota_trans_credits(inode->i_sb));
 	if (IS_ERR(handle)) {
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		status = PTR_ERR(handle);
 		mlog_errno(status);
 		mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 	}
 
 
 	/* set the inodes dtime */
 	/* set the inodes dtime */
-	status = ocfs2_journal_access(handle, inode, di_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, di_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail_commit;
 		goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 	}
 
 
 	ocfs2_remove_from_cache(inode, di_bh);
 	ocfs2_remove_from_cache(inode, di_bh);
+	vfs_dq_free_inode(inode);
 
 
 	status = ocfs2_free_dinode(handle, inode_alloc_inode,
 	status = ocfs2_free_dinode(handle, inode_alloc_inode,
 				   inode_alloc_bh, di);
 				   inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
 
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
 
-	if (is_bad_inode(inode)) {
+	/* When we fail in read_inode() we mark inode as bad. The second test
+	 * catches the case when inode allocation fails before allocating
+	 * a block for inode. */
+	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
 		mlog(0, "Skipping delete of bad inode\n");
 		mlog(0, "Skipping delete of bad inode\n");
 		goto bail;
 		goto bail;
 	}
 	}
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	mlog_entry("(inode %llu)\n",
 	mlog_entry("(inode %llu)\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 
-	status = ocfs2_journal_access(handle, inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
 
 
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 }
 }
+
+int ocfs2_validate_inode_block(struct super_block *sb,
+			       struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	mlog(0, "Validating dinode %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		goto bail;
+	}
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	rc = -EINVAL;
+
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    di->i_signature);
+		goto bail;
+	}
+
+	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(di->i_blkno));
+		goto bail;
+	}
+
+	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		ocfs2_error(sb,
+			    "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+			    (unsigned long long)bh->b_blocknr);
+		goto bail;
+	}
+
+	if (le32_to_cpu(di->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Invalid dinode #%llu: fs_generation is %u\n",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(di->i_fs_generation));
+		goto bail;
+	}
+
+	rc = 0;
+
+bail:
+	return rc;
+}
+
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+				int flags)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
+			       flags, ocfs2_validate_inode_block);
+
+	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{
+	return ocfs2_read_inode_block_full(inode, bh, 0);
+}

+ 16 - 2
fs/ocfs2/inode.h

@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
 			 int sysfile_type);
 			 int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
 int ocfs2_inode_revalidate(struct dentry *dentry);
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-			 int create_ino);
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			  int create_ino);
 void ocfs2_read_inode(struct inode *inode);
 void ocfs2_read_inode(struct inode *inode);
 void ocfs2_read_inode2(struct inode *inode, void *opaque);
 void ocfs2_read_inode2(struct inode *inode, void *opaque);
 ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
 ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 			   struct buffer_head *bh);
 			   struct buffer_head *bh);
 int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada);
 
 
 void ocfs2_set_inode_flags(struct inode *inode);
 void ocfs2_set_inode_flags(struct inode *inode);
 void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
 void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
 	return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
 	return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
 }
 }
 
 
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+			       struct buffer_head *bh);
+/*
+ * Read an inode block into *bh.  If *bh is NULL, a bh will be allocated.
+ * This is a cached read.  The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+				int flags);
 #endif /* OCFS2_INODE_H */
 #endif /* OCFS2_INODE_H */

+ 300 - 64
fs/ocfs2/journal.c

@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 #include "ocfs2.h"
 
 
 #include "alloc.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "extent_map.h"
@@ -45,6 +46,7 @@
 #include "slot_map.h"
 #include "slot_map.h"
 #include "super.h"
 #include "super.h"
 #include "sysfile.h"
 #include "sysfile.h"
+#include "quota.h"
 
 
 #include "buffer_head_io.h"
 #include "buffer_head_io.h"
 
 
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
 
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-			      int node_num);
+			      int node_num, int slot_num);
 static int __ocfs2_recovery_thread(void *arg);
 static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 				      int dirty, int replayed);
 				      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
 static int ocfs2_commit_thread(void *arg);
 
 
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+	return __ocfs2_wait_on_mount(osb, 0);
+}
+
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{
+	return __ocfs2_wait_on_mount(osb, 1);
+}
+
+
 
 
 /*
 /*
  * The recovery_list is a simple linked list of node numbers to recover.
  * The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
 	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
 	BUG_ON(max_buffs <= 0);
 	BUG_ON(max_buffs <= 0);
 
 
-	/* JBD might support this, but our journalling code doesn't yet. */
-	if (journal_current_handle()) {
-		mlog(ML_ERROR, "Recursive transaction attempted!\n");
-		BUG();
-	}
+	/* Nested transaction? Just return the handle... */
+	if (journal_current_handle())
+		return jbd2_journal_start(journal, max_buffs);
 
 
 	down_read(&osb->journal->j_trans_barrier);
 	down_read(&osb->journal->j_trans_barrier);
 
 
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 int ocfs2_commit_trans(struct ocfs2_super *osb,
 int ocfs2_commit_trans(struct ocfs2_super *osb,
 		       handle_t *handle)
 		       handle_t *handle)
 {
 {
-	int ret;
+	int ret, nested;
 	struct ocfs2_journal *journal = osb->journal;
 	struct ocfs2_journal *journal = osb->journal;
 
 
 	BUG_ON(!handle);
 	BUG_ON(!handle);
 
 
+	nested = handle->h_ref > 1;
 	ret = jbd2_journal_stop(handle);
 	ret = jbd2_journal_stop(handle);
 	if (ret < 0)
 	if (ret < 0)
 		mlog_errno(ret);
 		mlog_errno(ret);
 
 
-	up_read(&journal->j_trans_barrier);
+	if (!nested)
+		up_read(&journal->j_trans_barrier);
 
 
 	return ret;
 	return ret;
 }
 }
@@ -357,10 +370,137 @@ bail:
 	return status;
 	return status;
 }
 }
 
 
-int ocfs2_journal_access(handle_t *handle,
-			 struct inode *inode,
-			 struct buffer_head *bh,
-			 int type)
+struct ocfs2_triggers {
+	struct jbd2_buffer_trigger_type	ot_triggers;
+	int				ot_offset;
+};
+
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{
+	return container_of(triggers, struct ocfs2_triggers, ot_triggers);
+}
+
+static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, data + ot->ot_offset);
+}
+
+/*
+ * Quota blocks have their own trigger because the struct ocfs2_block_check
+ * offset depends on the blocksize.
+ */
+static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_disk_dqtrailer *dqt =
+		ocfs2_block_dqtrailer(size, data);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, &dqt->dq_check);
+}
+
+/*
+ * Directory blocks also have their own trigger because the
+ * struct ocfs2_block_check offset depends on the blocksize.
+ */
+static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_dir_trailer_from_size(size, data);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, &trailer->db_check);
+}
+
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+				struct buffer_head *bh)
+{
+	mlog(ML_ERROR,
+	     "ocfs2_abort_trigger called by JBD2.  bh = 0x%lx, "
+	     "bh->b_blocknr = %llu\n",
+	     (unsigned long)bh,
+	     (unsigned long long)bh->b_blocknr);
+
+	/* We aren't guaranteed to have the superblock here - but if we
+	 * don't, it'll just crash. */
+	ocfs2_error(bh->b_assoc_map->host->i_sb,
+		    "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+
+static struct ocfs2_triggers di_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dinode, i_check),
+};
+
+static struct ocfs2_triggers eb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
+};
+
+static struct ocfs2_triggers gd_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_group_desc, bg_check),
+};
+
+static struct ocfs2_triggers db_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_db_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+};
+
+static struct ocfs2_triggers xb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_xattr_block, xb_check),
+};
+
+static struct ocfs2_triggers dq_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_dq_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+};
+
+static int __ocfs2_journal_access(handle_t *handle,
+				  struct inode *inode,
+				  struct buffer_head *bh,
+				  struct ocfs2_triggers *triggers,
+				  int type)
 {
 {
 	int status;
 	int status;
 
 
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
 		status = -EINVAL;
 		status = -EINVAL;
 		mlog(ML_ERROR, "Uknown access type!\n");
 		mlog(ML_ERROR, "Uknown access type!\n");
 	}
 	}
+	if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+		jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
 	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 
 	if (status < 0)
 	if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
 	return status;
 	return status;
 }
 }
 
 
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+			       struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
+				      type);
+}
+
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+}
+
 int ocfs2_journal_dirty(handle_t *handle,
 int ocfs2_journal_dirty(handle_t *handle,
 			struct buffer_head *bh)
 			struct buffer_head *bh)
 {
 {
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
 	return status;
 	return status;
 }
 }
 
 
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
-			     struct buffer_head *bh)
-{
-	int err = journal_dirty_data(handle, bh);
-	if (err)
-		mlog_errno(err);
-	/* TODO: When we can handle it, abort the handle and go RO on
-	 * error here. */
-
-	return err;
-}
-#endif
-
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 
 
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	mlog_entry_void();
 	mlog_entry_void();
 
 
 	fe = (struct ocfs2_dinode *)bh->b_data;
 	fe = (struct ocfs2_dinode *)bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		/* This is called from startup/shutdown which will
-		 * handle the errors in a specific manner, so no need
-		 * to call ocfs2_error() here. */
-		mlog(ML_ERROR, "Journal dinode %llu  has invalid "
-		     "signature: %.*s",
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature);
-		status = -EIO;
-		goto out;
-	}
+
+	/* The journal bh on the osb always comes from ocfs2_journal_init()
+	 * and was validated there inside ocfs2_inode_lock_full().  It's a
+	 * code bug if we mess it up. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 
 
 	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
 	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
 	if (dirty)
 	if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	if (replayed)
 	if (replayed)
 		ocfs2_bump_recovery_generation(fe);
 		ocfs2_bump_recovery_generation(fe);
 
 
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
 	status = ocfs2_write_block(osb, bh, journal->j_inode);
 	status = ocfs2_write_block(osb, bh, journal->j_inode);
 	if (status < 0)
 	if (status < 0)
 		mlog_errno(status);
 		mlog_errno(status);
 
 
-out:
 	mlog_exit(status);
 	mlog_exit(status);
 	return status;
 	return status;
 }
 }
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
 	int			lri_slot;
 	int			lri_slot;
 	struct ocfs2_dinode	*lri_la_dinode;
 	struct ocfs2_dinode	*lri_la_dinode;
 	struct ocfs2_dinode	*lri_tl_dinode;
 	struct ocfs2_dinode	*lri_tl_dinode;
+	struct ocfs2_quota_recovery *lri_qrec;
 };
 };
 
 
 /* Does the second half of the recovery process. By this point, the
 /* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	struct ocfs2_super *osb = journal->j_osb;
 	struct ocfs2_super *osb = journal->j_osb;
 	struct ocfs2_dinode *la_dinode, *tl_dinode;
 	struct ocfs2_dinode *la_dinode, *tl_dinode;
 	struct ocfs2_la_recovery_item *item, *n;
 	struct ocfs2_la_recovery_item *item, *n;
+	struct ocfs2_quota_recovery *qrec;
 	LIST_HEAD(tmp_la_list);
 	LIST_HEAD(tmp_la_list);
 
 
 	mlog_entry_void();
 	mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 
 
 		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
 		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
 
 
+		ocfs2_wait_on_quotas(osb);
+
 		la_dinode = item->lri_la_dinode;
 		la_dinode = item->lri_la_dinode;
 		if (la_dinode) {
 		if (la_dinode) {
 			mlog(0, "Clean up local alloc %llu\n",
 			mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		if (ret < 0)
 		if (ret < 0)
 			mlog_errno(ret);
 			mlog_errno(ret);
 
 
+		qrec = item->lri_qrec;
+		if (qrec) {
+			mlog(0, "Recovering quota files");
+			ret = ocfs2_finish_quota_recovery(osb, qrec,
+							  item->lri_slot);
+			if (ret < 0)
+				mlog_errno(ret);
+			/* Recovery info is already freed now */
+		}
+
 		kfree(item);
 		kfree(item);
 	}
 	}
 
 
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 					    int slot_num,
 					    int slot_num,
 					    struct ocfs2_dinode *la_dinode,
 					    struct ocfs2_dinode *la_dinode,
-					    struct ocfs2_dinode *tl_dinode)
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec)
 {
 {
 	struct ocfs2_la_recovery_item *item;
 	struct ocfs2_la_recovery_item *item;
 
 
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 		if (tl_dinode)
 		if (tl_dinode)
 			kfree(tl_dinode);
 			kfree(tl_dinode);
 
 
+		if (qrec)
+			ocfs2_free_quota_recovery(qrec);
+
 		mlog_errno(-ENOMEM);
 		mlog_errno(-ENOMEM);
 		return;
 		return;
 	}
 	}
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 	item->lri_la_dinode = la_dinode;
 	item->lri_la_dinode = la_dinode;
 	item->lri_slot = slot_num;
 	item->lri_slot = slot_num;
 	item->lri_tl_dinode = tl_dinode;
 	item->lri_tl_dinode = tl_dinode;
+	item->lri_qrec = qrec;
 
 
 	spin_lock(&journal->j_lock);
 	spin_lock(&journal->j_lock);
 	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
 	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 		ocfs2_queue_recovery_completion(journal,
 		ocfs2_queue_recovery_completion(journal,
 						osb->slot_num,
 						osb->slot_num,
 						osb->local_alloc_copy,
 						osb->local_alloc_copy,
+						NULL,
 						NULL);
 						NULL);
 		ocfs2_schedule_truncate_log_flush(osb, 0);
 		ocfs2_schedule_truncate_log_flush(osb, 0);
 
 
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 	}
 	}
 }
 }
 
 
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+	if (osb->quota_rec) {
+		ocfs2_queue_recovery_completion(osb->journal,
+						osb->slot_num,
+						NULL,
+						NULL,
+						osb->quota_rec);
+		osb->quota_rec = NULL;
+	}
+}
+
 static int __ocfs2_recovery_thread(void *arg)
 static int __ocfs2_recovery_thread(void *arg)
 {
 {
-	int status, node_num;
+	int status, node_num, slot_num;
 	struct ocfs2_super *osb = arg;
 	struct ocfs2_super *osb = arg;
 	struct ocfs2_recovery_map *rm = osb->recovery_map;
 	struct ocfs2_recovery_map *rm = osb->recovery_map;
+	int *rm_quota = NULL;
+	int rm_quota_used = 0, i;
+	struct ocfs2_quota_recovery *qrec;
 
 
 	mlog_entry_void();
 	mlog_entry_void();
 
 
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
 		goto bail;
 		goto bail;
 	}
 	}
 
 
+	rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+	if (!rm_quota) {
+		status = -ENOMEM;
+		goto bail;
+	}
 restart:
 restart:
 	status = ocfs2_super_lock(osb, 1);
 	status = ocfs2_super_lock(osb, 1);
 	if (status < 0) {
 	if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
 		 * clear it until ocfs2_recover_node() has succeeded. */
 		 * clear it until ocfs2_recover_node() has succeeded. */
 		node_num = rm->rm_entries[0];
 		node_num = rm->rm_entries[0];
 		spin_unlock(&osb->osb_lock);
 		spin_unlock(&osb->osb_lock);
-
-		status = ocfs2_recover_node(osb, node_num);
+		mlog(0, "checking node %d\n", node_num);
+		slot_num = ocfs2_node_num_to_slot(osb, node_num);
+		if (slot_num == -ENOENT) {
+			status = 0;
+			mlog(0, "no slot for this node, so no recovery"
+			     "required.\n");
+			goto skip_recovery;
+		}
+		mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+
+		/* It is a bit subtle with quota recovery. We cannot do it
+		 * immediately because we have to obtain cluster locks from
+		 * quota files and we also don't want to just skip it because
+		 * then quota usage would be out of sync until some node takes
+		 * the slot. So we remember which nodes need quota recovery
+		 * and when everything else is done, we recover quotas. */
+		for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
+		if (i == rm_quota_used)
+			rm_quota[rm_quota_used++] = slot_num;
+
+		status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
 		if (!status) {
 		if (!status) {
 			ocfs2_recovery_map_clear(osb, node_num);
 			ocfs2_recovery_map_clear(osb, node_num);
 		} else {
 		} else {
@@ -1055,13 +1285,27 @@ restart:
 	if (status < 0)
 	if (status < 0)
 		mlog_errno(status);
 		mlog_errno(status);
 
 
+	/* Now it is right time to recover quotas... We have to do this under
+	 * superblock lock so that noone can start using the slot (and crash)
+	 * before we recover it */
+	for (i = 0; i < rm_quota_used; i++) {
+		qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+		if (IS_ERR(qrec)) {
+			status = PTR_ERR(qrec);
+			mlog_errno(status);
+			continue;
+		}
+		ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+						NULL, NULL, qrec);
+	}
+
 	ocfs2_super_unlock(osb, 1);
 	ocfs2_super_unlock(osb, 1);
 
 
 	/* We always run recovery on our own orphan dir - the dead
 	/* We always run recovery on our own orphan dir - the dead
 	 * node(s) may have disallowd a previos inode delete. Re-processing
 	 * node(s) may have disallowd a previos inode delete. Re-processing
 	 * is therefore required. */
 	 * is therefore required. */
 	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
 	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-					NULL);
+					NULL, NULL);
 
 
 bail:
 bail:
 	mutex_lock(&osb->recovery_lock);
 	mutex_lock(&osb->recovery_lock);
@@ -1076,6 +1320,9 @@ bail:
 
 
 	mutex_unlock(&osb->recovery_lock);
 	mutex_unlock(&osb->recovery_lock);
 
 
+	if (rm_quota)
+		kfree(rm_quota);
+
 	mlog_exit(status);
 	mlog_exit(status);
 	/* no one is callint kthread_stop() for us so the kthread() api
 	/* no one is callint kthread_stop() for us so the kthread() api
 	 * requires that we call do_exit().  And it isn't exported, but
 	 * requires that we call do_exit().  And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
 	}
 	}
 	SET_INODE_JOURNAL(inode);
 	SET_INODE_JOURNAL(inode);
 
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
-				   OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	osb->slot_recovery_generations[slot_num] =
 	osb->slot_recovery_generations[slot_num] =
 					ocfs2_get_recovery_generation(fe);
 					ocfs2_get_recovery_generation(fe);
 
 
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
 	status = ocfs2_write_block(osb, bh, inode);
 	status = ocfs2_write_block(osb, bh, inode);
 	if (status < 0)
 	if (status < 0)
 		mlog_errno(status);
 		mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
  * far less concerning.
  * far less concerning.
  */
  */
 static int ocfs2_recover_node(struct ocfs2_super *osb,
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-			      int node_num)
+			      int node_num, int slot_num)
 {
 {
 	int status = 0;
 	int status = 0;
-	int slot_num;
 	struct ocfs2_dinode *la_copy = NULL;
 	struct ocfs2_dinode *la_copy = NULL;
 	struct ocfs2_dinode *tl_copy = NULL;
 	struct ocfs2_dinode *tl_copy = NULL;
 
 
-	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
-		   node_num, osb->node_num);
-
-	mlog(0, "checking node %d\n", node_num);
+	mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
+		   node_num, slot_num, osb->node_num);
 
 
 	/* Should not ever be called to recover ourselves -- in that
 	/* Should not ever be called to recover ourselves -- in that
 	 * case we should've called ocfs2_journal_load instead. */
 	 * case we should've called ocfs2_journal_load instead. */
 	BUG_ON(osb->node_num == node_num);
 	BUG_ON(osb->node_num == node_num);
 
 
-	slot_num = ocfs2_node_num_to_slot(osb, node_num);
-	if (slot_num == -ENOENT) {
-		status = 0;
-		mlog(0, "no slot for this node, so no recovery required.\n");
-		goto done;
-	}
-
-	mlog(0, "node %d was using slot %d\n", node_num, slot_num);
-
 	status = ocfs2_replay_journal(osb, node_num, slot_num);
 	status = ocfs2_replay_journal(osb, node_num, slot_num);
 	if (status < 0) {
 	if (status < 0) {
 		if (status == -EBUSY) {
 		if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 
 	/* This will kfree the memory pointed to by la_copy and tl_copy */
 	/* This will kfree the memory pointed to by la_copy and tl_copy */
 	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
 	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
-					tl_copy);
+					tl_copy, NULL);
 
 
 	status = 0;
 	status = 0;
 done:
 done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	return ret;
 	return ret;
 }
 }
 
 
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
 {
 {
 	/* This check is good because ocfs2 will wait on our recovery
 	/* This check is good because ocfs2 will wait on our recovery
 	 * thread before changing it to something other than MOUNTED
 	 * thread before changing it to something other than MOUNTED
 	 * or DISABLED. */
 	 * or DISABLED. */
 	wait_event(osb->osb_mount_event,
 	wait_event(osb->osb_mount_event,
-		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+		  (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
 		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
 		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
 
 
 	/* If there's an error on mount, then we may never get to the
 	/* If there's an error on mount, then we may never get to the

+ 98 - 30
fs/ocfs2/journal.h

@@ -27,12 +27,7 @@
 #define OCFS2_JOURNAL_H
 #define OCFS2_JOURNAL_H
 
 
 #include <linux/fs.h>
 #include <linux/fs.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/jbd2.h>
 
 
 enum ocfs2_journal_state {
 enum ocfs2_journal_state {
 	OCFS2_JOURNAL_FREE = 0,
 	OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void   ocfs2_recovery_thread(struct ocfs2_super *osb,
 			     int node_num);
 			     int node_num);
 int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
 int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
 void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
 void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
 
 
 static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
 static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
 {
 {
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
  *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
  *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
  *                          commit the handle to disk in the process, but will
  *                          commit the handle to disk in the process, but will
  *                          not release any locks taken during the transaction.
  *                          not release any locks taken during the transaction.
- *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *  ocfs2_journal_access* - Notify the handle that we want to journal this
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
  *                          we've actually dirtied it. Type is one of . or .
+ *                          Always call the specific flavor of
+ *                          ocfs2_journal_access_*() unless you intend to
+ *                          manage the checksum by hand.
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
  *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
  *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
  *                           the current handle commits.
  *                           the current handle commits.
@@ -248,10 +247,29 @@ int			     ocfs2_extend_trans(handle_t *handle, int nblocks);
 #define OCFS2_JOURNAL_ACCESS_WRITE  1
 #define OCFS2_JOURNAL_ACCESS_WRITE  1
 #define OCFS2_JOURNAL_ACCESS_UNDO   2
 #define OCFS2_JOURNAL_ACCESS_UNDO   2
 
 
-int                  ocfs2_journal_access(handle_t *handle,
-					  struct inode *inode,
-					  struct buffer_head *bh,
-					  int type);
+
+/* ocfs2_inode */
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh, int type);
+
 /*
 /*
  * A word about the journal_access/journal_dirty "dance". It is
  * A word about the journal_access/journal_dirty "dance". It is
  * entirely legal to journal_access a buffer more than once (as long
  * entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int                  ocfs2_journal_access(handle_t *handle,
  */
  */
 int                  ocfs2_journal_dirty(handle_t *handle,
 int                  ocfs2_journal_dirty(handle_t *handle,
 					 struct buffer_head *bh);
 					 struct buffer_head *bh);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int                  ocfs2_journal_dirty_data(handle_t *handle,
-					      struct buffer_head *bh);
-#endif
 
 
 /*
 /*
  *  Credit Macros:
  *  Credit Macros:
@@ -293,6 +307,37 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* extended attribute block update */
 /* extended attribute block update */
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
 
 
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to set_info() quotactl so make them prepared for the writes.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+	int credits = 0;
+
+	if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+		credits += OCFS2_QWRITE_CREDITS;
+	if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+		credits += OCFS2_QWRITE_CREDITS;
+	return credits;
+}
+
+/* Number of credits needed for removing quota structure from file */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
+/* Number of credits needed for initialization of new quota structure */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
+
 /* group extend. inode update and last group update. */
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
 
@@ -303,8 +348,11 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
  * prev. group desc. if we relink. */
  * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
 #define OCFS2_SUBALLOC_ALLOC (3)
 
 
-#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC		\
-					 + OCFS2_INODE_UPDATE_CREDITS)
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
+{
+	return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 
 /* dinode + group descriptor update. We don't relink on free yet. */
 /* dinode + group descriptor update. We don't relink on free yet. */
 #define OCFS2_SUBALLOC_FREE  (2)
 #define OCFS2_SUBALLOC_FREE  (2)
@@ -313,16 +361,23 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
 #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
 					 + OCFS2_TRUNCATE_LOG_UPDATE)
 					 + OCFS2_TRUNCATE_LOG_UPDATE)
 
 
-#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+	return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
  * bitmap block for the new bit) */
  * bitmap block for the new bit) */
 #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
 #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
 
 
 /* parent fe, parent block, new file entry, inode alloc fe, inode alloc
 /* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks */
-#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
-			    + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+ * group descriptor + mkdir/symlink blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb)
+{
+	return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 
 /* local alloc metadata change + main bitmap updates */
 /* local alloc metadata change + main bitmap updates */
 #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
 #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
@@ -332,13 +387,21 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
  * for the dinode, one for the new block. */
  * for the dinode, one for the new block. */
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
 
-/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
-#define OCFS2_LINK_CREDITS  (2*OCFS2_INODE_UPDATE_CREDITS + 1)
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
+ * update on dir */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+	return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
  * dir inode link */
  * dir inode link */
-#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
-			      + OCFS2_LINK_CREDITS)
+static inline int ocfs2_unlink_credits(struct super_block *sb)
+{
+	/* The quota update from ocfs2_link_credits is unused here... */
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+}
 
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
  * inode alloc group descriptor */
  * inode alloc group descriptor */
@@ -347,8 +410,10 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* dinode update, old dir dinode update, new dir dinode update, old
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
  * dir dir entry, new dir dir entry, dir entry update for renaming
  * directory + target unlink */
  * directory + target unlink */
-#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
-			     + OCFS2_UNLINK_CREDITS)
+static inline int ocfs2_rename_credits(struct super_block *sb)
+{
+	return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+}
 
 
 /* global bitmap dinode, group desc., relinked group,
 /* global bitmap dinode, group desc., relinked group,
  * suballocator dinode, group desc., relinked group,
  * suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 	 * credit for the dinode there. */
 	 * credit for the dinode there. */
 	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
 
-	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
+	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+	       ocfs2_quota_trans_credits(sb);
 }
 }
 
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
 {
-	int blocks = OCFS2_MKNOD_CREDITS;
+	int blocks = ocfs2_mknod_credits(sb);
 
 
 	/* links can be longer than one block so we may update many
 	/* links can be longer than one block so we may update many
 	 * within our single allocated extent. */
 	 * within our single allocated extent. */
 	blocks += ocfs2_clusters_to_blocks(sb, 1);
 	blocks += ocfs2_clusters_to_blocks(sb, 1);
 
 
-	return blocks;
+	return blocks + ocfs2_quota_trans_credits(sb);
 }
 }
 
 
 static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
 static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	/* update to the truncate log. */
 	/* update to the truncate log. */
 	credits += OCFS2_TRUNCATE_LOG_UPDATE;
 	credits += OCFS2_TRUNCATE_LOG_UPDATE;
 
 
+	credits += ocfs2_quota_trans_credits(sb);
+
 	return credits;
 	return credits;
 }
 }
 
 

+ 14 - 12
fs/ocfs2/localalloc.c

@@ -36,6 +36,7 @@
 #include "ocfs2.h"
 #include "ocfs2.h"
 
 
 #include "alloc.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "inode.h"
 #include "journal.h"
 #include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		goto bail;
 		goto bail;
 	}
 	}
 
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
-				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+					     OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	}
 	}
 	memcpy(alloc_copy, alloc, bh->b_size);
 	memcpy(alloc_copy, alloc, bh->b_size);
 
 
-	status = ocfs2_journal_access(handle, local_alloc_inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto out_commit;
 		goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 
 
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&inode->i_mutex);
 
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
-				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+					     OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
 	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
 	ocfs2_clear_local_alloc(alloc);
 	ocfs2_clear_local_alloc(alloc);
 
 
+	ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
 	status = ocfs2_write_block(osb, alloc_bh, inode);
 	status = ocfs2_write_block(osb, alloc_bh, inode);
 	if (status < 0)
 	if (status < 0)
 		mlog_errno(status);
 		mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	 * delete bits from it! */
 	 * delete bits from it! */
 	*num_bits = bits_wanted;
 	*num_bits = bits_wanted;
 
 
-	status = ocfs2_journal_access(handle, local_alloc_inode,
-				      osb->local_alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode,
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	}
 	}
 	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
 	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
 
 
-	status = ocfs2_journal_access(handle, local_alloc_inode,
-				      osb->local_alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode,
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;

+ 227 - 91
fs/ocfs2/namei.c

@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 #include <cluster/masklog.h>
@@ -61,17 +62,18 @@
 #include "sysfile.h"
 #include "sysfile.h"
 #include "uptodate.h"
 #include "uptodate.h"
 #include "xattr.h"
 #include "xattr.h"
+#include "acl.h"
 
 
 #include "buffer_head_io.h"
 #include "buffer_head_io.h"
 
 
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct inode *dir,
-			      struct dentry *dentry, int mode,
+			      struct inode *inode,
+			      struct dentry *dentry,
 			      dev_t dev,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      handle_t *handle,
 			      handle_t *handle,
-			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac);
 			      struct ocfs2_alloc_context *inode_ac);
 
 
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
 	return ret;
 	return ret;
 }
 }
 
 
+static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+{
+	struct inode *inode;
+
+	inode = new_inode(dir->i_sb);
+	if (!inode) {
+		mlog(ML_ERROR, "new_inode failed!\n");
+		return NULL;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	if (S_ISDIR(mode))
+		inode->i_nlink = 2;
+	else
+		inode->i_nlink = 1;
+	inode->i_uid = current_fsuid();
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+	inode->i_mode = mode;
+	vfs_dq_init(inode);
+	return inode;
+}
+
 static int ocfs2_mknod(struct inode *dir,
 static int ocfs2_mknod(struct inode *dir,
 		       struct dentry *dentry,
 		       struct dentry *dentry,
 		       int mode,
 		       int mode,
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
 	struct inode *inode = NULL;
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *xattr_ac = NULL;
+	int want_clusters = 0;
+	int xattr_credits = 0;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
+	int did_quota_inode = 0;
 
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
 		   (unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 		goto leave;
 	}
 	}
 
 
-	/* Reserve a cluster if creating an extent based directory. */
-	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
-		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
-		if (status < 0) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
+	inode = ocfs2_get_init_inode(dir, mode);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* get security xattr */
+	status = ocfs2_init_security_get(inode, dir, &si);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			si.enable = 0;
+		else {
+			mlog_errno(status);
 			goto leave;
 			goto leave;
 		}
 		}
 	}
 	}
 
 
-	handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
+	/* calculate meta data/clusters for setting security and acl xattr */
+	status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+					&si, &want_clusters,
+					&xattr_credits, &xattr_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* Reserve a cluster if creating an extent based directory. */
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+		want_clusters += 1;
+
+	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
+				   xattr_credits);
 	if (IS_ERR(handle)) {
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		status = PTR_ERR(handle);
 		handle = NULL;
 		handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 		goto leave;
 	}
 	}
 
 
+	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
+	 * to be called. */
+	if (sb_any_quota_active(osb->sb) &&
+	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+		status = -EDQUOT;
+		goto leave;
+	}
+	did_quota_inode = 1;
+
 	/* do the real work now. */
 	/* do the real work now. */
-	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+	status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
 				    &new_fe_bh, parent_fe_bh, handle,
-				    &inode, inode_ac);
+				    inode_ac);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
 			goto leave;
 			goto leave;
 		}
 		}
 
 
-		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 		if (status < 0) {
 			mlog_errno(status);
 			mlog_errno(status);
 			goto leave;
 			goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
 		inc_nlink(dir);
 		inc_nlink(dir);
 	}
 	}
 
 
+	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+				xattr_ac, data_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (si.enable) {
+		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+						 xattr_ac, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
 	status = ocfs2_add_entry(handle, dentry, inode,
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
 				 de_bh);
 				 de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
 	d_instantiate(dentry, inode);
 	d_instantiate(dentry, inode);
 	status = 0;
 	status = 0;
 leave:
 leave:
+	if (status < 0 && did_quota_inode)
+		vfs_dq_free_inode(inode);
 	if (handle)
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 		ocfs2_commit_trans(osb, handle);
 
 
@@ -331,9 +425,13 @@ leave:
 	brelse(new_fe_bh);
 	brelse(new_fe_bh);
 	brelse(de_bh);
 	brelse(de_bh);
 	brelse(parent_fe_bh);
 	brelse(parent_fe_bh);
+	kfree(si.name);
+	kfree(si.value);
 
 
-	if ((status < 0) && inode)
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
 		iput(inode);
 		iput(inode);
+	}
 
 
 	if (inode_ac)
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 		ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
 	if (data_ac)
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 		ocfs2_free_alloc_context(data_ac);
 
 
+	if (xattr_ac)
+		ocfs2_free_alloc_context(xattr_ac);
+
 	mlog_exit(status);
 	mlog_exit(status);
 
 
 	return status;
 	return status;
@@ -348,12 +449,12 @@ leave:
 
 
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct inode *dir,
-			      struct dentry *dentry, int mode,
+			      struct inode *inode,
+			      struct dentry *dentry,
 			      dev_t dev,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      handle_t *handle,
 			      handle_t *handle,
-			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac)
 			      struct ocfs2_alloc_context *inode_ac)
 {
 {
 	int status = 0;
 	int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *fel;
 	struct ocfs2_extent_list *fel;
 	u64 fe_blkno = 0;
 	u64 fe_blkno = 0;
 	u16 suballoc_bit;
 	u16 suballoc_bit;
-	struct inode *inode = NULL;
 
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
-		   (unsigned long)dev, dentry->d_name.len,
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
 		   dentry->d_name.name);
 		   dentry->d_name.name);
 
 
 	*new_fe_bh = NULL;
 	*new_fe_bh = NULL;
-	*ret_inode = NULL;
 
 
 	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
 	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
 				       &fe_blkno);
 				       &fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		goto leave;
 		goto leave;
 	}
 	}
 
 
-	inode = new_inode(dir->i_sb);
-	if (!inode) {
-		status = -ENOMEM;
-		mlog(ML_ERROR, "new_inode failed!\n");
-		goto leave;
-	}
-
 	/* populate as many fields early on as possible - many of
 	/* populate as many fields early on as possible - many of
 	 * these are used by the support functions here and in
 	 * these are used by the support functions here and in
 	 * callers. */
 	 * callers. */
 	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
 	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
 	OCFS2_I(inode)->ip_blkno = fe_blkno;
 	OCFS2_I(inode)->ip_blkno = fe_blkno;
-	if (S_ISDIR(mode))
-		inode->i_nlink = 2;
-	else
-		inode->i_nlink = 1;
-	inode->i_mode = mode;
 	spin_lock(&osb->osb_lock);
 	spin_lock(&osb->osb_lock);
 	inode->i_generation = osb->s_next_generation++;
 	inode->i_generation = osb->s_next_generation++;
 	spin_unlock(&osb->osb_lock);
 	spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	}
 	}
 	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
 	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
 
 
-	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_blkno = cpu_to_le64(fe_blkno);
 	fe->i_blkno = cpu_to_le64(fe_blkno);
 	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
 	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
 	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
 	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
-	fe->i_uid = cpu_to_le32(current_fsuid());
-	if (dir->i_mode & S_ISGID) {
-		fe->i_gid = cpu_to_le32(dir->i_gid);
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		fe->i_gid = cpu_to_le32(current_fsgid());
-	fe->i_mode = cpu_to_le16(mode);
-	if (S_ISCHR(mode) || S_ISBLK(mode))
+	fe->i_uid = cpu_to_le32(inode->i_uid);
+	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 
 
 	fe->i_last_eb_blk = 0;
 	fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	/*
 	/*
 	 * If supported, directories start with inline data.
 	 * If supported, directories start with inline data.
 	 */
 	 */
-	if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
+	if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
 		u16 feat = le16_to_cpu(fe->i_dyn_features);
 		u16 feat = le16_to_cpu(fe->i_dyn_features);
 
 
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		goto leave;
 		goto leave;
 	}
 	}
 
 
-	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
-		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
-		     "i_blkno=%llu, i_ino=%lu\n",
-		     (unsigned long long)(*new_fe_bh)->b_blocknr,
-		     (unsigned long long)le64_to_cpu(fe->i_blkno),
-		     inode->i_ino);
-		BUG();
-	}
-
+	ocfs2_populate_inode(inode, fe, 1);
 	ocfs2_inode_set_new(osb, inode);
 	ocfs2_inode_set_new(osb, inode);
 	if (!ocfs2_mount_local(osb)) {
 	if (!ocfs2_mount_local(osb)) {
 		status = ocfs2_create_new_inode_locks(inode);
 		status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	status = 0; /* error in ocfs2_create_new_inode_locks is not
 	status = 0; /* error in ocfs2_create_new_inode_locks is not
 		     * critical */
 		     * critical */
 
 
-	*ret_inode = inode;
 leave:
 leave:
 	if (status < 0) {
 	if (status < 0) {
 		if (*new_fe_bh) {
 		if (*new_fe_bh) {
 			brelse(*new_fe_bh);
 			brelse(*new_fe_bh);
 			*new_fe_bh = NULL;
 			*new_fe_bh = NULL;
 		}
 		}
-		if (inode) {
-			clear_nlink(inode);
-			iput(inode);
-		}
 	}
 	}
 
 
 	mlog_exit(status);
 	mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_unlock_inode;
 		goto out_unlock_inode;
 	}
 	}
 
 
-	handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
 	if (IS_ERR(handle)) {
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
 		err = PTR_ERR(handle);
 		handle = NULL;
 		handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_unlock_inode;
 		goto out_unlock_inode;
 	}
 	}
 
 
-	err = ocfs2_journal_access(handle, inode, fe_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	err = ocfs2_journal_access_di(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (err < 0) {
 	if (err < 0) {
 		mlog_errno(err);
 		mlog_errno(err);
 		goto out_commit;
 		goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
 		}
 		}
 	}
 	}
 
 
-	handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
 	if (IS_ERR(handle)) {
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		status = PTR_ERR(handle);
 		handle = NULL;
 		handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 		goto leave;
 	}
 	}
 
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 		}
 	}
 	}
 
 
-	handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
 	if (IS_ERR(handle)) {
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		status = PTR_ERR(handle);
 		handle = NULL;
 		handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
 				goto bail;
 				goto bail;
 			}
 			}
 		}
 		}
-		status = ocfs2_journal_access(handle, new_inode, newfe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 		if (status < 0) {
 			mlog_errno(status);
 			mlog_errno(status);
 			goto bail;
 			goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	old_inode->i_ctime = CURRENT_TIME;
 	old_inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(old_inode);
 	mark_inode_dirty(old_inode);
 
 
-	status = ocfs2_journal_access(handle, old_inode, old_inode_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status >= 0) {
 	if (status >= 0) {
 		old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
 		old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
 
 
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
 			     (int)old_dir_nlink, old_dir->i_nlink);
 			     (int)old_dir_nlink, old_dir->i_nlink);
 		} else {
 		} else {
 			struct ocfs2_dinode *fe;
 			struct ocfs2_dinode *fe;
-			status = ocfs2_journal_access(handle, old_dir,
-						      old_dir_bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			status = ocfs2_journal_access_di(handle, old_dir,
+							 old_dir_bh,
+							 OCFS2_JOURNAL_ACCESS_WRITE);
 			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
 			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
 			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
 			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
 			status = ocfs2_journal_dirty(handle, old_dir_bh);
 			status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
 	handle_t *handle = NULL;
 	handle_t *handle = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *xattr_ac = NULL;
+	int want_clusters = 0;
+	int xattr_credits = 0;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
+	int did_quota = 0, did_quota_inode = 0;
 
 
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 		goto bail;
 	}
 	}
 
 
-	/* don't reserve bitmap space for fast symlinks. */
-	if (l > ocfs2_fast_symlink_chars(sb)) {
-		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* get security xattr */
+	status = ocfs2_init_security_get(inode, dir, &si);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			si.enable = 0;
+		else {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* calculate meta data/clusters for setting security xattr */
+	if (si.enable) {
+		status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+						  &xattr_credits, &xattr_ac);
 		if (status < 0) {
 		if (status < 0) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
+			mlog_errno(status);
 			goto bail;
 			goto bail;
 		}
 		}
 	}
 	}
 
 
-	handle = ocfs2_start_trans(osb, credits);
+	/* don't reserve bitmap space for fast symlinks. */
+	if (l > ocfs2_fast_symlink_chars(sb))
+		want_clusters += 1;
+
+	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, credits + xattr_credits);
 	if (IS_ERR(handle)) {
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		status = PTR_ERR(handle);
 		handle = NULL;
 		handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 		goto bail;
 	}
 	}
 
 
-	status = ocfs2_mknod_locked(osb, dir, dentry,
-				    S_IFLNK | S_IRWXUGO, 0,
-				    &new_fe_bh, parent_fe_bh, handle,
-				    &inode, inode_ac);
+	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
+	 * to be called. */
+	if (sb_any_quota_active(osb->sb) &&
+	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+		status = -EDQUOT;
+		goto bail;
+	}
+	did_quota_inode = 1;
+
+	status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+				    0, &new_fe_bh, parent_fe_bh, handle,
+				    inode_ac);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 		u32 offset = 0;
 
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
 		inode->i_op = &ocfs2_symlink_inode_operations;
+		if (vfs_dq_alloc_space_nodirty(inode,
+		    ocfs2_clusters_to_bytes(osb->sb, 1))) {
+			status = -EDQUOT;
+			goto bail;
+		}
+		did_quota = 1;
 		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
 		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
 					      new_fe_bh,
 					      new_fe_bh,
 					      handle, data_ac, NULL,
 					      handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
 		}
 		}
 	}
 	}
 
 
+	if (si.enable) {
+		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+						 xattr_ac, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	status = ocfs2_add_entry(handle, dentry, inode,
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
 				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
 				 de_bh);
 				 de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
 	dentry->d_op = &ocfs2_dentry_ops;
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 	d_instantiate(dentry, inode);
 bail:
 bail:
+	if (status < 0 && did_quota)
+		vfs_dq_free_space_nodirty(inode,
+					ocfs2_clusters_to_bytes(osb->sb, 1));
+	if (status < 0 && did_quota_inode)
+		vfs_dq_free_inode(inode);
 	if (handle)
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 		ocfs2_commit_trans(osb, handle);
 
 
@@ -1640,12 +1772,18 @@ bail:
 	brelse(new_fe_bh);
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
 	brelse(parent_fe_bh);
 	brelse(de_bh);
 	brelse(de_bh);
+	kfree(si.name);
+	kfree(si.value);
 	if (inode_ac)
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 		ocfs2_free_alloc_context(data_ac);
-	if ((status < 0) && inode)
+	if (xattr_ac)
+		ocfs2_free_alloc_context(xattr_ac);
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
 		iput(inode);
 		iput(inode);
+	}
 
 
 	mlog_exit(status);
 	mlog_exit(status);
 
 
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
 
-	status = ocfs2_read_block(orphan_dir_inode,
-				  OCFS2_I(orphan_dir_inode)->ip_blkno,
-				  &orphan_dir_bh);
+	status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;
 	}
 	}
 
 
-	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		goto leave;
 		goto leave;
 	}
 	}
 
 
-	status = ocfs2_journal_access(handle,orphan_dir_inode,  orphan_dir_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle,orphan_dir_inode,  orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto leave;
 		goto leave;

+ 22 - 24
fs/ocfs2/ocfs2.h

@@ -161,6 +161,7 @@ enum ocfs2_vol_state
 {
 {
 	VOLUME_INIT = 0,
 	VOLUME_INIT = 0,
 	VOLUME_MOUNTED,
 	VOLUME_MOUNTED,
+	VOLUME_MOUNTED_QUOTAS,
 	VOLUME_DISMOUNTED,
 	VOLUME_DISMOUNTED,
 	VOLUME_DISABLED
 	VOLUME_DISABLED
 };
 };
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
 	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
 	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
 	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
 	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
+	OCFS2_MOUNT_POSIX_ACL = 1 << 8,	/* POSIX access control lists */
+	OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
+	OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
 };
 };
 
 
 #define OCFS2_OSB_SOFT_RO	0x0001
 #define OCFS2_OSB_SOFT_RO	0x0001
@@ -205,6 +209,7 @@ enum ocfs2_mount_options
 struct ocfs2_journal;
 struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
 struct ocfs2_recovery_map;
+struct ocfs2_quota_recovery;
 struct ocfs2_super
 struct ocfs2_super
 {
 {
 	struct task_struct *commit_task;
 	struct task_struct *commit_task;
@@ -286,10 +291,11 @@ struct ocfs2_super
 	char *local_alloc_debug_buf;
 	char *local_alloc_debug_buf;
 #endif
 #endif
 
 
-	/* Next two fields are for local node slot recovery during
+	/* Next three fields are for local node slot recovery during
 	 * mount. */
 	 * mount. */
 	int dirty;
 	int dirty;
 	struct ocfs2_dinode *local_alloc_copy;
 	struct ocfs2_dinode *local_alloc_copy;
+	struct ocfs2_quota_recovery *quota_rec;
 
 
 	struct ocfs2_alloc_stats alloc_stats;
 	struct ocfs2_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
 	char dev_str[20];		/* "major,minor" of the device */
@@ -333,6 +339,10 @@ struct ocfs2_super
 
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
 
 
+/* Useful typedef for passing around journal access functions */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+					 struct buffer_head *bh, int type);
+
 static inline int ocfs2_should_order_data(struct inode *inode)
 static inline int ocfs2_should_order_data(struct inode *inode)
 {
 {
 	if (!S_ISREG(inode->i_mode))
 	if (!S_ISREG(inode->i_mode))
@@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
 	return 0;
 	return 0;
 }
 }
 
 
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
  * means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DINODE(ptr)					\
 #define OCFS2_IS_VALID_DINODE(ptr)					\
 	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
 	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
 
 
-#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
-	typeof(__di) ____di = (__di);					\
-	ocfs2_error((__sb), 						\
-		"Dinode # %llu has bad signature %.*s",			\
-		(unsigned long long)le64_to_cpu((____di)->i_blkno), 7, 	\
-		(____di)->i_signature);					\
-} while (0)
-
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
 	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
 	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
 
 
-#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
-	typeof(__eb) ____eb = (__eb);					\
-	ocfs2_error((__sb), 						\
-		"Extent Block # %llu has bad signature %.*s",		\
-		(unsigned long long)le64_to_cpu((____eb)->h_blkno), 7,	\
-		(____eb)->h_signature);					\
-} while (0)
-
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
 	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
 	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
 
 
-#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
-	typeof(__gd) ____gd = (__gd);					\
-		ocfs2_error((__sb),					\
-		"Group Descriptor # %llu has bad signature %.*s",	\
-		(unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
-		(____gd)->bg_signature);				\
-} while (0)
 
 
 #define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
 #define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
 	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
 	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
 
 
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr)					\
+	(!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 					   u64 blkno)
 {
 {
@@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#define ocfs2_find_next_bit ext2_find_next_bit
 #endif  /* OCFS2_H */
 #endif  /* OCFS2_H */
 
 

+ 204 - 9
fs/ocfs2/ocfs2_fs.h

@@ -65,6 +65,7 @@
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE	"DIRTRL1"
 
 
 /* Compatibility flags */
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -93,8 +94,11 @@
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
-					 | OCFS2_FEATURE_INCOMPAT_XATTR)
-#define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
+					 | OCFS2_FEATURE_INCOMPAT_XATTR \
+					 | OCFS2_FEATURE_INCOMPAT_META_ECC)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 
 
 /*
 /*
  * Heartbeat-only devices are missing journals and other files.  The
  * Heartbeat-only devices are missing journals and other files.  The
@@ -147,6 +151,9 @@
 /* Support for extended attributes */
 /* Support for extended attributes */
 #define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
 #define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
 
 
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC		0x0800
+
 /*
 /*
  * backup superblock flag is used to indicate that this volume
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
  * has backup superblocks.
@@ -163,6 +170,12 @@
  */
  */
 #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN	0x0001
 #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN	0x0001
 
 
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA	0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA	0x0004
+
 /* The byte offset of the first backup block will be 1G.
 /* The byte offset of the first backup block will be 1G.
  * The following will be 4G, 16G, 64G, 256G and 1T.
  * The following will be 4G, 16G, 64G, 256G and 1T.
  */
  */
@@ -192,6 +205,7 @@
 #define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
 #define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
 #define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
 #define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
 #define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
 #define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
+#define OCFS2_QUOTA_FL		(0x00001000)	/* Quota file */
 
 
 /*
 /*
  * Flags on ocfs2_dinode.i_dyn_features
  * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
 #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
 #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
 	HEARTBEAT_SYSTEM_INODE,
 	HEARTBEAT_SYSTEM_INODE,
 	GLOBAL_BITMAP_SYSTEM_INODE,
 	GLOBAL_BITMAP_SYSTEM_INODE,
-#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+	USER_QUOTA_SYSTEM_INODE,
+	GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
 	ORPHAN_DIR_SYSTEM_INODE,
 	ORPHAN_DIR_SYSTEM_INODE,
 	EXTENT_ALLOC_SYSTEM_INODE,
 	EXTENT_ALLOC_SYSTEM_INODE,
 	INODE_ALLOC_SYSTEM_INODE,
 	INODE_ALLOC_SYSTEM_INODE,
 	JOURNAL_SYSTEM_INODE,
 	JOURNAL_SYSTEM_INODE,
 	LOCAL_ALLOC_SYSTEM_INODE,
 	LOCAL_ALLOC_SYSTEM_INODE,
 	TRUNCATE_LOG_SYSTEM_INODE,
 	TRUNCATE_LOG_SYSTEM_INODE,
+	LOCAL_USER_QUOTA_SYSTEM_INODE,
+	LOCAL_GROUP_QUOTA_SYSTEM_INODE,
 	NUM_SYSTEM_INODES
 	NUM_SYSTEM_INODES
 };
 };
 
 
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
 	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
 	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
 	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
 	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
 	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
+	[USER_QUOTA_SYSTEM_INODE]		= { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+	[GROUP_QUOTA_SYSTEM_INODE]		= { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
 
 
 	/* Slot-specific system inodes (one copy per slot) */
 	/* Slot-specific system inodes (one copy per slot) */
 	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
 	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
 	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
 	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
 	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
 	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
 	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
-	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+	[LOCAL_USER_QUOTA_SYSTEM_INODE]		= { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+	[LOCAL_GROUP_QUOTA_SYSTEM_INODE]	= { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
 };
 };
 
 
 /* Parameter passed from mount.ocfs2 to module */
 /* Parameter passed from mount.ocfs2 to module */
@@ -409,6 +431,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
  */
  */
 #define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
 #define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
 
 
+/*
+ * Block checking structure.  This is used in metadata to validate the
+ * contents.  If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ */
+struct ocfs2_block_check {
+/*00*/	__le32 bc_crc32e;	/* 802.3 Ethernet II CRC32 */
+	__le16 bc_ecc;		/* Single-error-correction parity vector.
+				   This is a simple Hamming code dependant
+				   on the blocksize.  OCFS2's maximum
+				   blocksize, 4K, requires 16 parity bits,
+				   so we fit in __le16. */
+	__le16 bc_reserved1;
+/*08*/
+};
+
 /*
 /*
  * On disk extent record for OCFS2
  * On disk extent record for OCFS2
  * It describes a range of clusters on disk.
  * It describes a range of clusters on disk.
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
 struct ocfs2_extent_block
 struct ocfs2_extent_block
 {
 {
 /*00*/	__u8 h_signature[8];		/* Signature for verification */
 /*00*/	__u8 h_signature[8];		/* Signature for verification */
-	__le64 h_reserved1;
+	struct ocfs2_block_check h_check;	/* Error checking */
 /*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
 /*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
 					   extent_header belongs to */
 					   extent_header belongs to */
 	__le16 h_suballoc_bit;		/* Bit offset in suballocator
 	__le16 h_suballoc_bit;		/* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
 					   was set in i_flags */
 					   was set in i_flags */
 	__le16 i_dyn_features;
 	__le16 i_dyn_features;
 	__le64 i_xattr_loc;
 	__le64 i_xattr_loc;
-/*80*/	__le64 i_reserved2[7];
+/*80*/	struct ocfs2_block_check i_check;	/* Error checking */
+/*88*/	__le64 i_reserved2[6];
 /*B8*/	union {
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
 					   64bit union */
@@ -714,6 +753,34 @@ struct ocfs2_dir_entry {
 /* Actual on-disk length specified by rec_len */
 /* Actual on-disk length specified by rec_len */
 } __attribute__ ((packed));
 } __attribute__ ((packed));
 
 
+/*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/	__le64		db_compat_inode;	/* Always zero. Was inode */
+
+	__le16		db_compat_rec_len;	/* Backwards compatible with
+						 * ocfs2_dir_entry. */
+	__u8		db_compat_name_len;	/* Always zero. Was name_len */
+	__u8		db_reserved0;
+	__le16		db_reserved1;
+	__le16		db_free_rec_len;	/* Size of largest empty hole
+						 * in this block. (unused) */
+/*10*/	__u8		db_signature[8];	/* Signature for verification */
+	__le64		db_reserved2;
+	__le64		db_free_next;		/* Next block in list (unused) */
+/*20*/	__le64		db_blkno;		/* Offset on disk, in blocks */
+	__le64		db_parent_dinode;	/* dinode which owns me, in
+						   blocks */
+/*30*/	struct ocfs2_block_check db_check;	/* Error checking */
+/*40*/
+};
+
 /*
 /*
  * On disk allocator group structure for OCFS2
  * On disk allocator group structure for OCFS2
  */
  */
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
 /*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
 /*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
 					   blocks */
 					   blocks */
 	__le64   bg_blkno;               /* Offset on disk, in blocks */
 	__le64   bg_blkno;               /* Offset on disk, in blocks */
-/*30*/	__le64   bg_reserved2[2];
+/*30*/	struct ocfs2_block_check bg_check;	/* Error checking */
+	__le64   bg_reserved2;
 /*40*/	__u8    bg_bitmap[0];
 /*40*/	__u8    bg_bitmap[0];
 };
 };
 
 
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
 						   in this extent record,
 						   in this extent record,
 						   only valid in the first
 						   only valid in the first
 						   bucket. */
 						   bucket. */
-	__le64  xh_csum;
+	struct ocfs2_block_check xh_check;	/* Error checking
+						   (Note, this is only
+						    used for xattr
+						    buckets.  A block uses
+						    xb_check and sets
+						    this field to zero.) */
 	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
 };
 
 
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
 					block group */
 					block group */
 	__le32	xb_fs_generation;    /* Must match super block */
 	__le32	xb_fs_generation;    /* Must match super block */
 /*10*/	__le64	xb_blkno;            /* Offset on disk, in blocks */
 /*10*/	__le64	xb_blkno;            /* Offset on disk, in blocks */
-	__le64	xb_csum;
+	struct ocfs2_block_check xb_check;	/* Error checking */
 /*20*/	__le16	xb_flags;            /* Indicates whether this block contains
 /*20*/	__le16	xb_flags;            /* Indicates whether this block contains
 					real xattr or a xattr tree. */
 					real xattr or a xattr tree. */
 	__le16	xb_reserved0;
 	__le16	xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
 	return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
 	return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
 }
 }
 
 
+/*
+ *  On disk structures for global quota file
+ */
+
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+	0x0cf52470, /* USRQUOTA */ \
+	0x0cf52471  /* GRPQUOTA */ \
+}
+
+#define OCFS2_GLOBAL_QVERSIONS {\
+	0, \
+	0, \
+}
+
+
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+
+/* Generic header of all quota files */
+struct ocfs2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* Quota format version */
+};
+
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of global quota file (immediately follows the generic
+ * header) */
+struct ocfs2_global_disk_dqinfo {
+/*00*/	__le32 dqi_bgrace;	/* Grace time for space softlimit excess */
+	__le32 dqi_igrace;	/* Grace time for inode softlimit excess */
+	__le32 dqi_syncms;	/* Time after which we sync local changes to
+				 * global quota file */
+	__le32 dqi_blocks;	/* Number of blocks in quota file */
+/*10*/	__le32 dqi_free_blk;	/* First free block in quota file */
+	__le32 dqi_free_entry;	/* First block with free dquot entry in quota
+				 * file */
+};
+
+/* Structure with global user / group information. We reserve some space
+ * for future use. */
+struct ocfs2_global_disk_dqblk {
+/*00*/	__le32 dqb_id;          /* ID the structure belongs to */
+	__le32 dqb_use_count;   /* Number of nodes having reference to this structure */
+	__le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+/*10*/	__le64 dqb_isoftlimit;  /* preferred inode limit */
+	__le64 dqb_curinodes;   /* current # allocated inodes */
+/*20*/	__le64 dqb_bhardlimit;  /* absolute limit on disk space */
+	__le64 dqb_bsoftlimit;  /* preferred limit on disk space */
+/*30*/	__le64 dqb_curspace;    /* current space occupied */
+	__le64 dqb_btime;       /* time limit for excessive disk use */
+/*40*/	__le64 dqb_itime;       /* time limit for excessive inode use */
+	__le64 dqb_pad1;
+/*50*/	__le64 dqb_pad2;
+};
+
+/*
+ *  On-disk structures for local quota file
+ */
+
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+	0x0cf524c0, /* USRQUOTA */ \
+	0x0cf524c1  /* GRPQUOTA */ \
+}
+
+#define OCFS2_LOCAL_QVERSIONS {\
+	0, \
+	0, \
+}
+
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN	0x0001	/* Quota file is empty (this should be after\
+				 * quota has been cleanly turned off) */
+
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of local quota file (immediately follows the generic
+ * header) */
+struct ocfs2_local_disk_dqinfo {
+	__le32 dqi_flags;	/* Flags for quota file */
+	__le32 dqi_chunks;	/* Number of chunks of quota structures
+				 * with a bitmap */
+	__le32 dqi_blocks;	/* Number of blocks allocated for quota file */
+};
+
+/* Header of one chunk of a quota file */
+struct ocfs2_local_disk_chunk {
+	__le32 dqc_free;	/* Number of free entries in the bitmap */
+	u8 dqc_bitmap[0];	/* Bitmap of entries in the corresponding
+				 * chunk of quota file */
+};
+
+/* One entry in local quota file */
+struct ocfs2_local_disk_dqblk {
+/*00*/	__le64 dqb_id;		/* id this quota applies to */
+	__le64 dqb_spacemod;	/* Change in the amount of used space */
+/*10*/	__le64 dqb_inodemod;	/* Change in the amount of used inodes */
+};
+
+
+/*
+ * The quota trailer lives at the end of each quota block.
+ */
+
+struct ocfs2_disk_dqtrailer {
+/*00*/	struct ocfs2_block_check dq_check;	/* Error checking */
+/*08*/	/* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+								 void *buf)
+{
+	char *ptr = buf;
+	ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
+
+	return (struct ocfs2_disk_dqtrailer *)ptr;
+}
+
 #ifdef __KERNEL__
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
 {

+ 0 - 82
fs/ocfs2/ocfs2_jbd_compat.h

@@ -1,82 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_jbd_compat.h
- *
- * Compatibility defines for JBD.
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#ifndef OCFS2_JBD_COMPAT_H
-#define OCFS2_JBD_COMPAT_H
-
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# error Should not have been included
-#endif
-
-struct jbd2_inode {
-	unsigned int dummy;
-};
-
-#define JBD2_BARRIER			JFS_BARRIER
-#define JBD2_DEFAULT_MAX_COMMIT_AGE	JBD_DEFAULT_MAX_COMMIT_AGE
-
-#define jbd2_journal_ack_err			journal_ack_err
-#define jbd2_journal_clear_err			journal_clear_err
-#define jbd2_journal_destroy			journal_destroy
-#define jbd2_journal_dirty_metadata		journal_dirty_metadata
-#define jbd2_journal_errno			journal_errno
-#define jbd2_journal_extend			journal_extend
-#define jbd2_journal_flush			journal_flush
-#define jbd2_journal_force_commit		journal_force_commit
-#define jbd2_journal_get_write_access		journal_get_write_access
-#define jbd2_journal_get_undo_access		journal_get_undo_access
-#define jbd2_journal_init_inode			journal_init_inode
-#define jbd2_journal_invalidatepage		journal_invalidatepage
-#define jbd2_journal_load			journal_load
-#define jbd2_journal_lock_updates		journal_lock_updates
-#define jbd2_journal_restart			journal_restart
-#define jbd2_journal_start			journal_start
-#define jbd2_journal_start_commit		journal_start_commit
-#define jbd2_journal_stop			journal_stop
-#define jbd2_journal_try_to_free_buffers	journal_try_to_free_buffers
-#define jbd2_journal_unlock_updates		journal_unlock_updates
-#define jbd2_journal_wipe			journal_wipe
-#define jbd2_log_wait_commit			log_wait_commit
-
-static inline int jbd2_journal_file_inode(handle_t *handle,
-					  struct jbd2_inode *inode)
-{
-	return 0;
-}
-
-static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
-						      loff_t new_size)
-{
-	return 0;
-}
-
-static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
-					       struct inode *inode)
-{
-	return;
-}
-
-static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
-						  struct jbd2_inode *jinode)
-{
-	return;
-}
-
-
-#endif  /* OCFS2_JBD_COMPAT_H */

+ 5 - 0
fs/ocfs2/ocfs2_lockid.h

@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_DENTRY,
 	OCFS2_LOCK_TYPE_DENTRY,
 	OCFS2_LOCK_TYPE_OPEN,
 	OCFS2_LOCK_TYPE_OPEN,
 	OCFS2_LOCK_TYPE_FLOCK,
 	OCFS2_LOCK_TYPE_FLOCK,
+	OCFS2_LOCK_TYPE_QINFO,
 	OCFS2_NUM_LOCK_TYPES
 	OCFS2_NUM_LOCK_TYPES
 };
 };
 
 
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_FLOCK:
 		case OCFS2_LOCK_TYPE_FLOCK:
 			c = 'F';
 			c = 'F';
 			break;
 			break;
+		case OCFS2_LOCK_TYPE_QINFO:
+			c = 'Q';
+			break;
 		default:
 		default:
 			c = '\0';
 			c = '\0';
 	}
 	}
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
 	[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
 	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
 	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
 };
 };
 
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)

+ 119 - 0
fs/ocfs2/quota.h

@@ -0,0 +1,119 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+
+#include "ocfs2.h"
+
+/* Common stuff */
+/* id number of quota format */
+#define QFMT_OCFS2 3
+
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot {
+	struct dquot dq_dquot;	/* Generic VFS dquot */
+	loff_t dq_local_off;	/* Offset in the local quota file */
+	struct ocfs2_quota_chunk *dq_chunk;	/* Chunk dquot is in */
+	unsigned int dq_use_count;	/* Number of nodes having reference to this entry in global quota file */
+	s64 dq_origspace;	/* Last globally synced space usage */
+	s64 dq_originodes;	/* Last globally synced inode usage */
+};
+
+/* Description of one chunk to recover in memory */
+struct ocfs2_recovery_chunk {
+	struct list_head rc_list;	/* List of chunks */
+	int rc_chunk;			/* Chunk number */
+	unsigned long *rc_bitmap;	/* Bitmap of entries to recover */
+};
+
+struct ocfs2_quota_recovery {
+	struct list_head r_list[MAXQUOTAS];	/* List of chunks to recover */
+};
+
+/* In-memory structure with quota header information */
+struct ocfs2_mem_dqinfo {
+	unsigned int dqi_type;		/* Quota type this structure describes */
+	unsigned int dqi_chunks;	/* Number of chunks in local quota file */
+	unsigned int dqi_blocks;	/* Number of blocks allocated for local quota file */
+	unsigned int dqi_syncms;	/* How often should we sync with other nodes */
+	unsigned int dqi_syncjiff;	/* Precomputed dqi_syncms in jiffies */
+	struct list_head dqi_chunk;	/* List of chunks */
+	struct inode *dqi_gqinode;	/* Global quota file inode */
+	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
+	struct buffer_head *dqi_gqi_bh;	/* Buffer head with global quota file inode - set only if inode lock is obtained */
+	int dqi_gqi_count;		/* Number of holders of dqi_gqi_bh */
+	struct buffer_head *dqi_lqi_bh;	/* Buffer head with local quota file inode */
+	struct buffer_head *dqi_ibh;	/* Buffer with information header */
+	struct qtree_mem_dqinfo dqi_gi;	/* Info about global file */
+	struct delayed_work dqi_sync_work;	/* Work for syncing dquots */
+	struct ocfs2_quota_recovery *dqi_rec;	/* Pointer to recovery
+						 * information, in case we
+						 * enable quotas on file
+						 * needing it */
+};
+
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+	return container_of(dquot, struct ocfs2_dquot, dq_dquot);
+}
+
+struct ocfs2_quota_chunk {
+	struct list_head qc_chunk;	/* List of quotafile chunks */
+	int qc_num;			/* Number of quota chunk */
+	struct buffer_head *qc_headerbh;	/* Buffer head with chunk header */
+};
+
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+
+extern struct qtree_fmt_operations ocfs2_global_ops;
+
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+				struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+				struct ocfs2_quota_recovery *rec,
+				int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+	return __ocfs2_sync_dquot(dquot, 0);
+}
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+	return __ocfs2_sync_dquot(dquot, 1);
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+			   struct buffer_head **bh);
+
+extern struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+
+int ocfs2_quota_setup(void);
+void ocfs2_quota_shutdown(void);
+
+#endif /* _OCFS2_QUOTA_H */

+ 1025 - 0
fs/ocfs2/quota_global.c

@@ -0,0 +1,1025 @@
+/*
+ *  Implementation of operations over global quota file
+ */
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "quota.h"
+
+static struct workqueue_struct *ocfs2_quota_wq = NULL;
+
+static void qsync_work_fn(struct work_struct *work);
+
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	/* Update from disk only entries not set by the admin */
+	if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+		m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
+		m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+		m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
+	if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+		m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
+		m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+		m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+	if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
+		m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
+		m->dqb_itime = le64_to_cpu(d->dqb_itime);
+	OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
+}
+
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+	d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
+	d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
+	d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
+	d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
+	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+	d->dqb_btime = cpu_to_le64(m->dqb_btime);
+	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+}
+
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	if (qtree_entry_unused(&oinfo->dqi_gi, dp))
+		return 0;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+}
+
+struct qtree_fmt_operations ocfs2_global_ops = {
+	.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
+	.disk2mem_dqblk = ocfs2_global_disk2memdqb,
+	.is_id = ocfs2_global_is_id,
+};
+
+static int ocfs2_validate_quota_block(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	struct ocfs2_disk_dqtrailer *dqt =
+		ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
+
+	mlog(0, "Validating quota block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
+}
+
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+			   struct buffer_head **bh)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+				    ocfs2_validate_quota_block);
+	if (rc)
+		mlog_errno(rc);
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
+static int ocfs2_get_quota_block(struct inode *inode, int block,
+				 struct buffer_head **bh)
+{
+	u64 pblock, pcount;
+	int err;
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (err) {
+		mlog_errno(err);
+		return err;
+	}
+	*bh = sb_getblk(inode->i_sb, pblock);
+	if (!*bh) {
+		err = -EIO;
+		mlog_errno(err);
+	}
+	return err;;
+}
+
+/* Read data from global quotafile - avoid pagecache and such because we cannot
+ * afford acquiring the locks... We use quota cluster lock to serialize
+ * operations. Caller is responsible for acquiring it. */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	loff_t i_size = i_size_read(gqinode);
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0;
+	struct buffer_head *bh;
+	size_t toread, tocopy;
+
+	if (off > i_size)
+		return 0;
+	if (off + len > i_size)
+		len = i_size - off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+		bh = NULL;
+		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		if (err) {
+			mlog_errno(err);
+			return err;
+		}
+		memcpy(data, bh->b_data + offset, tocopy);
+		brelse(bh);
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0, new = 0, ja_type;
+	struct buffer_head *bh = NULL;
+	handle_t *handle = journal_current_handle();
+
+	if (!handle) {
+		mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+		     "because transaction was not started.\n",
+		     (unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+		WARN_ON(1);
+		len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+	}
+
+	mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
+	if (gqinode->i_size < off + len) {
+		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		err = ocfs2_extend_no_holes(gqinode, off + len, off);
+		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		if (err < 0)
+			goto out;
+		err = ocfs2_simple_size_update(gqinode,
+					       oinfo->dqi_gqi_bh,
+					       off + len);
+		if (err < 0)
+			goto out;
+		new = 1;
+	}
+	/* Not rewriting whole block? */
+	if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+	    !new) {
+		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	} else {
+		err = ocfs2_get_quota_block(gqinode, blk, &bh);
+		ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
+	}
+	if (err) {
+		mlog_errno(err);
+		return err;
+	}
+	lock_buffer(bh);
+	if (new)
+		memset(bh->b_data, 0, sb->s_blocksize);
+	memcpy(bh->b_data + offset, data, len);
+	flush_dcache_page(bh->b_page);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	ocfs2_set_buffer_uptodate(gqinode, bh);
+	err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
+	if (err < 0) {
+		brelse(bh);
+		goto out;
+	}
+	err = ocfs2_journal_dirty(handle, bh);
+	brelse(bh);
+	if (err < 0)
+		goto out;
+out:
+	if (err) {
+		mutex_unlock(&gqinode->i_mutex);
+		mlog_errno(err);
+		return err;
+	}
+	gqinode->i_version++;
+	ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+	mutex_unlock(&gqinode->i_mutex);
+	return len;
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	int status;
+	struct buffer_head *bh = NULL;
+
+	status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
+	if (status < 0)
+		return status;
+	spin_lock(&dq_data_lock);
+	if (!oinfo->dqi_gqi_count++)
+		oinfo->dqi_gqi_bh = bh;
+	else
+		WARN_ON(bh != oinfo->dqi_gqi_bh);
+	spin_unlock(&dq_data_lock);
+	return 0;
+}
+
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+	brelse(oinfo->dqi_gqi_bh);
+	spin_lock(&dq_data_lock);
+	if (!--oinfo->dqi_gqi_count)
+		oinfo->dqi_gqi_bh = NULL;
+	spin_unlock(&dq_data_lock);
+}
+
+/* Read information header from global quota file */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+	struct inode *gqinode = NULL;
+	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					GROUP_QUOTA_SYSTEM_INODE };
+	struct ocfs2_global_disk_dqinfo dinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	int status;
+
+	mlog_entry_void();
+
+	/* Read global header */
+	gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+			OCFS2_INVALID_SLOT);
+	if (!gqinode) {
+		mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+			type);
+		status = -EINVAL;
+		goto out_err;
+	}
+	oinfo->dqi_gi.dqi_sb = sb;
+	oinfo->dqi_gi.dqi_type = type;
+	ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+	oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+	oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+	oinfo->dqi_gqi_bh = NULL;
+	oinfo->dqi_gqi_count = 0;
+	oinfo->dqi_gqinode = gqinode;
+	status = ocfs2_lock_global_qf(oinfo, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+				      sizeof(struct ocfs2_global_disk_dqinfo),
+				      OCFS2_GLOBAL_INFO_OFF);
+	ocfs2_unlock_global_qf(oinfo, 0);
+	if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+		     status);
+		if (status >= 0)
+			status = -EIO;
+		mlog_errno(status);
+		goto out_err;
+	}
+	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+	oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
+	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+	oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+						OCFS2_QBLK_RESERVED_SPACE;
+	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+			   oinfo->dqi_syncjiff);
+
+out_err:
+	mlog_exit(status);
+	return status;
+}
+
+/* Write information to global quota file. Expects exlusive lock on quota
+ * file inode and quota info */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_global_disk_dqinfo dinfo;
+	ssize_t size;
+
+	spin_lock(&dq_data_lock);
+	info->dqi_flags &= ~DQF_INFO_DIRTY;
+	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+	spin_unlock(&dq_data_lock);
+	dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+	dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+				     sizeof(struct ocfs2_global_disk_dqinfo),
+				     OCFS2_GLOBAL_INFO_OFF);
+	if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot write global quota info structure\n");
+		if (size >= 0)
+			size = -EIO;
+		return size;
+	}
+	return 0;
+}
+
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	int err;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+
+	err = ocfs2_qinfo_lock(info, 1);
+	if (err < 0)
+		return err;
+	err = __ocfs2_global_write_info(sb, type);
+	ocfs2_qinfo_unlock(info, 1);
+	return err;
+}
+
+/* Read in information from global quota file and acquire a reference to it.
+ * dquot_acquire() has already started the transaction and locked quota file */
+int ocfs2_global_read_dquot(struct dquot *dquot)
+{
+	int err, err2, ex = 0;
+	struct ocfs2_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	err = ocfs2_qinfo_lock(info, 0);
+	if (err < 0)
+		goto out;
+	err = qtree_read_dquot(&info->dqi_gi, dquot);
+	if (err < 0)
+		goto out_qlock;
+	OCFS2_DQUOT(dquot)->dq_use_count++;
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	if (!dquot->dq_off) {	/* No real quota entry? */
+		/* Upgrade to exclusive lock for allocation */
+		err = ocfs2_qinfo_lock(info, 1);
+		if (err < 0)
+			goto out_qlock;
+		ex = 1;
+	}
+	err = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
+		err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
+		if (!err)
+			err = err2;
+	}
+out_qlock:
+	if (ex)
+		ocfs2_qinfo_unlock(info, 1);
+	ocfs2_qinfo_unlock(info, 0);
+out:
+	if (err < 0)
+		mlog_errno(err);
+	return err;
+}
+
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+	int err, err2;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_global_disk_dqblk dqblk;
+	s64 spacechange, inodechange;
+	time_t olditime, oldbtime;
+
+	err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				   sizeof(struct ocfs2_global_disk_dqblk),
+				   dquot->dq_off);
+	if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+		if (err >= 0) {
+			mlog(ML_ERROR, "Short read from global quota file "
+				       "(%u read)\n", err);
+			err = -EIO;
+		}
+		goto out;
+	}
+
+	/* Update space and inode usage. Get also other information from
+	 * global quota file so that we don't overwrite any changes there.
+	 * We are */
+	spin_lock(&dq_data_lock);
+	spacechange = dquot->dq_dqb.dqb_curspace -
+					OCFS2_DQUOT(dquot)->dq_origspace;
+	inodechange = dquot->dq_dqb.dqb_curinodes -
+					OCFS2_DQUOT(dquot)->dq_originodes;
+	olditime = dquot->dq_dqb.dqb_itime;
+	oldbtime = dquot->dq_dqb.dqb_btime;
+	ocfs2_global_disk2memdqb(dquot, &dqblk);
+	mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
+	     dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
+	     dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+		dquot->dq_dqb.dqb_curspace += spacechange;
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+		dquot->dq_dqb.dqb_curinodes += inodechange;
+	/* Set properly space grace time... */
+	if (dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+		if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+		    oldbtime > 0) {
+			if (dquot->dq_dqb.dqb_btime > 0)
+				dquot->dq_dqb.dqb_btime =
+					min(dquot->dq_dqb.dqb_btime, oldbtime);
+			else
+				dquot->dq_dqb.dqb_btime = oldbtime;
+		}
+	} else {
+		dquot->dq_dqb.dqb_btime = 0;
+		clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+	}
+	/* Set properly inode grace time... */
+	if (dquot->dq_dqb.dqb_isoftlimit &&
+	    dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+		if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+		    olditime > 0) {
+			if (dquot->dq_dqb.dqb_itime > 0)
+				dquot->dq_dqb.dqb_itime =
+					min(dquot->dq_dqb.dqb_itime, olditime);
+			else
+				dquot->dq_dqb.dqb_itime = olditime;
+		}
+	} else {
+		dquot->dq_dqb.dqb_itime = 0;
+		clear_bit(DQ_INODES_B, &dquot->dq_flags);
+	}
+	/* All information is properly updated, clear the flags */
+	__clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	spin_unlock(&dq_data_lock);
+	err = ocfs2_qinfo_lock(info, freeing);
+	if (err < 0) {
+		mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
+			       " (type=%d, id=%u)\n", dquot->dq_type,
+			       (unsigned)dquot->dq_id);
+		goto out;
+	}
+	if (freeing)
+		OCFS2_DQUOT(dquot)->dq_use_count--;
+	err = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (err < 0)
+		goto out_qlock;
+	if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
+		err = qtree_release_dquot(&info->dqi_gi, dquot);
+		if (info_dirty(sb_dqinfo(sb, type))) {
+			err2 = __ocfs2_global_write_info(sb, type);
+			if (!err)
+				err = err2;
+		}
+	}
+out_qlock:
+	ocfs2_qinfo_unlock(info, freeing);
+out:
+	if (err < 0)
+		mlog_errno(err);
+	return err;
+}
+
+/*
+ *  Functions for periodic syncing of dquots with global file
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+	handle_t *handle;
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int status = 0;
+
+	mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
+		   dquot->dq_type, type, sb->s_id);
+	if (type != dquot->dq_type)
+		goto out;
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+	status = ocfs2_sync_dquot(dquot);
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+	if (status < 0)
+		mlog_errno(status);
+	/* We have to write local structure as well... */
+	dquot_mark_dquot_dirty(dquot);
+	status = dquot_commit(dquot);
+	if (status < 0)
+		mlog_errno(status);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static void qsync_work_fn(struct work_struct *work)
+{
+	struct ocfs2_mem_dqinfo *oinfo = container_of(work,
+						      struct ocfs2_mem_dqinfo,
+						      dqi_sync_work.work);
+	struct super_block *sb = oinfo->dqi_gqinode->i_sb;
+
+	dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+			   oinfo->dqi_syncjiff);
+}
+
+/*
+ *  Wrappers for generic quota functions
+ */
+
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+	handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	status = dquot_commit(dquot);
+	ocfs2_commit_trans(osb, handle);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo;
+	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+		return 0;
+
+	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	/* We modify tree, leaf block, global info, local chunk header,
+	 * global and local inode */
+	return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
+	       2 * OCFS2_INODE_UPDATE_CREDITS;
+}
+
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb,
+		ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_release(dquot);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo;
+	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+	struct ocfs2_dinode *lfe, *gfe;
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+		return 0;
+
+	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
+	lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
+	/* We can extend local file + global file. In local file we
+	 * can modify info, chunk header block and dquot block. In
+	 * global file we can modify info, tree and leaf block */
+	return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
+	       ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
+	       3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
+}
+
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	/* We need an exclusive lock, because we're going to update use count
+	 * and instantiate possibly new dquot structure */
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb,
+		ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_acquire(dquot);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+	unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+	int sync = 0;
+	int status;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+	dquot_mark_dquot_dirty(dquot);
+
+	/* In case user set some limits, sync dquot immediately to global
+	 * quota file so that information propagates quicker */
+	spin_lock(&dq_data_lock);
+	if (dquot->dq_flags & mask)
+		sync = 1;
+	spin_unlock(&dq_data_lock);
+	if (!sync) {
+		status = ocfs2_write_dquot(dquot);
+		goto out;
+	}
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = ocfs2_sync_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	/* Now write updated local dquot structure */
+	status = dquot_commit(dquot);
+out_trans:
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* This should happen only after set_dqinfo(). */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+	handle_t *handle;
+	int status = 0;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	mlog_entry_void();
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_commit_info(sb, type);
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* This is difficult. We have to lock quota inode and start transaction
+ * in this function but we don't want to take the penalty of exlusive
+ * quota file lock when we are just going to use cached structures. So
+ * we just take read lock check whether we have dquot cached and if so,
+ * we don't have to take the write lock... */
+static int ocfs2_dquot_initialize(struct inode *inode, int type)
+{
+	handle_t *handle = NULL;
+	int status = 0;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+	int exclusive = 0;
+	int cnt;
+	qid_t id;
+
+	mlog_entry_void();
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 0);
+		if (status < 0)
+			goto out;
+		/* This is just a performance optimization not a reliable test.
+		 * Since we hold an inode lock, noone can actually release
+		 * the structure until we are finished with initialization. */
+		if (inode->i_dquot[cnt] != NODQUOT) {
+			ocfs2_unlock_global_qf(oinfo, 0);
+			continue;
+		}
+		/* When we have inode lock, we know that no dquot_release() can
+		 * run and thus we can safely check whether we need to
+		 * read+modify global file to get quota information or whether
+		 * our node already has it. */
+		if (cnt == USRQUOTA)
+			id = inode->i_uid;
+		else if (cnt == GRPQUOTA)
+			id = inode->i_gid;
+		else
+			BUG();
+		/* Obtain exclusion from quota off... */
+		down_write(&sb_dqopt(sb)->dqptr_sem);
+		exclusive = !dquot_is_cached(sb, id, cnt);
+		up_write(&sb_dqopt(sb)->dqptr_sem);
+		if (exclusive) {
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0) {
+				exclusive = 0;
+				mlog_errno(status);
+				goto out_ilock;
+			}
+			handle = ocfs2_start_trans(OCFS2_SB(sb),
+					ocfs2_calc_qinit_credits(sb, cnt));
+			if (IS_ERR(handle)) {
+				status = PTR_ERR(handle);
+				mlog_errno(status);
+				goto out_ilock;
+			}
+		}
+		dquot_initialize(inode, cnt);
+		if (exclusive) {
+			ocfs2_commit_trans(OCFS2_SB(sb), handle);
+			ocfs2_unlock_global_qf(oinfo, 1);
+		}
+		ocfs2_unlock_global_qf(oinfo, 0);
+	}
+	mlog_exit(0);
+	return 0;
+out_ilock:
+	if (exclusive)
+		ocfs2_unlock_global_qf(oinfo, 1);
+	ocfs2_unlock_global_qf(oinfo, 0);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_dquot_drop_slow(struct inode *inode)
+{
+	int status = 0;
+	int cnt;
+	int got_lock[MAXQUOTAS] = {0, 0};
+	handle_t *handle;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 1);
+		if (status < 0)
+			goto out;
+		got_lock[cnt] = 1;
+	}
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+			ocfs2_calc_qinit_credits(sb, GRPQUOTA));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	dquot_drop(inode);
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (got_lock[cnt]) {
+			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+			ocfs2_unlock_global_qf(oinfo, 1);
+		}
+	return status;
+}
+
+/* See the comment before ocfs2_dquot_initialize. */
+static int ocfs2_dquot_drop(struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+	int exclusive = 0;
+	int cnt;
+	int got_lock[MAXQUOTAS] = {0, 0};
+
+	mlog_entry_void();
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 0);
+		if (status < 0)
+			goto out;
+		got_lock[cnt] = 1;
+	}
+	/* Lock against anyone releasing references so that when when we check
+	 * we know we are not going to be last ones to release dquot */
+	down_write(&sb_dqopt(sb)->dqptr_sem);
+	/* Urgh, this is a terrible hack :( */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt] != NODQUOT &&
+		    atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
+			exclusive = 1;
+			break;
+		}
+	}
+	if (!exclusive)
+		dquot_drop_locked(inode);
+	up_write(&sb_dqopt(sb)->dqptr_sem);
+out:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (got_lock[cnt]) {
+			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+			ocfs2_unlock_global_qf(oinfo, 0);
+		}
+	/* In case we bailed out because we had to do expensive locking
+	 * do it now... */
+	if (exclusive)
+		status = ocfs2_dquot_drop_slow(inode);
+	mlog_exit(status);
+	return status;
+}
+
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+	struct ocfs2_dquot *dquot =
+				kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+
+	if (!dquot)
+		return NULL;
+	return &dquot->dq_dquot;
+}
+
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{
+	kmem_cache_free(ocfs2_dquot_cachep, dquot);
+}
+
+struct dquot_operations ocfs2_quota_operations = {
+	.initialize	= ocfs2_dquot_initialize,
+	.drop		= ocfs2_dquot_drop,
+	.alloc_space	= dquot_alloc_space,
+	.alloc_inode	= dquot_alloc_inode,
+	.free_space	= dquot_free_space,
+	.free_inode	= dquot_free_inode,
+	.transfer	= dquot_transfer,
+	.write_dquot	= ocfs2_write_dquot,
+	.acquire_dquot	= ocfs2_acquire_dquot,
+	.release_dquot	= ocfs2_release_dquot,
+	.mark_dirty	= ocfs2_mark_dquot_dirty,
+	.write_info	= ocfs2_write_info,
+	.alloc_dquot	= ocfs2_alloc_dquot,
+	.destroy_dquot	= ocfs2_destroy_dquot,
+};
+
+int ocfs2_quota_setup(void)
+{
+	ocfs2_quota_wq = create_workqueue("o2quot");
+	if (!ocfs2_quota_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void ocfs2_quota_shutdown(void)
+{
+	if (ocfs2_quota_wq) {
+		flush_workqueue(ocfs2_quota_wq);
+		destroy_workqueue(ocfs2_quota_wq);
+		ocfs2_quota_wq = NULL;
+	}
+}

+ 1253 - 0
fs/ocfs2/quota_local.c

@@ -0,0 +1,1253 @@
+/*
+ *  Implementation of operations over local quota file
+ */
+
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+
+/* Number of local quota structures per block */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+	return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
+		sizeof(struct ocfs2_local_disk_dqblk));
+}
+
+/* Number of blocks with entries in one chunk */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+	return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+		 OCFS2_QBLK_RESERVED_SPACE) << 3) /
+	       ol_quota_entries_per_block(sb);
+}
+
+/* Number of entries in a chunk bitmap */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+	return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb);
+}
+
+/* Offset of the chunk in quota file */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+	/* 1 block for local quota file info, 1 block per chunk for chunk info */
+	return 1 + (ol_chunk_blocks(sb) + 1) * c;
+}
+
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return ol_quota_chunk_block(sb, c) + 1 + off / epb;
+}
+
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Offset of the dquot structure in the quota file */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+	return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+	       ol_dqblk_block_off(sb, c, off);
+}
+
+/* Compute block number from given offset */
+static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
+{
+	return off >> sb->s_blocksize_bits;
+}
+
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+	return off & ((1 << sb->s_blocksize_bits) - 1);
+}
+
+/* Compute offset in the chunk of a structure with the given offset */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return ((off >> sb->s_blocksize_bits) -
+			ol_quota_chunk_block(sb, c) - 1) * epb
+	       + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
+		 sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Write bufferhead into the fs */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+		void (*modify)(struct buffer_head *, void *), void *private)
+{
+	struct super_block *sb = inode->i_sb;
+	handle_t *handle;
+	int status;
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		return status;
+	}
+	status = ocfs2_journal_access_dq(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_commit_trans(OCFS2_SB(sb), handle);
+		return status;
+	}
+	lock_buffer(bh);
+	modify(bh, private);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_commit_trans(OCFS2_SB(sb), handle);
+		return status;
+	}
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+	return 0;
+}
+
+/* Check whether we understand format of quota files */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+	unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+	unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+	unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+	unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					GROUP_QUOTA_SYSTEM_INODE };
+	struct buffer_head *bh = NULL;
+	struct inode *linode = sb_dqopt(sb)->files[type];
+	struct inode *ginode = NULL;
+	struct ocfs2_disk_dqheader *dqhead;
+	int status, ret = 0;
+
+	/* First check whether we understand local quota file */
+	status = ocfs2_read_quota_block(linode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+			type);
+		goto out_err;
+	}
+	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+	if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
+		mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+			" type=%d\n", le32_to_cpu(dqhead->dqh_magic),
+			lmagics[type], type);
+		goto out_err;
+	}
+	if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
+		mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+			" type=%d\n", le32_to_cpu(dqhead->dqh_version),
+			lversions[type], type);
+		goto out_err;
+	}
+	brelse(bh);
+	bh = NULL;
+
+	/* Next check whether we understand global quota file */
+	ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+						OCFS2_INVALID_SLOT);
+	if (!ginode) {
+		mlog(ML_ERROR, "cannot get global quota file inode "
+				"(type=%d)\n", type);
+		goto out_err;
+	}
+	/* Since the header is read only, we don't care about locking */
+	status = ocfs2_read_quota_block(ginode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read global quota file header "
+				"(type=%d)\n", type);
+		goto out_err;
+	}
+	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+	if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
+		mlog(ML_ERROR, "global quota file magic does not match "
+			"(%u != %u), type=%d\n",
+			le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
+		goto out_err;
+	}
+	if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
+		mlog(ML_ERROR, "global quota file version does not match "
+			"(%u != %u), type=%d\n",
+			le32_to_cpu(dqhead->dqh_version), gversions[type],
+			type);
+		goto out_err;
+	}
+
+	ret = 1;
+out_err:
+	brelse(bh);
+	iput(ginode);
+	return ret;
+}
+
+/* Release given list of quota file chunks */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+	struct ocfs2_quota_chunk *pos, *next;
+
+	list_for_each_entry_safe(pos, next, head, qc_chunk) {
+		list_del(&pos->qc_chunk);
+		brelse(pos->qc_headerbh);
+		kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
+	}
+}
+
+/* Load quota bitmaps into memory */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+			struct ocfs2_local_disk_dqinfo *ldinfo,
+			struct list_head *head)
+{
+	struct ocfs2_quota_chunk *newchunk;
+	int i, status;
+
+	INIT_LIST_HEAD(head);
+	for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
+		newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+		if (!newchunk) {
+			ocfs2_release_local_quota_bitmaps(head);
+			return -ENOMEM;
+		}
+		newchunk->qc_num = i;
+		newchunk->qc_headerbh = NULL;
+		status = ocfs2_read_quota_block(inode,
+				ol_quota_chunk_block(inode->i_sb, i),
+				&newchunk->qc_headerbh);
+		if (status) {
+			mlog_errno(status);
+			kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
+			ocfs2_release_local_quota_bitmaps(head);
+			return status;
+		}
+		list_add_tail(&newchunk->qc_chunk, head);
+	}
+	return 0;
+}
+
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+	struct mem_dqinfo *info = private;
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+
+	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+						OCFS2_LOCAL_INFO_OFF);
+	spin_lock(&dq_data_lock);
+	ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+	ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
+	ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
+	spin_unlock(&dq_data_lock);
+}
+
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+				    struct ocfs2_local_disk_chunk *dchunk,
+				    int chunk,
+				    struct list_head *head)
+{
+	struct ocfs2_recovery_chunk *rc;
+
+	rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
+	if (!rc)
+		return -ENOMEM;
+	rc->rc_chunk = chunk;
+	rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+	if (!rc->rc_bitmap) {
+		kfree(rc);
+		return -ENOMEM;
+	}
+	memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
+	       (ol_chunk_entries(sb) + 7) >> 3);
+	list_add_tail(&rc->rc_list, head);
+	return 0;
+}
+
+static void free_recovery_list(struct list_head *head)
+{
+	struct ocfs2_recovery_chunk *next;
+	struct ocfs2_recovery_chunk *rchunk;
+
+	list_for_each_entry_safe(rchunk, next, head, rc_list) {
+		list_del(&rchunk->rc_list);
+		kfree(rchunk->rc_bitmap);
+		kfree(rchunk);
+	}
+}
+
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+	int type;
+
+	for (type = 0; type < MAXQUOTAS; type++)
+		free_recovery_list(&(rec->r_list[type]));
+	kfree(rec);
+}
+
+/* Load entries in our quota file we have to recover*/
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+				     struct ocfs2_local_disk_dqinfo *ldinfo,
+				     int type,
+				     struct list_head *head)
+{
+	struct super_block *sb = lqinode->i_sb;
+	struct buffer_head *hbh;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
+	int status = 0;
+
+	for (i = 0; i < chunks; i++) {
+		hbh = NULL;
+		status = ocfs2_read_quota_block(lqinode,
+						ol_quota_chunk_block(sb, i),
+						&hbh);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+		if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
+			status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
+		brelse(hbh);
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		free_recovery_list(head);
+	return status;
+}
+
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+	int type;
+	struct ocfs2_quota_recovery *rec;
+
+	rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
+	if (!rec)
+		return NULL;
+	for (type = 0; type < MAXQUOTAS; type++)
+		INIT_LIST_HEAD(&(rec->r_list[type]));
+	return rec;
+}
+
+/* Load information we need for quota recovery into memory */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+						struct ocfs2_super *osb,
+						int slot_num)
+{
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	struct super_block *sb = osb->sb;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct inode *lqinode;
+	struct buffer_head *bh;
+	int type;
+	int status = 0;
+	struct ocfs2_quota_recovery *rec;
+
+	mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+	rec = ocfs2_alloc_quota_recovery();
+	if (!rec)
+		return ERR_PTR(-ENOMEM);
+	/* First init... */
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		/* At this point, journal of the slot is already replayed so
+		 * we can trust metadata and data of the quota file */
+		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+		if (!lqinode) {
+			status = -ENOENT;
+			goto out;
+		}
+		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+					       OCFS2_META_LOCK_RECOVERY);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_put;
+		}
+		/* Now read local header */
+		bh = NULL;
+		status = ocfs2_read_quota_block(lqinode, 0, &bh);
+		if (status) {
+			mlog_errno(status);
+			mlog(ML_ERROR, "failed to read quota file info header "
+				"(slot=%d type=%d)\n", slot_num, type);
+			goto out_lock;
+		}
+		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+							OCFS2_LOCAL_INFO_OFF);
+		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+						   &rec->r_list[type]);
+		brelse(bh);
+out_lock:
+		ocfs2_inode_unlock(lqinode, 1);
+out_put:
+		iput(lqinode);
+		if (status < 0)
+			break;
+	}
+out:
+	if (status < 0) {
+		ocfs2_free_quota_recovery(rec);
+		rec = ERR_PTR(status);
+	}
+	return rec;
+}
+
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+					  int type,
+					  struct ocfs2_quota_recovery *rec)
+{
+	struct super_block *sb = lqinode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_local_disk_chunk *dchunk;
+	struct ocfs2_local_disk_dqblk *dqblk;
+	struct dquot *dquot;
+	handle_t *handle;
+	struct buffer_head *hbh = NULL, *qbh = NULL;
+	int status = 0;
+	int bit, chunk;
+	struct ocfs2_recovery_chunk *rchunk, *next;
+	qsize_t spacechange, inodechange;
+
+	mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+
+	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+		chunk = rchunk->rc_chunk;
+		hbh = NULL;
+		status = ocfs2_read_quota_block(lqinode,
+						ol_quota_chunk_block(sb, chunk),
+						&hbh);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+		for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+			qbh = NULL;
+			status = ocfs2_read_quota_block(lqinode,
+						ol_dqblk_block(sb, chunk, bit),
+						&qbh);
+			if (status) {
+				mlog_errno(status);
+				break;
+			}
+			dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+				ol_dqblk_block_off(sb, chunk, bit));
+			dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+			if (!dquot) {
+				status = -EIO;
+				mlog(ML_ERROR, "Failed to get quota structure "
+				     "for id %u, type %d. Cannot finish quota "
+				     "file recovery.\n",
+				     (unsigned)le64_to_cpu(dqblk->dqb_id),
+				     type);
+				goto out_put_bh;
+			}
+			handle = ocfs2_start_trans(OCFS2_SB(sb),
+						   OCFS2_QSYNC_CREDITS);
+			if (IS_ERR(handle)) {
+				status = PTR_ERR(handle);
+				mlog_errno(status);
+				goto out_put_dquot;
+			}
+			mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+			spin_lock(&dq_data_lock);
+			/* Add usage from quota entry into quota changes
+			 * of our node. Auxiliary variables are important
+			 * due to signedness */
+			spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+			inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+			dquot->dq_dqb.dqb_curspace += spacechange;
+			dquot->dq_dqb.dqb_curinodes += inodechange;
+			spin_unlock(&dq_data_lock);
+			/* We want to drop reference held by the crashed
+			 * node. Since we have our own reference we know
+			 * global structure actually won't be freed. */
+			status = ocfs2_global_release_dquot(dquot);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_commit;
+			}
+			/* Release local quota file entry */
+			status = ocfs2_journal_access_dq(handle, lqinode,
+					qbh, OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_commit;
+			}
+			lock_buffer(qbh);
+			WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
+			ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+			le32_add_cpu(&dchunk->dqc_free, 1);
+			unlock_buffer(qbh);
+			status = ocfs2_journal_dirty(handle, qbh);
+			if (status < 0)
+				mlog_errno(status);
+out_commit:
+			mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+			ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_put_dquot:
+			dqput(dquot);
+out_put_bh:
+			brelse(qbh);
+			if (status < 0)
+				break;
+		}
+		brelse(hbh);
+		list_del(&rchunk->rc_list);
+		kfree(rchunk->rc_bitmap);
+		kfree(rchunk);
+		if (status < 0)
+			break;
+	}
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	if (status < 0)
+		free_recovery_list(&(rec->r_list[type]));
+	mlog_exit(status);
+	return status;
+}
+
+/* Recover local quota files for given node different from us */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+				struct ocfs2_quota_recovery *rec,
+				int slot_num)
+{
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	struct super_block *sb = osb->sb;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct buffer_head *bh;
+	handle_t *handle;
+	int type;
+	int status = 0;
+	struct inode *lqinode;
+	unsigned int flags;
+
+	mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (list_empty(&(rec->r_list[type])))
+			continue;
+		mlog(0, "Recovering quota in slot %d\n", slot_num);
+		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+		if (!lqinode) {
+			status = -ENOENT;
+			goto out;
+		}
+		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+						       OCFS2_META_LOCK_NOQUEUE);
+		/* Someone else is holding the lock? Then he must be
+		 * doing the recovery. Just skip the file... */
+		if (status == -EAGAIN) {
+			mlog(ML_NOTICE, "skipping quota recovery for slot %d "
+			     "because quota file is locked.\n", slot_num);
+			status = 0;
+			goto out_put;
+		} else if (status < 0) {
+			mlog_errno(status);
+			goto out_put;
+		}
+		/* Now read local header */
+		bh = NULL;
+		status = ocfs2_read_quota_block(lqinode, 0, &bh);
+		if (status) {
+			mlog_errno(status);
+			mlog(ML_ERROR, "failed to read quota file info header "
+				"(slot=%d type=%d)\n", slot_num, type);
+			goto out_lock;
+		}
+		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+							OCFS2_LOCAL_INFO_OFF);
+		/* Is recovery still needed? */
+		flags = le32_to_cpu(ldinfo->dqi_flags);
+		if (!(flags & OLQF_CLEAN))
+			status = ocfs2_recover_local_quota_file(lqinode,
+								type,
+								rec);
+		/* We don't want to mark file as clean when it is actually
+		 * active */
+		if (slot_num == osb->slot_num)
+			goto out_bh;
+		/* Mark quota file as clean if we are recovering quota file of
+		 * some other node. */
+		handle = ocfs2_start_trans(osb, 1);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto out_bh;
+		}
+		status = ocfs2_journal_access_dq(handle, lqinode, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_trans;
+		}
+		lock_buffer(bh);
+		ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+		unlock_buffer(bh);
+		status = ocfs2_journal_dirty(handle, bh);
+		if (status < 0)
+			mlog_errno(status);
+out_trans:
+		ocfs2_commit_trans(osb, handle);
+out_bh:
+		brelse(bh);
+out_lock:
+		ocfs2_inode_unlock(lqinode, 1);
+out_put:
+		iput(lqinode);
+		if (status < 0)
+			break;
+	}
+out:
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	kfree(rec);
+	return status;
+}
+
+/* Read information header from quota file */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	int status;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_quota_recovery *rec;
+	int locked = 0;
+
+	info->dqi_maxblimit = 0x7fffffffffffffffLL;
+	info->dqi_maxilimit = 0x7fffffffffffffffLL;
+	oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+	if (!oinfo) {
+		mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+			       " info.");
+		goto out_err;
+	}
+	info->dqi_priv = oinfo;
+	oinfo->dqi_type = type;
+	INIT_LIST_HEAD(&oinfo->dqi_chunk);
+	oinfo->dqi_rec = NULL;
+	oinfo->dqi_lqi_bh = NULL;
+	oinfo->dqi_ibh = NULL;
+
+	status = ocfs2_global_read_info(sb, type);
+	if (status < 0)
+		goto out_err;
+
+	status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	locked = 1;
+
+	/* Now read local header */
+	status = ocfs2_read_quota_block(lqinode, 0, &bh);
+	if (status) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read quota file info header "
+			"(type=%d)\n", type);
+		goto out_err;
+	}
+	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+						OCFS2_LOCAL_INFO_OFF);
+	info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+	oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+	oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+	oinfo->dqi_ibh = bh;
+
+	/* We crashed when using local quota file? */
+	if (!(info->dqi_flags & OLQF_CLEAN)) {
+		rec = OCFS2_SB(sb)->quota_rec;
+		if (!rec) {
+			rec = ocfs2_alloc_quota_recovery();
+			if (!rec) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto out_err;
+			}
+			OCFS2_SB(sb)->quota_rec = rec;
+		}
+
+		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+                                                   &rec->r_list[type]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_err;
+		}
+	}
+
+	status = ocfs2_load_local_quota_bitmaps(lqinode,
+						ldinfo,
+						&oinfo->dqi_chunk);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	/* Now mark quota file as used */
+	info->dqi_flags &= ~OLQF_CLEAN;
+	status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	return 0;
+out_err:
+	if (oinfo) {
+		iput(oinfo->dqi_gqinode);
+		ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+		ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+		brelse(oinfo->dqi_lqi_bh);
+		if (locked)
+			ocfs2_inode_unlock(lqinode, 1);
+		ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+		kfree(oinfo);
+	}
+	brelse(bh);
+	return -1;
+}
+
+/* Write local info to quota file */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
+						->dqi_ibh;
+	int status;
+
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
+				 info);
+	if (status < 0) {
+		mlog_errno(status);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Release info from memory */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int mark_clean = 1, len;
+	int status;
+
+	/* At this point we know there are no more dquots and thus
+	 * even if there's some sync in the pdflush queue, it won't
+	 * find any dquots and return without doing anything */
+	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+	iput(oinfo->dqi_gqinode);
+	ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+	ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+	list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+		dchunk = (struct ocfs2_local_disk_chunk *)
+					(chunk->qc_headerbh->b_data);
+		if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+			len = ol_chunk_entries(sb);
+		} else {
+			len = (oinfo->dqi_blocks -
+			       ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+			      * ol_quota_entries_per_block(sb);
+		}
+		/* Not all entries free? Bug! */
+		if (le32_to_cpu(dchunk->dqc_free) != len) {
+			mlog(ML_ERROR, "releasing quota file with used "
+					"entries (type=%d)\n", type);
+			mark_clean = 0;
+		}
+	}
+	ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+
+	/* dqonoff_mutex protects us against racing with recovery thread... */
+	if (oinfo->dqi_rec) {
+		ocfs2_free_quota_recovery(oinfo->dqi_rec);
+		mark_clean = 0;
+	}
+
+	if (!mark_clean)
+		goto out;
+
+	/* Mark local file as clean */
+	info->dqi_flags |= OLQF_CLEAN;
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+				 oinfo->dqi_ibh,
+				 olq_update_info,
+				 info);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+out:
+	ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+	brelse(oinfo->dqi_ibh);
+	brelse(oinfo->dqi_lqi_bh);
+	kfree(oinfo);
+	return 0;
+}
+
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+	struct ocfs2_dquot *od = private;
+	struct ocfs2_local_disk_dqblk *dqblk;
+	struct super_block *sb = od->dq_dquot.dq_sb;
+
+	dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
+		+ ol_dqblk_block_offset(sb, od->dq_local_off));
+
+	dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
+	spin_lock(&dq_data_lock);
+	dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
+					  od->dq_origspace);
+	dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
+					  od->dq_originodes);
+	spin_unlock(&dq_data_lock);
+	mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
+	     od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
+	     (long long)le64_to_cpu(dqblk->dqb_inodemod));
+}
+
+/* Write dquot to local quota file */
+static int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct buffer_head *bh = NULL;
+	int status;
+
+	status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
+				    ol_dqblk_file_block(sb, od->dq_local_off),
+				    &bh);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
+				 olq_set_dquot, od);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+out:
+	brelse(bh);
+	return status;
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int found = 0, len;
+
+	list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+		dchunk = (struct ocfs2_local_disk_chunk *)
+						chunk->qc_headerbh->b_data;
+		if (le32_to_cpu(dchunk->dqc_free) > 0) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		return NULL;
+
+	if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+		len = ol_chunk_entries(sb);
+	} else {
+		len = (oinfo->dqi_blocks -
+		       ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+		      * ol_quota_entries_per_block(sb);
+	}
+
+	found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+	/* We failed? */
+	if (found == len) {
+		mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+		     " entries free (type=%d)\n", chunk->qc_num,
+		     le32_to_cpu(dchunk->dqc_free), type);
+		return ERR_PTR(-EIO);
+	}
+	*offset = found;
+	return chunk;
+}
+
+/* Add new chunk to the local quota file */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+							struct super_block *sb,
+							int type,
+							int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_quota_chunk *chunk = NULL;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int status;
+	handle_t *handle;
+	struct buffer_head *bh = NULL;
+	u64 p_blkno;
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode,
+				       lqinode->i_size + 2 * sb->s_blocksize,
+				       lqinode->i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  lqinode->i_size + 2 * sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+	if (!chunk) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_dq(handle, lqinode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(bh);
+	dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+	memset(dchunk->dqc_bitmap, 0,
+	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+	       OCFS2_QBLK_RESERVED_SPACE);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	oinfo->dqi_blocks += 2;
+	oinfo->dqi_chunks++;
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+	chunk->qc_num = list_entry(chunk->qc_chunk.prev,
+				   struct ocfs2_quota_chunk,
+				   qc_chunk)->qc_num + 1;
+	chunk->qc_headerbh = bh;
+	*offset = 0;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	brelse(bh);
+	kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+	return ERR_PTR(status);
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+						       struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_local_disk_chunk *dchunk;
+	int epb = ol_quota_entries_per_block(sb);
+	unsigned int chunk_blocks;
+	int status;
+	handle_t *handle;
+
+	if (list_empty(&oinfo->dqi_chunk))
+		return ocfs2_local_quota_add_chunk(sb, type, offset);
+	/* Is the last chunk full? */
+	chunk = list_entry(oinfo->dqi_chunk.prev,
+			struct ocfs2_quota_chunk, qc_chunk);
+	chunk_blocks = oinfo->dqi_blocks -
+			ol_quota_chunk_block(sb, chunk->qc_num) - 1;
+	if (ol_chunk_blocks(sb) == chunk_blocks)
+		return ocfs2_local_quota_add_chunk(sb, type, offset);
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode,
+				       lqinode->i_size + sb->s_blocksize,
+				       lqinode->i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  lqinode->i_size + sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
+				 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
+	lock_buffer(chunk->qc_headerbh);
+	le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
+	unlock_buffer(chunk->qc_headerbh);
+	status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	oinfo->dqi_blocks++;
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	*offset = chunk_blocks * epb;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	return ERR_PTR(status);
+}
+
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+	int *offset = private;
+	struct ocfs2_local_disk_chunk *dchunk;
+
+	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+	ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+	le32_add_cpu(&dchunk->dqc_free, -1);
+}
+
+/* Create dquot in the local file for given id */
+static int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	int offset;
+	int status;
+
+	chunk = ocfs2_find_free_entry(sb, type, &offset);
+	if (!chunk) {
+		chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+		if (IS_ERR(chunk))
+			return PTR_ERR(chunk);
+	} else if (IS_ERR(chunk)) {
+		return PTR_ERR(chunk);
+	}
+	od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+	od->dq_chunk = chunk;
+
+	/* Initialize dquot structure on disk */
+	status = ocfs2_local_write_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* Mark structure as allocated */
+	status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+				 &offset);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+out:
+	return status;
+}
+
+/* Create entry in local file for dquot, load data from the global file */
+static int ocfs2_local_read_dquot(struct dquot *dquot)
+{
+	int status;
+
+	mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
+
+	status = ocfs2_global_read_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	/* Now create entry in the local quota file */
+	status = ocfs2_create_local_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	mlog_exit(0);
+	return 0;
+out_err:
+	mlog_exit(status);
+	return status;
+}
+
+/* Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and obtained exclusive lock for global
+ * quota file. */
+static int ocfs2_local_release_dquot(struct dquot *dquot)
+{
+	int status;
+	int type = dquot->dq_type;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int offset;
+	handle_t *handle = journal_current_handle();
+
+	BUG_ON(!handle);
+	/* First write all local changes to global file */
+	status = ocfs2_global_release_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
+			od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
+					     od->dq_local_off);
+	dchunk = (struct ocfs2_local_disk_chunk *)
+			(od->dq_chunk->qc_headerbh->b_data);
+	/* Mark structure as freed */
+	lock_buffer(od->dq_chunk->qc_headerbh);
+	ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+	le32_add_cpu(&dchunk->dqc_free, 1);
+	unlock_buffer(od->dq_chunk->qc_headerbh);
+	status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = 0;
+out:
+	/* Clear the read bit so that next time someone uses this
+	 * dquot he reads fresh info from disk and allocates local
+	 * dquot structure */
+	clear_bit(DQ_READ_B, &dquot->dq_flags);
+	return status;
+}
+
+static struct quota_format_ops ocfs2_format_ops = {
+	.check_quota_file	= ocfs2_local_check_quota_file,
+	.read_file_info		= ocfs2_local_read_info,
+	.write_file_info	= ocfs2_global_write_info,
+	.free_file_info		= ocfs2_local_free_info,
+	.read_dqblk		= ocfs2_local_read_dquot,
+	.commit_dqblk		= ocfs2_local_write_dquot,
+	.release_dqblk		= ocfs2_local_release_dquot,
+};
+
+struct quota_format_type ocfs2_quota_format = {
+	.qf_fmt_id = QFMT_OCFS2,
+	.qf_ops = &ocfs2_format_ops,
+	.qf_owner = THIS_MODULE
+};

+ 22 - 54
fs/ocfs2/resize.c

@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
 	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
 		   new_clusters, first_new_cluster);
 		   new_clusters, first_new_cluster);
 
 
-	ret = ocfs2_journal_access(handle, bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	}
 	}
 
 
 	/* update the inode accordingly. */
 	/* update the inode accordingly. */
-	ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out_rollback;
 		goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 
 
 	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
 	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
 
+	/* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+	 * so any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
 	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
 				 ocfs2_group_bitmap_size(osb->sb) * 8) {
 				 ocfs2_group_bitmap_size(osb->sb) * 8) {
 		mlog(ML_ERROR, "The disk is too old and small. "
 		mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 		goto out_unlock;
 		goto out_unlock;
 	}
 	}
 
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
-		ret = -EIO;
-		goto out_unlock;
-	}
-
 	first_new_cluster = le32_to_cpu(fe->i_clusters);
 	first_new_cluster = le32_to_cpu(fe->i_clusters);
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 					      first_new_cluster - 1);
 
 
-	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out_unlock;
 		goto out_unlock;
 	}
 	}
-
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
 
 
-	ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
 	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
 	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
 	if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
 		le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
 		le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
 				 struct buffer_head *group_bh)
 				 struct buffer_head *group_bh)
 {
 {
 	int ret;
 	int ret;
-	struct ocfs2_group_desc *gd;
+	struct ocfs2_group_desc *gd =
+		(struct ocfs2_group_desc *)group_bh->b_data;
 	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
 	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
-	unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
-				le16_to_cpu(di->id2.i_chain.cl_bpc);
-
 
 
-	gd = (struct ocfs2_group_desc *)group_bh->b_data;
+	ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+	if (ret)
+		goto out;
 
 
-	ret = -EIO;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd))
-		mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno));
-	else if (di->i_blkno != gd->bg_parent_dinode)
-		mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
-		     "pointer (%llu, expected %llu)\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-		     (unsigned long long)le64_to_cpu(di->i_blkno));
-	else if (le16_to_cpu(gd->bg_bits) > max_bits)
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits));
-	else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "claims that %u are free\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     le16_to_cpu(gd->bg_free_bits_count));
-	else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "max bitmap bits of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     8 * le16_to_cpu(gd->bg_size));
-	else if (le16_to_cpu(gd->bg_chain) != input->chain)
+	ret = -EINVAL;
+	if (le16_to_cpu(gd->bg_chain) != input->chain)
 		mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
 		mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
 		     "while input has %u set.\n",
 		     "while input has %u set.\n",
 		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
 		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
 	else
 	else
 		ret = 0;
 		ret = 0;
 
 
+out:
 	return ret;
 	return ret;
 }
 }
 
 
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	cl = &fe->id2.i_chain;
 	cl = &fe->id2.i_chain;
 	cr = &cl->cl_recs[input->chain];
 	cr = &cl->cl_recs[input->chain];
 
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out_commit;
 		goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_commit;
 		goto out_commit;
 	}
 	}
 
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out_commit;
 		goto out_commit;

+ 2 - 2
fs/ocfs2/slot_map.c

@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
 	 */
 	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
 	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-				OCFS2_BH_IGNORE_CACHE);
+				OCFS2_BH_IGNORE_CACHE, NULL);
 	if (ret == 0) {
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
 		ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 
 
 		bh = NULL;  /* Acquire a fresh bh */
 		bh = NULL;  /* Acquire a fresh bh */
 		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
 		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE);
+					   OCFS2_BH_IGNORE_CACHE, NULL);
 		if (status < 0) {
 		if (status < 0) {
 			mlog_errno(status);
 			mlog_errno(status);
 			goto bail;
 			goto bail;

+ 226 - 137
fs/ocfs2/suballoc.c

@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 #include "ocfs2.h"
 
 
 #include "alloc.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "inode.h"
 #include "journal.h"
 #include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 }
 }
 
 
-/* somewhat more expensive than our other checks, so use sparingly. */
-int ocfs2_check_group_descriptor(struct super_block *sb,
-				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd)
+#define do_error(fmt, ...)						\
+	do{								\
+		if (clean_error)					\
+			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
+		else							\
+			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
+	} while (0)
+
+static int ocfs2_validate_gd_self(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int clean_error)
 {
 {
-	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
 
 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
-		return -EIO;
+		do_error("Group descriptor #%llu has bad signature %.*s",
+			 (unsigned long long)bh->b_blocknr, 7,
+			 gd->bg_signature);
+		return -EINVAL;
 	}
 	}
 
 
+	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+		do_error("Group descriptor #%llu has an invalid bg_blkno "
+			 "of %llu",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+		do_error("Group descriptor #%llu has an invalid "
+			 "fs_generation of #%u",
+			 (unsigned long long)bh->b_blocknr,
+			 le32_to_cpu(gd->bg_generation));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "claims that %u are free",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 le16_to_cpu(gd->bg_free_bits_count));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "max bitmap bits of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 8 * le16_to_cpu(gd->bg_size));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+				    struct ocfs2_dinode *di,
+				    struct buffer_head *bh,
+				    int clean_error)
+{
+	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
 	if (di->i_blkno != gd->bg_parent_dinode) {
 	if (di->i_blkno != gd->bg_parent_dinode) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad parent "
-			    "pointer (%llu, expected %llu)",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-			    (unsigned long long)le64_to_cpu(di->i_blkno));
-		return -EIO;
+		do_error("Group descriptor #%llu has bad parent "
+			 "pointer (%llu, expected %llu)",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+			 (unsigned long long)le64_to_cpu(di->i_blkno));
+		return -EINVAL;
 	}
 	}
 
 
 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits));
-		return -EIO;
+		do_error("Group descriptor #%llu has bit count of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits));
+		return -EINVAL;
 	}
 	}
 
 
 	if (le16_to_cpu(gd->bg_chain) >=
 	if (le16_to_cpu(gd->bg_chain) >=
 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_chain));
-		return -EIO;
+		do_error("Group descriptor #%llu has bad chain %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_chain));
+		return -EINVAL;
 	}
 	}
 
 
-	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "claims that %u are free",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    le16_to_cpu(gd->bg_free_bits_count));
-		return -EIO;
-	}
+	return 0;
+}
 
 
-	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "max bitmap bits of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    8 * le16_to_cpu(gd->bg_size));
-		return -EIO;
+#undef do_error
+
+/*
+ * This version only prints errors.  It does not fail the filesystem, and
+ * exists only for resize.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc) {
+		mlog(ML_ERROR,
+		     "Checksum failed for group descriptor %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+	} else
+		rc = ocfs2_validate_gd_self(sb, bh, 1);
+	if (!rc)
+		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+
+	return rc;
+}
+
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+					   struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	mlog(0, "Validating group descriptor %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	return ocfs2_validate_gd_self(sb, bh, 0);
+}
+
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+			      ocfs2_validate_group_descriptor);
+	if (rc)
+		goto out;
+
+	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+	if (rc) {
+		brelse(tmp);
+		goto out;
 	}
 	}
 
 
-	return 0;
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc;
 }
 }
 
 
 static int ocfs2_block_group_fill(handle_t *handle,
 static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
 		goto bail;
 		goto bail;
 	}
 	}
 
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      bg_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 bg_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 
 
 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
 
-	status = ocfs2_journal_access(handle, alloc_inode,
-				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode,
+					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	ac->ac_alloc_slot = slot;
 	ac->ac_alloc_slot = slot;
 
 
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read inside
+	 * ocfs2_inode_lock().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
 	int offset, start, found, status = 0;
 	int offset, start, found, status = 0;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
-		return -EIO;
-	}
+	/* Callers got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 
 	found = start = best_offset = best_size = 0;
 	found = start = best_offset = best_size = 0;
 	bitmap = bg->bg_bitmap;
 	bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 
 
 	mlog_entry_void();
 	mlog_entry_void();
 
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* All callers get the descriptor via
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 
 
 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 group_bh,
+					 journal_type);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
 
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
-		status = -EIO;
-		goto out;
-	}
+	/* The caller got these descriptors from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
 
 
 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
 
-	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto out_rollback;
 		goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 		goto out_rollback;
 	}
 	}
 
 
-	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto out_rollback;
 		goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 		goto out_rollback;
 	}
 	}
 
 
-	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto out_rollback;
 		goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
 		bg->bg_next_group = cpu_to_le64(bg_ptr);
 		bg->bg_next_group = cpu_to_le64(bg_ptr);
 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
 	}
 	}
-out:
+
 	mlog_exit(status);
 	mlog_exit(status);
 	return status;
 	return status;
 }
 }
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
 
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		goto out;
 		goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	u16 found;
 	u16 found;
 	struct buffer_head *group_bh = NULL;
 	struct buffer_head *group_bh = NULL;
 	struct ocfs2_group_desc *gd;
 	struct ocfs2_group_desc *gd;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
 	struct inode *alloc_inode = ac->ac_inode;
 	struct inode *alloc_inode = ac->ac_inode;
 
 
-	ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 	if (ret < 0) {
 		mlog_errno(ret);
 		mlog_errno(ret);
 		return ret;
 		return ret;
 	}
 	}
 
 
 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
-		ret = -EIO;
-		goto out;
-	}
-
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
 				  ac->ac_max_block, bit_off, &found);
 				  ac->ac_max_block, bit_off, &found);
 	if (ret < 0) {
 	if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	     bits_wanted, chain,
 	     bits_wanted, chain,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 
 
-	status = ocfs2_read_block(alloc_inode,
-				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe,
+					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
+					     &group_bh);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
 	}
 	}
 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
 
 
 	status = -ENOSPC;
 	status = -ENOSPC;
 	/* for now, the chain search is a bit simplistic. We just use
 	/* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		next_group = le64_to_cpu(bg->bg_next_group);
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		prev_group_bh = group_bh;
 		group_bh = NULL;
 		group_bh = NULL;
-		status = ocfs2_read_block(alloc_inode,
-					  next_group, &group_bh);
+		status = ocfs2_read_group_descriptor(alloc_inode, fe,
+						     next_group, &group_bh);
 		if (status < 0) {
 		if (status < 0) {
 			mlog_errno(status);
 			mlog_errno(status);
 			goto bail;
 			goto bail;
 		}
 		}
 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
-		status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-		if (status) {
-			mlog_errno(status);
-			goto bail;
-		}
 	}
 	}
 	if (status < 0) {
 	if (status < 0) {
 		if (status != -ENOSPC)
 		if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 
 
 	/* Ok, claim our bits now: set the info on dinode, chainlist
 	/* Ok, claim our bits now: set the info on dinode, chainlist
 	 * and then the group */
 	 * and then the group */
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      ac->ac_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle,
+					 alloc_inode,
+					 ac->ac_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 	BUG_ON(!ac->ac_bh);
 	BUG_ON(!ac->ac_bh);
 
 
 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read during
+	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 
 
 	mlog_entry_void();
 	mlog_entry_void();
 
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* The caller got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 
 
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
 
-	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
+					 journal_type);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 
 
 	mlog_entry_void();
 	mlog_entry_void();
 
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+	/* The alloc_bh comes from ocfs2_free_dinode() or
+	 * ocfs2_free_clusters().  The callers have all locked the
+	 * allocator and gotten alloc_bh from the lock call.  This
+	 * validates the dinode buffer.  Any corruption that has happended
+	 * is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
 
 
 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 	     (unsigned long long)bg_blkno, start_bit);
 
 
-	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+					     &group_bh);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;
 	}
 	}
-
 	group = (struct ocfs2_group_desc *) group_bh->b_data;
 	group = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
+
 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
 
 
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 		goto bail;
 		goto bail;
 	}
 	}
 
 
-	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		goto bail;
 		goto bail;

+ 16 - 2
fs/ocfs2/suballoc.h

@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
  * and return that block offset. */
  * and return that block offset. */
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 
 
-/* somewhat more expensive than our other checks, so use sparingly. */
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem.  A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then checking it with this function.  This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
+ */
 int ocfs2_check_group_descriptor(struct super_block *sb,
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
 				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd);
+				 struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh);
+
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **data_ac,

+ 326 - 2
fs/ocfs2/super.c

@@ -41,6 +41,7 @@
 #include <linux/debugfs.h>
 #include <linux/debugfs.h>
 #include <linux/mount.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/seq_file.h>
+#include <linux/quotaops.h>
 
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "ocfs1_fs_compat.h"
 #include "ocfs1_fs_compat.h"
 
 
 #include "alloc.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "dlmglue.h"
 #include "export.h"
 #include "export.h"
 #include "extent_map.h"
 #include "extent_map.h"
@@ -65,10 +67,13 @@
 #include "uptodate.h"
 #include "uptodate.h"
 #include "ver.h"
 #include "ver.h"
 #include "xattr.h"
 #include "xattr.h"
+#include "quota.h"
 
 
 #include "buffer_head_io.h"
 #include "buffer_head_io.h"
 
 
 static struct kmem_cache *ocfs2_inode_cachep = NULL;
 static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
 
 
 /* OCFS2 needs to schedule several differnt types of work which
 /* OCFS2 needs to schedule several differnt types of work which
  * require cluster locking, disk I/O, recovery waits, etc. Since these
  * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
 static void ocfs2_write_super(struct super_block *sb);
 static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
 static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
 
 
 static const struct super_operations ocfs2_sops = {
 static const struct super_operations ocfs2_sops = {
 	.statfs		= ocfs2_statfs,
 	.statfs		= ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
 	.put_super	= ocfs2_put_super,
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
 	.remount_fs	= ocfs2_remount,
 	.show_options   = ocfs2_show_options,
 	.show_options   = ocfs2_show_options,
+	.quota_read	= ocfs2_quota_read,
+	.quota_write	= ocfs2_quota_write,
 };
 };
 
 
 enum {
 enum {
@@ -158,6 +168,10 @@ enum {
 	Opt_user_xattr,
 	Opt_user_xattr,
 	Opt_nouser_xattr,
 	Opt_nouser_xattr,
 	Opt_inode64,
 	Opt_inode64,
+	Opt_acl,
+	Opt_noacl,
+	Opt_usrquota,
+	Opt_grpquota,
 	Opt_err,
 	Opt_err,
 };
 };
 
 
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_inode64, "inode64"},
 	{Opt_inode64, "inode64"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_err, NULL}
 	{Opt_err, NULL}
 };
 };
 
 
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 	return 0;
 	return 0;
 }
 }
 
 
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+	    && (ino == USER_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+		return 0;
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+	    && (ino == GROUP_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+		return 0;
+	return 1;
+}
+
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 {
 {
 	struct inode *new = NULL;
 	struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 
 
 	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
 	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
 	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
 	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
 			ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
 	     i < NUM_SYSTEM_INODES;
 	     i < NUM_SYSTEM_INODES;
 	     i++) {
 	     i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
 			ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 
 
 	/* We're going to/from readonly mode. */
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		/* Disable quota accounting before remounting RO */
+		if (*flags & MS_RDONLY) {
+			ret = ocfs2_susp_quotas(osb, 0);
+			if (ret < 0)
+				goto out;
+		}
 		/* Lock here so the check of HARD_RO and the potential
 		/* Lock here so the check of HARD_RO and the potential
 		 * setting of SOFT_RO is atomic. */
 		 * setting of SOFT_RO is atomic. */
 		spin_lock(&osb->osb_lock);
 		spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		}
 		}
 unlock_osb:
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
 		spin_unlock(&osb->osb_lock);
+		/* Enable quota accounting after remounting RW */
+		if (!ret && !(*flags & MS_RDONLY)) {
+			if (sb_any_quota_suspended(sb))
+				ret = ocfs2_susp_quotas(osb, 1);
+			else
+				ret = ocfs2_enable_quotas(osb);
+			if (ret < 0) {
+				/* Return back changes... */
+				spin_lock(&osb->osb_lock);
+				sb->s_flags |= MS_RDONLY;
+				osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+				spin_unlock(&osb->osb_lock);
+				goto out;
+			}
+		}
 	}
 	}
 
 
 	if (!ret) {
 	if (!ret) {
 		/* Only save off the new mount options in case of a successful
 		/* Only save off the new mount options in case of a successful
 		 * remount. */
 		 * remount. */
+		if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+			parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
 		osb->s_mount_opt = parsed_options.mount_opt;
 		osb->s_mount_opt = parsed_options.mount_opt;
 		osb->s_atime_quantum = parsed_options.atime_quantum;
 		osb->s_atime_quantum = parsed_options.atime_quantum;
 		osb->preferred_slot = parsed_options.slot;
 		osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
 	return 0;
 	return 0;
 }
 }
 
 
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+	int type;
+	struct super_block *sb = osb->sb;
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	int status = 0;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		if (unsuspend)
+			status = vfs_quota_enable(
+					sb_dqopt(sb)->files[type],
+					type, QFMT_OCFS2,
+					DQUOT_SUSPENDED);
+		else
+			status = vfs_quota_disable(sb, type,
+						   DQUOT_SUSPENDED);
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+		     "remount (error = %d).\n", status);
+	return status;
+}
+
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+	struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+	struct super_block *sb = osb->sb;
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	int status;
+	int type;
+
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+							osb->slot_num);
+		if (!inode[type]) {
+			status = -ENOENT;
+			goto out_quota_off;
+		}
+		status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
+						DQUOT_USAGE_ENABLED);
+		if (status < 0)
+			goto out_quota_off;
+	}
+
+	for (type = 0; type < MAXQUOTAS; type++)
+		iput(inode[type]);
+	return 0;
+out_quota_off:
+	ocfs2_disable_quotas(osb);
+	for (type = 0; type < MAXQUOTAS; type++)
+		iput(inode[type]);
+	mlog_errno(status);
+	return status;
+}
+
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+	int type;
+	struct inode *inode;
+	struct super_block *sb = osb->sb;
+
+	/* We mostly ignore errors in this function because there's not much
+	 * we can do when we see them */
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!sb_has_quota_loaded(sb, type))
+			continue;
+		inode = igrab(sb->s_dquot.files[type]);
+		/* Turn off quotas. This will remove all dquot structures from
+		 * memory and so they will be automatically synced to global
+		 * quota files */
+		vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
+					    DQUOT_LIMITS_ENABLED);
+		if (!inode)
+			continue;
+		iput(inode);
+	}
+}
+
+/* Handle quota on quotactl */
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
+			  char *path, int remount)
+{
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+		return -EINVAL;
+
+	if (remount)
+		return 0;	/* Just ignore it has been handled in
+				 * ocfs2_remount() */
+	return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
+				    format_id, DQUOT_LIMITS_ENABLED);
+}
+
+/* Handle quota off quotactl */
+static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+{
+	if (remount)
+		return 0;	/* Ignore now and handle later in
+				 * ocfs2_remount() */
+	return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+
+static struct quotactl_ops ocfs2_quotactl_ops = {
+	.quota_on	= ocfs2_quota_on,
+	.quota_off	= ocfs2_quota_off,
+	.quota_sync	= vfs_quota_sync,
+	.get_info	= vfs_get_dqinfo,
+	.set_info	= vfs_set_dqinfo,
+	.get_dqblk	= vfs_get_dqblk,
+	.set_dqblk	= vfs_set_dqblk,
+};
+
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
 {
 	struct dentry *root;
 	struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	}
 	brelse(bh);
 	brelse(bh);
 	bh = NULL;
 	bh = NULL;
+
+	if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+		parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+
 	osb->s_mount_opt = parsed_options.mount_opt;
 	osb->s_mount_opt = parsed_options.mount_opt;
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
 	osb->preferred_slot = parsed_options.slot;
 	osb->osb_commit_interval = parsed_options.commit_interval;
 	osb->osb_commit_interval = parsed_options.commit_interval;
 	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
 	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
 	osb->local_alloc_bits = osb->local_alloc_default_bits;
 	osb->local_alloc_bits = osb->local_alloc_default_bits;
+	if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
+	    !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+					 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "User quotas were requested, but this "
+		     "filesystem does not have the feature enabled.\n");
+		goto read_super_error;
+	}
+	if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+	    !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+					 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Group quotas were requested, but this "
+		     "filesystem does not have the feature enabled.\n");
+		goto read_super_error;
+	}
 
 
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	if (status)
 	if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
 
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
 	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
 	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
 	 * heartbeat=none */
 	 * heartbeat=none */
 	if (bdev_read_only(sb->s_bdev)) {
 	if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
 	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
 	wake_up(&osb->osb_mount_event);
 	wake_up(&osb->osb_mount_event);
 
 
+	/* Now we can initialize quotas because we can afford to wait
+	 * for cluster locks recovery now. That also means that truncation
+	 * log recovery can happen but that waits for proper quota setup */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		status = ocfs2_enable_quotas(osb);
+		if (status < 0) {
+			/* We have to err-out specially here because
+			 * s_root is already set */
+			mlog_errno(status);
+			atomic_set(&osb->vol_state, VOLUME_DISABLED);
+			wake_up(&osb->osb_mount_event);
+			mlog_exit(status);
+			return status;
+		}
+	}
+
+	ocfs2_complete_quota_recovery(osb);
+
+	/* Now we wake up again for processes waiting for quotas */
+	atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+	wake_up(&osb->osb_mount_event);
+
 	mlog_exit(status);
 	mlog_exit(status);
 	return status;
 	return status;
 
 
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_inode64:
 		case Opt_inode64:
 			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
 			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
 			break;
 			break;
+		case Opt_usrquota:
+			/* We check only on remount, otherwise features
+			 * aren't yet initialized. */
+			if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+			    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+				mlog(ML_ERROR, "User quota requested but "
+				     "filesystem feature is not set\n");
+				status = 0;
+				goto bail;
+			}
+			mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+			break;
+		case Opt_grpquota:
+			if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+			    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+				mlog(ML_ERROR, "Group quota requested but "
+				     "filesystem feature is not set\n");
+				status = 0;
+				goto bail;
+			}
+			mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+			break;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+		case Opt_acl:
+			mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+			break;
+		case Opt_noacl:
+			mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+			break;
+#else
+		case Opt_acl:
+		case Opt_noacl:
+			printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
+			break;
+#endif
 		default:
 		default:
 			mlog(ML_ERROR,
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
 			     "Unrecognized mount option \"%s\" "
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (osb->osb_cluster_stack[0])
 	if (osb->osb_cluster_stack[0])
 		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
 		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
 			   osb->osb_cluster_stack);
 			   osb->osb_cluster_stack);
+	if (opts & OCFS2_MOUNT_USRQUOTA)
+		seq_printf(s, ",usrquota");
+	if (opts & OCFS2_MOUNT_GRPQUOTA)
+		seq_printf(s, ",grpquota");
 
 
 	if (opts & OCFS2_MOUNT_NOUSERXATTR)
 	if (opts & OCFS2_MOUNT_NOUSERXATTR)
 		seq_printf(s, ",nouser_xattr");
 		seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (opts & OCFS2_MOUNT_INODE64)
 	if (opts & OCFS2_MOUNT_INODE64)
 		seq_printf(s, ",inode64");
 		seq_printf(s, ",inode64");
 
 
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	if (opts & OCFS2_MOUNT_POSIX_ACL)
+		seq_printf(s, ",acl");
+	else
+		seq_printf(s, ",noacl");
+#endif
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
 		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
 		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
 	}
 	}
 
 
+	status = ocfs2_quota_setup();
+	if (status)
+		goto leave;
+
 	ocfs2_set_locking_protocol();
 	ocfs2_set_locking_protocol();
 
 
+	status = register_quota_format(&ocfs2_quota_format);
 leave:
 leave:
 	if (status < 0) {
 	if (status < 0) {
+		ocfs2_quota_shutdown();
 		ocfs2_free_mem_caches();
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
 		exit_ocfs2_uptodate_cache();
 	}
 	}
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
 {
 {
 	mlog_entry_void();
 	mlog_entry_void();
 
 
+	ocfs2_quota_shutdown();
+
 	if (ocfs2_wq) {
 	if (ocfs2_wq) {
 		flush_workqueue(ocfs2_wq);
 		flush_workqueue(ocfs2_wq);
 		destroy_workqueue(ocfs2_wq);
 		destroy_workqueue(ocfs2_wq);
 	}
 	}
 
 
+	unregister_quota_format(&ocfs2_quota_format);
+
 	debugfs_remove(ocfs2_debugfs_root);
 	debugfs_remove(ocfs2_debugfs_root);
 
 
 	ocfs2_free_mem_caches();
 	ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
 						SLAB_MEM_SPREAD),
 				       ocfs2_inode_init_once);
 				       ocfs2_inode_init_once);
-	if (!ocfs2_inode_cachep)
+	ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+					sizeof(struct ocfs2_dquot),
+					0,
+					(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					NULL);
+	ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+					sizeof(struct ocfs2_quota_chunk),
+					0,
+					(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+					NULL);
+	if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+	    !ocfs2_qf_chunk_cachep) {
+		if (ocfs2_inode_cachep)
+			kmem_cache_destroy(ocfs2_inode_cachep);
+		if (ocfs2_dquot_cachep)
+			kmem_cache_destroy(ocfs2_dquot_cachep);
+		if (ocfs2_qf_chunk_cachep)
+			kmem_cache_destroy(ocfs2_qf_chunk_cachep);
 		return -ENOMEM;
 		return -ENOMEM;
+	}
 
 
 	return 0;
 	return 0;
 }
 }
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
 {
 {
 	if (ocfs2_inode_cachep)
 	if (ocfs2_inode_cachep)
 		kmem_cache_destroy(ocfs2_inode_cachep);
 		kmem_cache_destroy(ocfs2_inode_cachep);
-
 	ocfs2_inode_cachep = NULL;
 	ocfs2_inode_cachep = NULL;
+
+	if (ocfs2_dquot_cachep)
+		kmem_cache_destroy(ocfs2_dquot_cachep);
+	ocfs2_dquot_cachep = NULL;
+
+	if (ocfs2_qf_chunk_cachep)
+		kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+	ocfs2_qf_chunk_cachep = NULL;
 }
 }
 
 
 static int ocfs2_get_sector(struct super_block *sb,
 static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	osb = OCFS2_SB(sb);
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 	BUG_ON(!osb);
 
 
+	ocfs2_disable_quotas(osb);
+
 	ocfs2_shutdown_local_alloc(osb);
 	ocfs2_shutdown_local_alloc(osb);
 
 
 	ocfs2_truncate_log_shutdown(osb);
 	ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
 	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_qcop = &ocfs2_quotactl_ops;
+	sb->dq_op = &ocfs2_quota_operations;
 	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
 	sb->s_flags |= MS_NOATIME;
@@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 
 
 	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
 	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
 		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
 		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+		/* We have to do a raw check of the feature here */
+		if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+		    OCFS2_FEATURE_INCOMPAT_META_ECC) {
+			status = ocfs2_block_check_validate(bh->b_data,
+							    bh->b_size,
+							    &di->i_check);
+			if (status)
+				goto out;
+		}
 		status = -EINVAL;
 		status = -EINVAL;
 		if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
 		if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
 			mlog(ML_ERROR, "found superblock with incorrect block "
 			mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 		}
 		}
 	}
 	}
 
 
+out:
 	mlog_exit(status);
 	mlog_exit(status);
 	return status;
 	return status;
 }
 }

+ 1 - 1
fs/ocfs2/symlink.c

@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 
 
 	mlog_entry_void();
 	mlog_entry_void();
 
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
+	status = ocfs2_read_inode_block(inode, bh);
 	if (status < 0) {
 	if (status < 0) {
 		mlog_errno(status);
 		mlog_errno(status);
 		link = ERR_PTR(status);
 		link = ERR_PTR(status);

Plik diff jest za duży
+ 495 - 212
fs/ocfs2/xattr.c


+ 45 - 0
fs/ocfs2/xattr.h

@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
 	OCFS2_XATTR_MAX
 	OCFS2_XATTR_MAX
 };
 };
 
 
+struct ocfs2_security_xattr_info {
+	int enable;
+	char *name;
+	void *value;
+	size_t value_len;
+};
+
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
+extern struct xattr_handler ocfs2_xattr_security_handler;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern struct xattr_handler ocfs2_xattr_acl_default_handler;
+#endif
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+			   const char *, void *, size_t);
 int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
 int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
 		    size_t, int);
 		    size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+			   int, const char *, const void *, size_t, int,
+			   struct ocfs2_alloc_context *,
+			   struct ocfs2_alloc_context *);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+			    struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+			    struct buffer_head *,
+			    struct ocfs2_security_xattr_info *,
+			    struct ocfs2_alloc_context *,
+			    struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+			     struct ocfs2_security_xattr_info *,
+			     int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+			  int, struct ocfs2_security_xattr_info *,
+			  int *, int *, struct ocfs2_alloc_context **);
+
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block.  Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal.  Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf {
+	struct buffer_head		*vb_bh;
+	ocfs2_journal_access_func	vb_access;
+	struct ocfs2_xattr_value_root	*vb_xv;
+};
+
 
 
 #endif /* OCFS2_XATTR_H */
 #endif /* OCFS2_XATTR_H */

+ 7 - 4
fs/quota.c

@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 		case Q_SETQUOTA:
 		case Q_SETQUOTA:
 		case Q_GETQUOTA:
 		case Q_GETQUOTA:
 			/* This is just informative test so we are satisfied without a lock */
 			/* This is just informative test so we are satisfied without a lock */
-			if (!sb_has_quota_enabled(sb, type))
+			if (!sb_has_quota_active(sb, type))
 				return -ESRCH;
 				return -ESRCH;
 	}
 	}
 
 
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	int cnt;
 	int cnt;
 
 
 	sb->s_qcop->quota_sync(sb, type);
 	sb->s_qcop->quota_sync(sb, type);
+
+	if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
+		return;
 	/* This is not very clever (and fast) but currently I don't know about
 	/* This is not very clever (and fast) but currently I don't know about
 	 * any other simple way of getting quota data to disk and we must get
 	 * any other simple way of getting quota data to disk and we must get
 	 * them there for userspace to be visible... */
 	 * them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 		if (type != -1 && cnt != type)
 			continue;
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 			continue;
 		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
 		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 			if (type != -1 && type != cnt)
 			if (type != -1 && type != cnt)
 				continue;
 				continue;
-			if (!sb_has_quota_enabled(sb, cnt))
+			if (!sb_has_quota_active(sb, cnt))
 				continue;
 				continue;
 			if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
 			if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
 			    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
 			    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 			__u32 fmt;
 			__u32 fmt;
 
 
 			down_read(&sb_dqopt(sb)->dqptr_sem);
 			down_read(&sb_dqopt(sb)->dqptr_sem);
-			if (!sb_has_quota_enabled(sb, type)) {
+			if (!sb_has_quota_active(sb, type)) {
 				up_read(&sb_dqopt(sb)->dqptr_sem);
 				up_read(&sb_dqopt(sb)->dqptr_sem);
 				return -ESRCH;
 				return -ESRCH;
 			}
 			}

+ 645 - 0
fs/quota_tree.c

@@ -0,0 +1,645 @@
+/*
+ *	vfsv0 quota IO operations on file
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/dqblk_v2.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+
+#include <asm/byteorder.h>
+
+#include "quota_tree.h"
+
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Quota trie support");
+MODULE_LICENSE("GPL");
+
+#define __QUOTA_QT_PARANOIA
+
+typedef char *dqbuf_t;
+
+static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
+{
+	unsigned int epb = info->dqi_usable_bs >> 2;
+
+	depth = info->dqi_qtree_depth - depth - 1;
+	while (depth--)
+		id /= epb;
+	return id % epb;
+}
+
+/* Number of entries in one blocks */
+static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+{
+	return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
+	       / info->dqi_entry_size;
+}
+
+static dqbuf_t getdqbuf(size_t size)
+{
+	dqbuf_t buf = kmalloc(size, GFP_NOFS);
+	if (!buf)
+		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
+	return buf;
+}
+
+static inline void freedqbuf(dqbuf_t buf)
+{
+	kfree(buf);
+}
+
+static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	memset(buf, 0, info->dqi_usable_bs);
+	return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
+	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+
+static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
+	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+
+/* Remove empty block from list and return it */
+static int get_free_dqblk(struct qtree_mem_dqinfo *info)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int ret, blk;
+
+	if (!buf)
+		return -ENOMEM;
+	if (info->dqi_free_blk) {
+		blk = info->dqi_free_blk;
+		ret = read_blk(info, blk, buf);
+		if (ret < 0)
+			goto out_buf;
+		info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
+	}
+	else {
+		memset(buf, 0, info->dqi_usable_bs);
+		/* Assure block allocation... */
+		ret = write_blk(info, info->dqi_blocks, buf);
+		if (ret < 0)
+			goto out_buf;
+		blk = info->dqi_blocks++;
+	}
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	ret = blk;
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Insert empty block to the list */
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int err;
+
+	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
+	dh->dqdh_prev_free = cpu_to_le32(0);
+	dh->dqdh_entries = cpu_to_le16(0);
+	err = write_blk(info, blk, buf);
+	if (err < 0)
+		return err;
+	info->dqi_free_blk = blk;
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	return 0;
+}
+
+/* Remove given block from the list of blocks with free entries */
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	uint nextblk = le32_to_cpu(dh->dqdh_next_free);
+	uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
+	int err;
+
+	if (!tmpbuf)
+		return -ENOMEM;
+	if (nextblk) {
+		err = read_blk(info, nextblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+							dh->dqdh_prev_free;
+		err = write_blk(info, nextblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	}
+	if (prevblk) {
+		err = read_blk(info, prevblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
+							dh->dqdh_next_free;
+		err = write_blk(info, prevblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	} else {
+		info->dqi_free_entry = nextblk;
+		mark_info_dirty(info->dqi_sb, info->dqi_type);
+	}
+	freedqbuf(tmpbuf);
+	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
+	/* No matter whether write succeeds block is out of list */
+	if (write_blk(info, blk, buf) < 0)
+		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
+	return 0;
+out_buf:
+	freedqbuf(tmpbuf);
+	return err;
+}
+
+/* Insert given block to the beginning of list with free entries */
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int err;
+
+	if (!tmpbuf)
+		return -ENOMEM;
+	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
+	dh->dqdh_prev_free = cpu_to_le32(0);
+	err = write_blk(info, blk, buf);
+	if (err < 0)
+		goto out_buf;
+	if (info->dqi_free_entry) {
+		err = read_blk(info, info->dqi_free_entry, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+							cpu_to_le32(blk);
+		err = write_blk(info, info->dqi_free_entry, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	}
+	freedqbuf(tmpbuf);
+	info->dqi_free_entry = blk;
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	return 0;
+out_buf:
+	freedqbuf(tmpbuf);
+	return err;
+}
+
+/* Is the entry in the block free? */
+int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
+{
+	int i;
+
+	for (i = 0; i < info->dqi_entry_size; i++)
+		if (disk[i])
+			return 0;
+	return 1;
+}
+EXPORT_SYMBOL(qtree_entry_unused);
+
+/* Find space for dquot */
+static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+			      struct dquot *dquot, int *err)
+{
+	uint blk, i;
+	struct qt_disk_dqdbheader *dh;
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *ddquot;
+
+	*err = 0;
+	if (!buf) {
+		*err = -ENOMEM;
+		return 0;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	if (info->dqi_free_entry) {
+		blk = info->dqi_free_entry;
+		*err = read_blk(info, blk, buf);
+		if (*err < 0)
+			goto out_buf;
+	} else {
+		blk = get_free_dqblk(info);
+		if ((int)blk < 0) {
+			*err = blk;
+			freedqbuf(buf);
+			return 0;
+		}
+		memset(buf, 0, info->dqi_usable_bs);
+		/* This is enough as block is already zeroed and entry list is empty... */
+		info->dqi_free_entry = blk;
+		mark_info_dirty(dquot->dq_sb, dquot->dq_type);
+	}
+	/* Block will be full? */
+	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
+		*err = remove_free_dqentry(info, buf, blk);
+		if (*err < 0) {
+			printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
+			       "remove block (%u) from entry free list.\n",
+			       blk);
+			goto out_buf;
+		}
+	}
+	le16_add_cpu(&dh->dqdh_entries, 1);
+	/* Find free structure in block */
+	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+	     i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
+	     i++, ddquot += info->dqi_entry_size);
+#ifdef __QUOTA_QT_PARANOIA
+	if (i == qtree_dqstr_in_blk(info)) {
+		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
+				"but it shouldn't.\n");
+		*err = -EIO;
+		goto out_buf;
+	}
+#endif
+	*err = write_blk(info, blk, buf);
+	if (*err < 0) {
+		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
+				"data block %u.\n", blk);
+		goto out_buf;
+	}
+	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
+			sizeof(struct qt_disk_dqdbheader) +
+			i * info->dqi_entry_size;
+	freedqbuf(buf);
+	return blk;
+out_buf:
+	freedqbuf(buf);
+	return 0;
+}
+
+/* Insert reference to structure into the trie */
+static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			  uint *treeblk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0, newson = 0, newact = 0;
+	__le32 *ref;
+	uint newblk;
+
+	if (!buf)
+		return -ENOMEM;
+	if (!*treeblk) {
+		ret = get_free_dqblk(info);
+		if (ret < 0)
+			goto out_buf;
+		*treeblk = ret;
+		memset(buf, 0, info->dqi_usable_bs);
+		newact = 1;
+	} else {
+		ret = read_blk(info, *treeblk, buf);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Can't read tree quota block "
+					"%u.\n", *treeblk);
+			goto out_buf;
+		}
+	}
+	ref = (__le32 *)buf;
+	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (!newblk)
+		newson = 1;
+	if (depth == info->dqi_qtree_depth - 1) {
+#ifdef __QUOTA_QT_PARANOIA
+		if (newblk) {
+			printk(KERN_ERR "VFS: Inserting already present quota "
+					"entry (block %u).\n",
+			       le32_to_cpu(ref[get_index(info,
+						dquot->dq_id, depth)]));
+			ret = -EIO;
+			goto out_buf;
+		}
+#endif
+		newblk = find_free_dqentry(info, dquot, &ret);
+	} else {
+		ret = do_insert_tree(info, dquot, &newblk, depth+1);
+	}
+	if (newson && ret >= 0) {
+		ref[get_index(info, dquot->dq_id, depth)] =
+							cpu_to_le32(newblk);
+		ret = write_blk(info, *treeblk, buf);
+	} else if (newact && ret < 0) {
+		put_free_dqblk(info, buf, *treeblk);
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Wrapper for inserting quota structure into tree */
+static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot)
+{
+	int tmp = QT_TREEOFF;
+	return do_insert_tree(info, dquot, &tmp, 0);
+}
+
+/*
+ *	We don't have to be afraid of deadlocks as we never have quotas on quota files...
+ */
+int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	int type = dquot->dq_type;
+	struct super_block *sb = dquot->dq_sb;
+	ssize_t ret;
+	dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
+
+	if (!ddquot)
+		return -ENOMEM;
+
+	/* dq_off is guarded by dqio_mutex */
+	if (!dquot->dq_off) {
+		ret = dq_insert_tree(info, dquot);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Error %zd occurred while "
+					"creating quota.\n", ret);
+			freedqbuf(ddquot);
+			return ret;
+		}
+	}
+	spin_lock(&dq_data_lock);
+	info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
+	spin_unlock(&dq_data_lock);
+	ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
+					info->dqi_entry_size, dquot->dq_off);
+	if (ret != info->dqi_entry_size) {
+		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+		       sb->s_id);
+		if (ret >= 0)
+			ret = -ENOSPC;
+	} else {
+		ret = 0;
+	}
+	dqstats.writes++;
+	freedqbuf(ddquot);
+
+	return ret;
+}
+EXPORT_SYMBOL(qtree_write_dquot);
+
+/* Free dquot entry in data block */
+static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			uint blk)
+{
+	struct qt_disk_dqdbheader *dh;
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0;
+
+	if (!buf)
+		return -ENOMEM;
+	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
+		printk(KERN_ERR "VFS: Quota structure has offset to other "
+		  "block (%u) than it should (%u).\n", blk,
+		  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		goto out_buf;
+	}
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+		goto out_buf;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	le16_add_cpu(&dh->dqdh_entries, -1);
+	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
+		ret = remove_free_dqentry(info, buf, blk);
+		if (ret >= 0)
+			ret = put_free_dqblk(info, buf, blk);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
+			  "to free list.\n", blk);
+			goto out_buf;
+		}
+	} else {
+		memset(buf +
+		       (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
+		       0, info->dqi_entry_size);
+		if (le16_to_cpu(dh->dqdh_entries) ==
+		    qtree_dqstr_in_blk(info) - 1) {
+			/* Insert will write block itself */
+			ret = insert_free_dqentry(info, buf, blk);
+			if (ret < 0) {
+				printk(KERN_ERR "VFS: Can't insert quota data "
+				       "block (%u) to free entry list.\n", blk);
+				goto out_buf;
+			}
+		} else {
+			ret = write_blk(info, blk, buf);
+			if (ret < 0) {
+				printk(KERN_ERR "VFS: Can't write quota data "
+				  "block %u\n", blk);
+				goto out_buf;
+			}
+		}
+	}
+	dquot->dq_off = 0;	/* Quota is now unattached */
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Remove reference to dquot from tree */
+static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+		       uint *blk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0;
+	uint newblk;
+	__le32 *ref = (__le32 *)buf;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, *blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+		goto out_buf;
+	}
+	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (depth == info->dqi_qtree_depth - 1) {
+		ret = free_dqentry(info, dquot, newblk);
+		newblk = 0;
+	} else {
+		ret = remove_tree(info, dquot, &newblk, depth+1);
+	}
+	if (ret >= 0 && !newblk) {
+		int i;
+		ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
+		/* Block got empty? */
+		for (i = 0;
+		     i < (info->dqi_usable_bs >> 2) && !ref[i];
+		     i++);
+		/* Don't put the root block into the free block list */
+		if (i == (info->dqi_usable_bs >> 2)
+		    && *blk != QT_TREEOFF) {
+			put_free_dqblk(info, buf, *blk);
+			*blk = 0;
+		} else {
+			ret = write_blk(info, *blk, buf);
+			if (ret < 0)
+				printk(KERN_ERR "VFS: Can't write quota tree "
+				  "block %u.\n", *blk);
+		}
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Delete dquot from tree */
+int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	uint tmp = QT_TREEOFF;
+
+	if (!dquot->dq_off)	/* Even not allocated? */
+		return 0;
+	return remove_tree(info, dquot, &tmp, 0);
+}
+EXPORT_SYMBOL(qtree_delete_dquot);
+
+/* Find entry in block */
+static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot, uint blk)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	int i;
+	char *ddquot;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		goto out_buf;
+	}
+	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+	     i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
+	     i++, ddquot += info->dqi_entry_size);
+	if (i == qtree_dqstr_in_blk(info)) {
+		printk(KERN_ERR "VFS: Quota for id %u referenced "
+		  "but not present.\n", dquot->dq_id);
+		ret = -EIO;
+		goto out_buf;
+	} else {
+		ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
+		  qt_disk_dqdbheader) + i * info->dqi_entry_size;
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Find entry for given id in the tree */
+static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
+				struct dquot *dquot, uint blk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	__le32 *ref = (__le32 *)buf;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		goto out_buf;
+	}
+	ret = 0;
+	blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (!blk)	/* No reference? */
+		goto out_buf;
+	if (depth < info->dqi_qtree_depth - 1)
+		ret = find_tree_dqentry(info, dquot, blk, depth+1);
+	else
+		ret = find_block_dqentry(info, dquot, blk);
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Find entry for given id in the tree - wrapper function */
+static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
+				  struct dquot *dquot)
+{
+	return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+}
+
+int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	int type = dquot->dq_type;
+	struct super_block *sb = dquot->dq_sb;
+	loff_t offset;
+	dqbuf_t ddquot;
+	int ret = 0;
+
+#ifdef __QUOTA_QT_PARANOIA
+	/* Invalidated quota? */
+	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
+		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+		return -EIO;
+	}
+#endif
+	/* Do we know offset of the dquot entry in the quota file? */
+	if (!dquot->dq_off) {
+		offset = find_dqentry(info, dquot);
+		if (offset <= 0) {	/* Entry not present? */
+			if (offset < 0)
+				printk(KERN_ERR "VFS: Can't read quota "
+				  "structure for id %u.\n", dquot->dq_id);
+			dquot->dq_off = 0;
+			set_bit(DQ_FAKE_B, &dquot->dq_flags);
+			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+			ret = offset;
+			goto out;
+		}
+		dquot->dq_off = offset;
+	}
+	ddquot = getdqbuf(info->dqi_entry_size);
+	if (!ddquot)
+		return -ENOMEM;
+	ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
+				   info->dqi_entry_size, dquot->dq_off);
+	if (ret != info->dqi_entry_size) {
+		if (ret >= 0)
+			ret = -EIO;
+		printk(KERN_ERR "VFS: Error while reading quota "
+				"structure for id %u.\n", dquot->dq_id);
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+		freedqbuf(ddquot);
+		goto out;
+	}
+	spin_lock(&dq_data_lock);
+	info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
+	if (!dquot->dq_dqb.dqb_bhardlimit &&
+	    !dquot->dq_dqb.dqb_bsoftlimit &&
+	    !dquot->dq_dqb.dqb_ihardlimit &&
+	    !dquot->dq_dqb.dqb_isoftlimit)
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+	spin_unlock(&dq_data_lock);
+	freedqbuf(ddquot);
+out:
+	dqstats.reads++;
+	return ret;
+}
+EXPORT_SYMBOL(qtree_read_dquot);
+
+/* Check whether dquot should not be deleted. We know we are
+ * the only one operating on dquot (thanks to dq_lock) */
+int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
+		return qtree_delete_dquot(info, dquot);
+	return 0;
+}
+EXPORT_SYMBOL(qtree_release_dquot);

+ 25 - 0
fs/quota_tree.h

@@ -0,0 +1,25 @@
+/*
+ *	Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTA_TREE_H
+#define _LINUX_QUOTA_TREE_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ *  Structure of header of block with quota structures. It is padded to 16 bytes so
+ *  there will be space for exactly 21 quota-entries in a block
+ */
+struct qt_disk_dqdbheader {
+	__le32 dqdh_next_free;	/* Number of next block with free entry */
+	__le32 dqdh_prev_free;	/* Number of previous block with free entry */
+	__le16 dqdh_entries;	/* Number of valid entries in block */
+	__le16 dqdh_pad1;
+	__le32 dqdh_pad2;
+};
+
+#define QT_TREEOFF	1		/* Offset of tree in file in blocks */
+
+#endif /* _LINUX_QUOTAIO_TREE_H */

+ 21 - 7
fs/quota_v1.c

@@ -3,25 +3,39 @@
 #include <linux/quota.h>
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_v1.h>
 #include <linux/dqblk_v1.h>
-#include <linux/quotaio_v1.h>
 #include <linux/kernel.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/module.h>
 
 
 #include <asm/byteorder.h>
 #include <asm/byteorder.h>
 
 
+#include "quotaio_v1.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Old quota format support");
 MODULE_DESCRIPTION("Old quota format support");
 MODULE_LICENSE("GPL");
 MODULE_LICENSE("GPL");
 
 
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
+
 static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
 static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
 {
 {
 	m->dqb_ihardlimit = d->dqb_ihardlimit;
 	m->dqb_ihardlimit = d->dqb_ihardlimit;
 	m->dqb_isoftlimit = d->dqb_isoftlimit;
 	m->dqb_isoftlimit = d->dqb_isoftlimit;
 	m->dqb_curinodes = d->dqb_curinodes;
 	m->dqb_curinodes = d->dqb_curinodes;
-	m->dqb_bhardlimit = d->dqb_bhardlimit;
-	m->dqb_bsoftlimit = d->dqb_bsoftlimit;
-	m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS;
+	m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
+	m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
+	m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
 	m->dqb_itime = d->dqb_itime;
 	m->dqb_itime = d->dqb_itime;
 	m->dqb_btime = d->dqb_btime;
 	m->dqb_btime = d->dqb_btime;
 }
 }
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
 	d->dqb_ihardlimit = m->dqb_ihardlimit;
 	d->dqb_ihardlimit = m->dqb_ihardlimit;
 	d->dqb_isoftlimit = m->dqb_isoftlimit;
 	d->dqb_isoftlimit = m->dqb_isoftlimit;
 	d->dqb_curinodes = m->dqb_curinodes;
 	d->dqb_curinodes = m->dqb_curinodes;
-	d->dqb_bhardlimit = m->dqb_bhardlimit;
-	d->dqb_bsoftlimit = m->dqb_bsoftlimit;
-	d->dqb_curblocks = toqb(m->dqb_curspace);
+	d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
+	d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
 	d->dqb_itime = m->dqb_itime;
 	d->dqb_itime = m->dqb_itime;
 	d->dqb_btime = m->dqb_btime;
 	d->dqb_btime = m->dqb_btime;
 }
 }

+ 84 - 547
fs/quota_v2.c

@@ -6,7 +6,6 @@
 #include <linux/fs.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/mount.h>
 #include <linux/dqblk_v2.h>
 #include <linux/dqblk_v2.h>
-#include <linux/quotaio_v2.h>
 #include <linux/kernel.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/module.h>
@@ -15,16 +14,37 @@
 
 
 #include <asm/byteorder.h>
 #include <asm/byteorder.h>
 
 
+#include "quota_tree.h"
+#include "quotaio_v2.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Quota format v2 support");
 MODULE_DESCRIPTION("Quota format v2 support");
 MODULE_LICENSE("GPL");
 MODULE_LICENSE("GPL");
 
 
 #define __QUOTA_V2_PARANOIA
 #define __QUOTA_V2_PARANOIA
 
 
-typedef char *dqbuf_t;
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2_is_id(void *dp, struct dquot *dquot);
+
+static struct qtree_fmt_operations v2_qtree_ops = {
+	.mem2disk_dqblk = v2_mem2diskdqb,
+	.disk2mem_dqblk = v2_disk2memdqb,
+	.is_id = v2_is_id,
+};
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
 
 
-#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
+static inline qsize_t v2_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
 
 
 /* Check whether given file is really vfsv0 quotafile */
 /* Check whether given file is really vfsv0 quotafile */
 static int v2_check_quota_file(struct super_block *sb, int type)
 static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 static int v2_read_file_info(struct super_block *sb, int type)
 static int v2_read_file_info(struct super_block *sb, int type)
 {
 {
 	struct v2_disk_dqinfo dinfo;
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo;
 	ssize_t size;
 	ssize_t size;
 
 
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
 			sb->s_id);
 			sb->s_id);
 		return -1;
 		return -1;
 	}
 	}
+	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+	if (!info->dqi_priv) {
+		printk(KERN_WARNING
+		       "Not enough memory for quota information structure.\n");
+		return -1;
+	}
+	qinfo = info->dqi_priv;
 	/* limits are stored as unsigned 32-bit data */
 	/* limits are stored as unsigned 32-bit data */
 	info->dqi_maxblimit = 0xffffffff;
 	info->dqi_maxblimit = 0xffffffff;
 	info->dqi_maxilimit = 0xffffffff;
 	info->dqi_maxilimit = 0xffffffff;
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
 	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
-	info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
-	info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
-	info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_sb = sb;
+	qinfo->dqi_type = type;
+	qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+	qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+	qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+	qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+	qinfo->dqi_ops = &v2_qtree_ops;
 	return 0;
 	return 0;
 }
 }
 
 
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
 static int v2_write_file_info(struct super_block *sb, int type)
 static int v2_write_file_info(struct super_block *sb, int type)
 {
 {
 	struct v2_disk_dqinfo dinfo;
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
 	ssize_t size;
 	ssize_t size;
 
 
 	spin_lock(&dq_data_lock);
 	spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
 	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
 	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
 	spin_unlock(&dq_data_lock);
 	spin_unlock(&dq_data_lock);
-	dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks);
-	dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry);
+	dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
 	if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	return 0;
 	return 0;
 }
 }
 
 
-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
+static void v2_disk2memdqb(struct dquot *dquot, void *dp)
 {
 {
+	struct v2_disk_dqblk *d = dp, empty;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
 	m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
 	m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
 	m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
 	m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
 	m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
 	m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
 	m->dqb_itime = le64_to_cpu(d->dqb_itime);
 	m->dqb_itime = le64_to_cpu(d->dqb_itime);
-	m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
-	m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+	m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
+	m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
 	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
 	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
 	m->dqb_btime = le64_to_cpu(d->dqb_btime);
 	m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	/* We need to escape back all-zero structure */
+	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+	empty.dqb_itime = cpu_to_le64(1);
+	if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+		m->dqb_itime = 0;
 }
 }
 
 
-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
 {
 {
+	struct v2_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
 	d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
 	d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
 	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
 	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
-	d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
-	d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+	d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
-	d->dqb_id = cpu_to_le32(id);
-}
-
-static dqbuf_t getdqbuf(void)
-{
-	dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
-	if (!buf)
-		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
-	return buf;
-}
-
-static inline void freedqbuf(dqbuf_t buf)
-{
-	kfree(buf);
-}
-
-static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	memset(buf, 0, V2_DQBLKSIZE);
-	return sb->s_op->quota_read(sb, type, (char *)buf,
-	       V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	return sb->s_op->quota_write(sb, type, (char *)buf,
-	       V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-/* Remove empty block from list and return it */
-static int get_free_dqblk(struct super_block *sb, int type)
-{
-	dqbuf_t buf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int ret, blk;
-
-	if (!buf)
-		return -ENOMEM;
-	if (info->u.v2_i.dqi_free_blk) {
-		blk = info->u.v2_i.dqi_free_blk;
-		if ((ret = read_blk(sb, type, blk, buf)) < 0)
-			goto out_buf;
-		info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
-	}
-	else {
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* Assure block allocation... */
-		if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
-			goto out_buf;
-		blk = info->u.v2_i.dqi_blocks++;
-	}
-	mark_info_dirty(sb, type);
-	ret = blk;
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Insert empty block to the list */
-static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	dh->dqdh_entries = cpu_to_le16(0);
-	info->u.v2_i.dqi_free_blk = blk;
-	mark_info_dirty(sb, type);
-	/* Some strange block. We had better leave it... */
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		return err;
-	return 0;
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	if (qtree_entry_unused(info, dp))
+		d->dqb_itime = cpu_to_le64(1);
 }
 }
 
 
-/* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
+static int v2_is_id(void *dp, struct dquot *dquot)
 {
 {
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
-	int err;
+	struct v2_disk_dqblk *d = dp;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
 
 
-	if (!tmpbuf)
-		return -ENOMEM;
-	if (nextblk) {
-		if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
-		if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	if (prevblk) {
-		if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
-		if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	else {
-		info->u.v2_i.dqi_free_entry = nextblk;
-		mark_info_dirty(sb, type);
-	}
-	freedqbuf(tmpbuf);
-	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
-	/* No matter whether write succeeds block is out of list */
-	if (write_blk(sb, type, blk, buf) < 0)
-		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	if (!tmpbuf)
-		return -ENOMEM;
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		goto out_buf;
-	if (info->u.v2_i.dqi_free_entry) {
-		if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
-		if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	freedqbuf(tmpbuf);
-	info->u.v2_i.dqi_free_entry = blk;
-	mark_info_dirty(sb, type);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Find space for dquot */
-static uint find_free_dqentry(struct dquot *dquot, int *err)
-{
-	struct super_block *sb = dquot->dq_sb;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
-	uint blk, i;
-	struct v2_disk_dqdbheader *dh;
-	struct v2_disk_dqblk *ddquot;
-	struct v2_disk_dqblk fakedquot;
-	dqbuf_t buf;
-
-	*err = 0;
-	if (!(buf = getdqbuf())) {
-		*err = -ENOMEM;
+	if (qtree_entry_unused(info, dp))
 		return 0;
 		return 0;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	ddquot = GETENTRIES(buf);
-	if (info->u.v2_i.dqi_free_entry) {
-		blk = info->u.v2_i.dqi_free_entry;
-		if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
-			goto out_buf;
-	}
-	else {
-		blk = get_free_dqblk(sb, dquot->dq_type);
-		if ((int)blk < 0) {
-			*err = blk;
-			freedqbuf(buf);
-			return 0;
-		}
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* This is enough as block is already zeroed and entry list is empty... */
-		info->u.v2_i.dqi_free_entry = blk;
-		mark_info_dirty(sb, dquot->dq_type);
-	}
-	if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK)	/* Block will be full? */
-		if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
-			goto out_buf;
-		}
-	le16_add_cpu(&dh->dqdh_entries, 1);
-	memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-	/* Find free structure in block */
-	for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-#ifdef __QUOTA_V2_PARANOIA
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
-		*err = -EIO;
-		goto out_buf;
-	}
-#endif
-	if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
-		goto out_buf;
-	}
-	dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
-	freedqbuf(buf);
-	return blk;
-out_buf:
-	freedqbuf(buf);
-	return 0;
-}
-
-/* Insert reference to structure into the trie */
-static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	dqbuf_t buf;
-	int ret = 0, newson = 0, newact = 0;
-	__le32 *ref;
-	uint newblk;
-
-	if (!(buf = getdqbuf()))
-		return -ENOMEM;
-	if (!*treeblk) {
-		ret = get_free_dqblk(sb, dquot->dq_type);
-		if (ret < 0)
-			goto out_buf;
-		*treeblk = ret;
-		memset(buf, 0, V2_DQBLKSIZE);
-		newact = 1;
-	}
-	else {
-		if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
-			printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
-			goto out_buf;
-		}
-	}
-	ref = (__le32 *)buf;
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (!newblk)
-		newson = 1;
-	if (depth == V2_DQTREEDEPTH-1) {
-#ifdef __QUOTA_V2_PARANOIA
-		if (newblk) {
-			printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
-			ret = -EIO;
-			goto out_buf;
-		}
-#endif
-		newblk = find_free_dqentry(dquot, &ret);
-	}
-	else
-		ret = do_insert_tree(dquot, &newblk, depth+1);
-	if (newson && ret >= 0) {
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
-		ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
-	}
-	else if (newact && ret < 0)
-		put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
 }
 }
 
 
-/* Wrapper for inserting quota structure into tree */
-static inline int dq_insert_tree(struct dquot *dquot)
+static int v2_read_dquot(struct dquot *dquot)
 {
 {
-	int tmp = V2_DQTREEOFF;
-	return do_insert_tree(dquot, &tmp, 0);
+	return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 }
 
 
-/*
- *	We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */
 static int v2_write_dquot(struct dquot *dquot)
 static int v2_write_dquot(struct dquot *dquot)
 {
 {
-	int type = dquot->dq_type;
-	ssize_t ret;
-	struct v2_disk_dqblk ddquot, empty;
-
-	/* dq_off is guarded by dqio_mutex */
-	if (!dquot->dq_off)
-		if ((ret = dq_insert_tree(dquot)) < 0) {
-			printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
-			return ret;
-		}
-	spin_lock(&dq_data_lock);
-	mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
-	/* Argh... We may need to write structure full of zeroes but that would be
-	 * treated as an empty place by the rest of the code. Format change would
-	 * be definitely cleaner but the problems probably are not worth it */
-	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-	if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-		ddquot.dqb_itime = cpu_to_le64(1);
-	spin_unlock(&dq_data_lock);
-	ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
-	      (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
-	if (ret != sizeof(struct v2_disk_dqblk)) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
-		if (ret >= 0)
-			ret = -ENOSPC;
-	}
-	else
-		ret = 0;
-	dqstats.writes++;
-
-	return ret;
+	return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 }
 
 
-/* Free dquot entry in data block */
-static int free_dqentry(struct dquot *dquot, uint blk)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	struct v2_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-
-	if (!buf)
-		return -ENOMEM;
-	if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
-		printk(KERN_ERR "VFS: Quota structure has offset to other "
-		  "block (%u) than it should (%u).\n", blk,
-		  (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
-		goto out_buf;
-	}
-	if ((ret = read_blk(sb, type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
-		goto out_buf;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	le16_add_cpu(&dh->dqdh_entries, -1);
-	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
-		if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
-		    (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
-			  "to free list.\n", blk);
-			goto out_buf;
-		}
-	}
-	else {
-		memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
-		  sizeof(struct v2_disk_dqblk));
-		if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
-			/* Insert will write block itself */
-			if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
-				printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
-				goto out_buf;
-			}
-		}
-		else
-			if ((ret = write_blk(sb, type, blk, buf)) < 0) {
-				printk(KERN_ERR "VFS: Can't write quota data "
-				  "block %u\n", blk);
-				goto out_buf;
-			}
-	}
-	dquot->dq_off = 0;	/* Quota is now unattached */
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Remove reference to dquot from tree */
-static int remove_tree(struct dquot *dquot, uint *blk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-	uint newblk;
-	__le32 *ref = (__le32 *)buf;
-	
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
-		goto out_buf;
-	}
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (depth == V2_DQTREEDEPTH-1) {
-		ret = free_dqentry(dquot, newblk);
-		newblk = 0;
-	}
-	else
-		ret = remove_tree(dquot, &newblk, depth+1);
-	if (ret >= 0 && !newblk) {
-		int i;
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
-		for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++);	/* Block got empty? */
-		/* Don't put the root block into the free block list */
-		if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
-			put_free_dqblk(sb, type, buf, *blk);
-			*blk = 0;
-		}
-		else
-			if ((ret = write_blk(sb, type, *blk, buf)) < 0)
-				printk(KERN_ERR "VFS: Can't write quota tree "
-				  "block %u.\n", *blk);
-	}
-out_buf:
-	freedqbuf(buf);
-	return ret;	
-}
-
-/* Delete dquot from tree */
-static int v2_delete_dquot(struct dquot *dquot)
-{
-	uint tmp = V2_DQTREEOFF;
-
-	if (!dquot->dq_off)	/* Even not allocated? */
-		return 0;
-	return remove_tree(dquot, &tmp, 0);
-}
-
-/* Find entry in block */
-static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
-{
-	dqbuf_t buf = getdqbuf();
-	loff_t ret = 0;
-	int i;
-	struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-		goto out_buf;
-	}
-	if (dquot->dq_id)
-		for (i = 0; i < V2_DQSTRINBLK &&
-		     le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
-	else {	/* ID 0 as a bit more complicated searching... */
-		struct v2_disk_dqblk fakedquot;
-
-		memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-		for (i = 0; i < V2_DQSTRINBLK; i++)
-			if (!le32_to_cpu(ddquot[i].dqb_id) &&
-			    memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
-				break;
-	}
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: Quota for id %u referenced "
-		  "but not present.\n", dquot->dq_id);
-		ret = -EIO;
-		goto out_buf;
-	}
-	else
-		ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
-		  v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Find entry for given id in the tree */
-static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
-{
-	dqbuf_t buf = getdqbuf();
-	loff_t ret = 0;
-	__le32 *ref = (__le32 *)buf;
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-		goto out_buf;
-	}
-	ret = 0;
-	blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (!blk)	/* No reference? */
-		goto out_buf;
-	if (depth < V2_DQTREEDEPTH-1)
-		ret = find_tree_dqentry(dquot, blk, depth+1);
-	else
-		ret = find_block_dqentry(dquot, blk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Find entry for given id in the tree - wrapper function */
-static inline loff_t find_dqentry(struct dquot *dquot)
-{
-	return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
-}
-
-static int v2_read_dquot(struct dquot *dquot)
+static int v2_release_dquot(struct dquot *dquot)
 {
 {
-	int type = dquot->dq_type;
-	loff_t offset;
-	struct v2_disk_dqblk ddquot, empty;
-	int ret = 0;
-
-#ifdef __QUOTA_V2_PARANOIA
-	/* Invalidated quota? */
-	if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
-		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
-		return -EIO;
-	}
-#endif
-	offset = find_dqentry(dquot);
-	if (offset <= 0) {	/* Entry not present? */
-		if (offset < 0)
-			printk(KERN_ERR "VFS: Can't read quota "
-			  "structure for id %u.\n", dquot->dq_id);
-		dquot->dq_off = 0;
-		set_bit(DQ_FAKE_B, &dquot->dq_flags);
-		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
-		ret = offset;
-	}
-	else {
-		dquot->dq_off = offset;
-		if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
-		    (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
-		    != sizeof(struct v2_disk_dqblk)) {
-			if (ret >= 0)
-				ret = -EIO;
-			printk(KERN_ERR "VFS: Error while reading quota "
-			  "structure for id %u.\n", dquot->dq_id);
-			memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
-		}
-		else {
-			ret = 0;
-			/* We need to escape back all-zero structure */
-			memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-			empty.dqb_itime = cpu_to_le64(1);
-			if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-				ddquot.dqb_itime = 0;
-		}
-		disk2memdqb(&dquot->dq_dqb, &ddquot);
-		if (!dquot->dq_dqb.dqb_bhardlimit &&
-			!dquot->dq_dqb.dqb_bsoftlimit &&
-			!dquot->dq_dqb.dqb_ihardlimit &&
-			!dquot->dq_dqb.dqb_isoftlimit)
-			set_bit(DQ_FAKE_B, &dquot->dq_flags);
-	}
-	dqstats.reads++;
-
-	return ret;
+	return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 }
 
 
-/* Check whether dquot should not be deleted. We know we are
- * the only one operating on dquot (thanks to dq_lock) */
-static int v2_release_dquot(struct dquot *dquot)
+static int v2_free_file_info(struct super_block *sb, int type)
 {
 {
-	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
-		return v2_delete_dquot(dquot);
+	kfree(sb_dqinfo(sb, type)->dqi_priv);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
 	.check_quota_file	= v2_check_quota_file,
 	.check_quota_file	= v2_check_quota_file,
 	.read_file_info		= v2_read_file_info,
 	.read_file_info		= v2_read_file_info,
 	.write_file_info	= v2_write_file_info,
 	.write_file_info	= v2_write_file_info,
-	.free_file_info		= NULL,
+	.free_file_info		= v2_free_file_info,
 	.read_dqblk		= v2_read_dquot,
 	.read_dqblk		= v2_read_dquot,
 	.commit_dqblk		= v2_write_dquot,
 	.commit_dqblk		= v2_write_dquot,
 	.release_dqblk		= v2_release_dquot,
 	.release_dqblk		= v2_release_dquot,

+ 0 - 0
include/linux/quotaio_v1.h → fs/quotaio_v1.h


+ 7 - 26
include/linux/quotaio_v2.h → fs/quotaio_v2.h

@@ -21,6 +21,12 @@
 	0		/* GRPQUOTA */\
 	0		/* GRPQUOTA */\
 }
 }
 
 
+/* First generic header */
+struct v2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* File version */
+};
+
 /*
 /*
  * The following structure defines the format of the disk quota file
  * The following structure defines the format of the disk quota file
  * (as it appears on disk) - the file is a radix tree whose leaves point
  * (as it appears on disk) - the file is a radix tree whose leaves point
@@ -38,15 +44,6 @@ struct v2_disk_dqblk {
 	__le64 dqb_itime;	/* time limit for excessive inode use */
 	__le64 dqb_itime;	/* time limit for excessive inode use */
 };
 };
 
 
-/*
- * Here are header structures as written on disk and their in-memory copies
- */
-/* First generic header */
-struct v2_disk_dqheader {
-	__le32 dqh_magic;	/* Magic number identifying file */
-	__le32 dqh_version;	/* File version */
-};
-
 /* Header with type and version specific information */
 /* Header with type and version specific information */
 struct v2_disk_dqinfo {
 struct v2_disk_dqinfo {
 	__le32 dqi_bgrace;	/* Time before block soft limit becomes hard limit */
 	__le32 dqi_bgrace;	/* Time before block soft limit becomes hard limit */
@@ -57,23 +54,7 @@ struct v2_disk_dqinfo {
 	__le32 dqi_free_entry;	/* Number of block with at least one free entry */
 	__le32 dqi_free_entry;	/* Number of block with at least one free entry */
 };
 };
 
 
-/*
- *  Structure of header of block with quota structures. It is padded to 16 bytes so
- *  there will be space for exactly 21 quota-entries in a block
- */
-struct v2_disk_dqdbheader {
-	__le32 dqdh_next_free;	/* Number of next block with free entry */
-	__le32 dqdh_prev_free;	/* Number of previous block with free entry */
-	__le16 dqdh_entries;	/* Number of valid entries in block */
-	__le16 dqdh_pad1;
-	__le32 dqdh_pad2;
-};
-
 #define V2_DQINFOOFF	sizeof(struct v2_disk_dqheader)	/* Offset of info header in file */
 #define V2_DQINFOOFF	sizeof(struct v2_disk_dqheader)	/* Offset of info header in file */
-#define V2_DQBLKSIZE_BITS	10
-#define V2_DQBLKSIZE	(1 << V2_DQBLKSIZE_BITS)	/* Size of block with quota structures */
-#define V2_DQTREEOFF	1		/* Offset of tree in file in blocks */
-#define V2_DQTREEDEPTH	4		/* Depth of quota tree */
-#define V2_DQSTRINBLK	((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk))	/* Number of entries in one blocks */
+#define V2_DQBLKSIZE_BITS 10				/* Size of leaf block in tree */
 
 
 #endif /* _LINUX_QUOTAIO_V2_H */
 #endif /* _LINUX_QUOTAIO_V2_H */

+ 5 - 5
fs/reiserfs/super.c

@@ -649,6 +649,8 @@ static struct dquot_operations reiserfs_quota_operations = {
 	.release_dquot = reiserfs_release_dquot,
 	.release_dquot = reiserfs_release_dquot,
 	.mark_dirty = reiserfs_mark_dquot_dirty,
 	.mark_dirty = reiserfs_mark_dquot_dirty,
 	.write_info = reiserfs_write_info,
 	.write_info = reiserfs_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 };
 
 
 static struct quotactl_ops reiserfs_qctl_operations = {
 static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +996,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		if (c == 'u' || c == 'g') {
 		if (c == 'u' || c == 'g') {
 			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
 			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
 
 
-			if ((sb_any_quota_enabled(s) ||
-			     sb_any_quota_suspended(s)) &&
+			if (sb_any_quota_loaded(s) &&
 			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
 			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
 				reiserfs_warning(s,
 				reiserfs_warning(s,
 						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
 						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1042,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 						 "reiserfs_parse_options: unknown quota format specified.");
 						 "reiserfs_parse_options: unknown quota format specified.");
 				return 0;
 				return 0;
 			}
 			}
-			if ((sb_any_quota_enabled(s) ||
-			     sb_any_quota_suspended(s)) &&
+			if (sb_any_quota_loaded(s) &&
 			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
 			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
 				reiserfs_warning(s,
 				reiserfs_warning(s,
 						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
 						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1067,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 	}
 	}
 	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
 	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
 	if (!(*mount_options & (1 << REISERFS_QUOTA))
 	if (!(*mount_options & (1 << REISERFS_QUOTA))
-	    && sb_any_quota_enabled(s)) {
+	    && sb_any_quota_loaded(s)) {
 		reiserfs_warning(s,
 		reiserfs_warning(s,
 				 "reiserfs_parse_options: quota options must be present when quota is turned on.");
 				 "reiserfs_parse_options: quota options must be present when quota is turned on.");
 		return 0;
 		return 0;

+ 0 - 4
include/linux/Kbuild

@@ -56,8 +56,6 @@ header-y += dlm_device.h
 header-y += dlm_netlink.h
 header-y += dlm_netlink.h
 header-y += dm-ioctl.h
 header-y += dm-ioctl.h
 header-y += dn.h
 header-y += dn.h
-header-y += dqblk_v1.h
-header-y += dqblk_v2.h
 header-y += dqblk_xfs.h
 header-y += dqblk_xfs.h
 header-y += efs_fs_sb.h
 header-y += efs_fs_sb.h
 header-y += elf-fdpic.h
 header-y += elf-fdpic.h
@@ -134,8 +132,6 @@ header-y += posix_types.h
 header-y += ppdev.h
 header-y += ppdev.h
 header-y += prctl.h
 header-y += prctl.h
 header-y += qnxtypes.h
 header-y += qnxtypes.h
-header-y += quotaio_v1.h
-header-y += quotaio_v2.h
 header-y += radeonfb.h
 header-y += radeonfb.h
 header-y += raw.h
 header-y += raw.h
 header-y += resource.h
 header-y += resource.h

+ 56 - 0
include/linux/dqblk_qtree.h

@@ -0,0 +1,56 @@
+/*
+ *	Definitions of structures and functions for quota formats using trie
+ */
+
+#ifndef _LINUX_DQBLK_QTREE_H
+#define _LINUX_DQBLK_QTREE_H
+
+#include <linux/types.h>
+
+/* Numbers of blocks needed for updates - we count with the smallest
+ * possible block size (1024) */
+#define QTREE_INIT_ALLOC 4
+#define QTREE_INIT_REWRITE 2
+#define QTREE_DEL_ALLOC 0
+#define QTREE_DEL_REWRITE 6
+
+struct dquot;
+
+/* Operations */
+struct qtree_fmt_operations {
+	void (*mem2disk_dqblk)(void *disk, struct dquot *dquot);	/* Convert given entry from in memory format to disk one */
+	void (*disk2mem_dqblk)(struct dquot *dquot, void *disk);	/* Convert given entry from disk format to in memory one */
+	int (*is_id)(void *disk, struct dquot *dquot);	/* Is this structure for given id? */
+};
+
+/* Inmemory copy of version specific information */
+struct qtree_mem_dqinfo {
+	struct super_block *dqi_sb;	/* Sb quota is on */
+	int dqi_type;			/* Quota type */
+	unsigned int dqi_blocks;	/* # of blocks in quota file */
+	unsigned int dqi_free_blk;	/* First block in list of free blocks */
+	unsigned int dqi_free_entry;	/* First block with free entry */
+	unsigned int dqi_blocksize_bits;	/* Block size of quota file */
+	unsigned int dqi_entry_size;	/* Size of quota entry in quota file */
+	unsigned int dqi_usable_bs;	/* Space usable in block for quota data */
+	unsigned int dqi_qtree_depth;	/* Precomputed depth of quota tree */
+	struct qtree_fmt_operations *dqi_ops;	/* Operations for entry manipulation */
+};
+
+int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot);
+int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot);
+int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot);
+int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot);
+int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk);
+static inline int qtree_depth(struct qtree_mem_dqinfo *info)
+{
+	unsigned int epb = info->dqi_usable_bs >> 2;
+	unsigned long long entries = epb;
+	int i;
+
+	for (i = 1; entries < (1ULL << 32); i++)
+		entries *= epb;
+	return i;
+}
+
+#endif /* _LINUX_DQBLK_QTREE_H */

+ 0 - 7
include/linux/dqblk_v1.h

@@ -5,9 +5,6 @@
 #ifndef _LINUX_DQBLK_V1_H
 #ifndef _LINUX_DQBLK_V1_H
 #define _LINUX_DQBLK_V1_H
 #define _LINUX_DQBLK_V1_H
 
 
-/* Id of quota format */
-#define QFMT_VFS_OLD 1
-
 /* Root squash turned on */
 /* Root squash turned on */
 #define V1_DQF_RSQUASH 1
 #define V1_DQF_RSQUASH 1
 
 
@@ -17,8 +14,4 @@
 #define V1_DEL_ALLOC 0
 #define V1_DEL_ALLOC 0
 #define V1_DEL_REWRITE 2
 #define V1_DEL_REWRITE 2
 
 
-/* Special information about quotafile */
-struct v1_mem_dqinfo {
-};
-
 #endif	/* _LINUX_DQBLK_V1_H */
 #endif	/* _LINUX_DQBLK_V1_H */

+ 6 - 16
include/linux/dqblk_v2.h

@@ -1,26 +1,16 @@
 /*
 /*
- *	Definitions of structures for vfsv0 quota format
+ *  Definitions for vfsv0 quota format
  */
  */
 
 
 #ifndef _LINUX_DQBLK_V2_H
 #ifndef _LINUX_DQBLK_V2_H
 #define _LINUX_DQBLK_V2_H
 #define _LINUX_DQBLK_V2_H
 
 
-#include <linux/types.h>
-
-/* id numbers of quota format */
-#define QFMT_VFS_V0 2
+#include <linux/dqblk_qtree.h>
 
 
 /* Numbers of blocks needed for updates */
 /* Numbers of blocks needed for updates */
-#define V2_INIT_ALLOC 4
-#define V2_INIT_REWRITE 2
-#define V2_DEL_ALLOC 0
-#define V2_DEL_REWRITE 6
-
-/* Inmemory copy of version specific information */
-struct v2_mem_dqinfo {
-	unsigned int dqi_blocks;
-	unsigned int dqi_free_blk;
-	unsigned int dqi_free_entry;
-};
+#define V2_INIT_ALLOC QTREE_INIT_ALLOC
+#define V2_INIT_REWRITE QTREE_INIT_REWRITE
+#define V2_DEL_ALLOC QTREE_DEL_ALLOC
+#define V2_DEL_REWRITE QTREE_DEL_REWRITE
 
 
 #endif /* _LINUX_DQBLK_V2_H */
 #endif /* _LINUX_DQBLK_V2_H */

+ 32 - 0
include/linux/jbd2.h

@@ -329,6 +329,7 @@ enum jbd_state_bits {
 	BH_State,		/* Pins most journal_head state */
 	BH_State,		/* Pins most journal_head state */
 	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
 	BH_JournalHead,		/* Pins bh->b_private and jh->b_bh */
 	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
 	BH_Unshadow,		/* Dummy bit, for BJ_Shadow wakeup filtering */
+	BH_JBDPrivateStart,	/* First bit available for private use by FS */
 };
 };
 
 
 BUFFER_FNS(JBD, jbd)
 BUFFER_FNS(JBD, jbd)
@@ -1007,6 +1008,35 @@ int __jbd2_journal_clean_checkpoint_list(journal_t *journal);
 int __jbd2_journal_remove_checkpoint(struct journal_head *);
 int __jbd2_journal_remove_checkpoint(struct journal_head *);
 void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
 void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
 
 
+
+/*
+ * Triggers
+ */
+
+struct jbd2_buffer_trigger_type {
+	/*
+	 * Fired just before a buffer is written to the journal.
+	 * mapped_data is a mapped buffer that is the frozen data for
+	 * commit.
+	 */
+	void (*t_commit)(struct jbd2_buffer_trigger_type *type,
+			 struct buffer_head *bh, void *mapped_data,
+			 size_t size);
+
+	/*
+	 * Fired during journal abort for dirty buffers that will not be
+	 * committed.
+	 */
+	void (*t_abort)(struct jbd2_buffer_trigger_type *type,
+			struct buffer_head *bh);
+};
+
+extern void jbd2_buffer_commit_trigger(struct journal_head *jh,
+				       void *mapped_data,
+				       struct jbd2_buffer_trigger_type *triggers);
+extern void jbd2_buffer_abort_trigger(struct journal_head *jh,
+				      struct jbd2_buffer_trigger_type *triggers);
+
 /* Buffer IO */
 /* Buffer IO */
 extern int
 extern int
 jbd2_journal_write_metadata_buffer(transaction_t	  *transaction,
 jbd2_journal_write_metadata_buffer(transaction_t	  *transaction,
@@ -1045,6 +1075,8 @@ extern int	 jbd2_journal_extend (handle_t *, int nblocks);
 extern int	 jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
+void		 jbd2_journal_set_triggers(struct buffer_head *,
+					   struct jbd2_buffer_trigger_type *type);
 extern int	 jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern void	 jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
 extern void	 jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_forget (handle_t *, struct buffer_head *);
 extern int	 jbd2_journal_forget (handle_t *, struct buffer_head *);

+ 8 - 0
include/linux/journal-head.h

@@ -12,6 +12,8 @@
 
 
 typedef unsigned int		tid_t;		/* Unique transaction ID */
 typedef unsigned int		tid_t;		/* Unique transaction ID */
 typedef struct transaction_s	transaction_t;	/* Compound transaction type */
 typedef struct transaction_s	transaction_t;	/* Compound transaction type */
+
+
 struct buffer_head;
 struct buffer_head;
 
 
 struct journal_head {
 struct journal_head {
@@ -87,6 +89,12 @@ struct journal_head {
 	 * [j_list_lock]
 	 * [j_list_lock]
 	 */
 	 */
 	struct journal_head *b_cpnext, *b_cpprev;
 	struct journal_head *b_cpnext, *b_cpprev;
+
+	/* Trigger type */
+	struct jbd2_buffer_trigger_type *b_triggers;
+
+	/* Trigger type for the committing transaction's frozen data */
+	struct jbd2_buffer_trigger_type *b_frozen_triggers;
 };
 };
 
 
 #endif		/* JOURNAL_HEAD_H_INCLUDED */
 #endif		/* JOURNAL_HEAD_H_INCLUDED */

+ 75 - 33
include/linux/quota.h

@@ -36,17 +36,7 @@
 #include <linux/errno.h>
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/types.h>
 
 
-#define __DQUOT_VERSION__	"dquot_6.5.1"
-#define __DQUOT_NUM_VERSION__	6*10000+5*100+1
-
-/* Size of blocks in which are counted size limits */
-#define QUOTABLOCK_BITS 10
-#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
-
-/* Conversion routines from and to quota blocks */
-#define qb2kb(x) ((x) << (QUOTABLOCK_BITS-10))
-#define kb2qb(x) ((x) >> (QUOTABLOCK_BITS-10))
-#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS)
+#define __DQUOT_VERSION__	"dquot_6.5.2"
 
 
 #define MAXQUOTAS 2
 #define MAXQUOTAS 2
 #define USRQUOTA  0		/* element used for user quotas */
 #define USRQUOTA  0		/* element used for user quotas */
@@ -80,16 +70,34 @@
 #define Q_GETQUOTA 0x800007	/* get user quota structure */
 #define Q_GETQUOTA 0x800007	/* get user quota structure */
 #define Q_SETQUOTA 0x800008	/* set user quota structure */
 #define Q_SETQUOTA 0x800008	/* set user quota structure */
 
 
+/* Quota format type IDs */
+#define	QFMT_VFS_OLD 1
+#define	QFMT_VFS_V0 2
+
+/* Size of block in which space limits are passed through the quota
+ * interface */
+#define QIF_DQBLKSIZE_BITS 10
+#define QIF_DQBLKSIZE (1 << QIF_DQBLKSIZE_BITS)
+
 /*
 /*
  * Quota structure used for communication with userspace via quotactl
  * Quota structure used for communication with userspace via quotactl
  * Following flags are used to specify which fields are valid
  * Following flags are used to specify which fields are valid
  */
  */
-#define QIF_BLIMITS	1
-#define QIF_SPACE	2
-#define QIF_ILIMITS	4
-#define QIF_INODES	8
-#define QIF_BTIME	16
-#define QIF_ITIME	32
+enum {
+	QIF_BLIMITS_B = 0,
+	QIF_SPACE_B,
+	QIF_ILIMITS_B,
+	QIF_INODES_B,
+	QIF_BTIME_B,
+	QIF_ITIME_B,
+};
+
+#define QIF_BLIMITS	(1 << QIF_BLIMITS_B)
+#define QIF_SPACE	(1 << QIF_SPACE_B)
+#define QIF_ILIMITS	(1 << QIF_ILIMITS_B)
+#define QIF_INODES	(1 << QIF_INODES_B)
+#define QIF_BTIME	(1 << QIF_BTIME_B)
+#define QIF_ITIME	(1 << QIF_ITIME_B)
 #define QIF_LIMITS	(QIF_BLIMITS | QIF_ILIMITS)
 #define QIF_LIMITS	(QIF_BLIMITS | QIF_ILIMITS)
 #define QIF_USAGE	(QIF_SPACE | QIF_INODES)
 #define QIF_USAGE	(QIF_SPACE | QIF_INODES)
 #define QIF_TIMES	(QIF_BTIME | QIF_ITIME)
 #define QIF_TIMES	(QIF_BTIME | QIF_ITIME)
@@ -172,7 +180,7 @@ enum {
 #include <asm/atomic.h>
 #include <asm/atomic.h>
 
 
 typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */
 typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */
-typedef __u64 qsize_t;          /* Type in which we store sizes */
+typedef long long qsize_t;	/* Type in which we store sizes */
 
 
 extern spinlock_t dq_data_lock;
 extern spinlock_t dq_data_lock;
 
 
@@ -187,12 +195,12 @@ extern spinlock_t dq_data_lock;
  * Data for one user/group kept in memory
  * Data for one user/group kept in memory
  */
  */
 struct mem_dqblk {
 struct mem_dqblk {
-	__u32 dqb_bhardlimit;	/* absolute limit on disk blks alloc */
-	__u32 dqb_bsoftlimit;	/* preferred limit on disk blks */
+	qsize_t dqb_bhardlimit;	/* absolute limit on disk blks alloc */
+	qsize_t dqb_bsoftlimit;	/* preferred limit on disk blks */
 	qsize_t dqb_curspace;	/* current used space */
 	qsize_t dqb_curspace;	/* current used space */
-	__u32 dqb_ihardlimit;	/* absolute limit on allocated inodes */
-	__u32 dqb_isoftlimit;	/* preferred inode limit */
-	__u32 dqb_curinodes;	/* current # allocated inodes */
+	qsize_t dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	qsize_t dqb_isoftlimit;	/* preferred inode limit */
+	qsize_t dqb_curinodes;	/* current # allocated inodes */
 	time_t dqb_btime;	/* time limit for excessive disk use */
 	time_t dqb_btime;	/* time limit for excessive disk use */
 	time_t dqb_itime;	/* time limit for excessive inode use */
 	time_t dqb_itime;	/* time limit for excessive inode use */
 };
 };
@@ -212,10 +220,7 @@ struct mem_dqinfo {
 	unsigned int dqi_igrace;
 	unsigned int dqi_igrace;
 	qsize_t dqi_maxblimit;
 	qsize_t dqi_maxblimit;
 	qsize_t dqi_maxilimit;
 	qsize_t dqi_maxilimit;
-	union {
-		struct v1_mem_dqinfo v1_i;
-		struct v2_mem_dqinfo v2_i;
-	} u;
+	void *dqi_priv;
 };
 };
 
 
 struct super_block;
 struct super_block;
@@ -249,6 +254,11 @@ extern struct dqstats dqstats;
 #define DQ_FAKE_B	3	/* no limits only usage */
 #define DQ_FAKE_B	3	/* no limits only usage */
 #define DQ_READ_B	4	/* dquot was read into memory */
 #define DQ_READ_B	4	/* dquot was read into memory */
 #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
 #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
+#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
+				 * for the mask of entries set via SETQUOTA\
+				 * quotactl. They are set under dq_data_lock\
+				 * and the quota format handling dquot can\
+				 * clear them when it sees fit. */
 
 
 struct dquot {
 struct dquot {
 	struct hlist_node dq_hash;	/* Hash list in memory */
 	struct hlist_node dq_hash;	/* Hash list in memory */
@@ -287,11 +297,13 @@ struct dquot_operations {
 	int (*initialize) (struct inode *, int);
 	int (*initialize) (struct inode *, int);
 	int (*drop) (struct inode *);
 	int (*drop) (struct inode *);
 	int (*alloc_space) (struct inode *, qsize_t, int);
 	int (*alloc_space) (struct inode *, qsize_t, int);
-	int (*alloc_inode) (const struct inode *, unsigned long);
+	int (*alloc_inode) (const struct inode *, qsize_t);
 	int (*free_space) (struct inode *, qsize_t);
 	int (*free_space) (struct inode *, qsize_t);
-	int (*free_inode) (const struct inode *, unsigned long);
+	int (*free_inode) (const struct inode *, qsize_t);
 	int (*transfer) (struct inode *, struct iattr *);
 	int (*transfer) (struct inode *, struct iattr *);
 	int (*write_dquot) (struct dquot *);		/* Ordinary dquot write */
 	int (*write_dquot) (struct dquot *);		/* Ordinary dquot write */
+	struct dquot *(*alloc_dquot)(struct super_block *, int);	/* Allocate memory for new dquot */
+	void (*destroy_dquot)(struct dquot *);		/* Free memory for dquot */
 	int (*acquire_dquot) (struct dquot *);		/* Quota is going to be created on disk */
 	int (*acquire_dquot) (struct dquot *);		/* Quota is going to be created on disk */
 	int (*release_dquot) (struct dquot *);		/* Quota is going to be deleted from disk */
 	int (*release_dquot) (struct dquot *);		/* Quota is going to be deleted from disk */
 	int (*mark_dirty) (struct dquot *);		/* Dquot is marked dirty */
 	int (*mark_dirty) (struct dquot *);		/* Dquot is marked dirty */
@@ -320,12 +332,42 @@ struct quota_format_type {
 	struct quota_format_type *qf_next;
 	struct quota_format_type *qf_next;
 };
 };
 
 
-#define DQUOT_USR_ENABLED	0x01		/* User diskquotas enabled */
-#define DQUOT_GRP_ENABLED	0x02		/* Group diskquotas enabled */
-#define DQUOT_USR_SUSPENDED	0x04		/* User diskquotas are off, but
+/* Quota state flags - they actually come in two flavors - for users and groups */
+enum {
+	_DQUOT_USAGE_ENABLED = 0,		/* Track disk usage for users */
+	_DQUOT_LIMITS_ENABLED,			/* Enforce quota limits for users */
+	_DQUOT_SUSPENDED,			/* User diskquotas are off, but
 						 * we have necessary info in
 						 * we have necessary info in
 						 * memory to turn them on */
 						 * memory to turn them on */
-#define DQUOT_GRP_SUSPENDED	0x08		/* The same for group quotas */
+	_DQUOT_STATE_FLAGS
+};
+#define DQUOT_USAGE_ENABLED	(1 << _DQUOT_USAGE_ENABLED)
+#define DQUOT_LIMITS_ENABLED	(1 << _DQUOT_LIMITS_ENABLED)
+#define DQUOT_SUSPENDED		(1 << _DQUOT_SUSPENDED)
+#define DQUOT_STATE_FLAGS	(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \
+				 DQUOT_SUSPENDED)
+/* Other quota flags */
+#define DQUOT_QUOTA_SYS_FILE	(1 << 6)	/* Quota file is a special
+						 * system file and user cannot
+						 * touch it. Filesystem is
+						 * responsible for setting
+						 * S_NOQUOTA, S_NOATIME flags
+						 */
+#define DQUOT_NEGATIVE_USAGE	(1 << 7)	/* Allow negative quota usage */
+
+static inline unsigned int dquot_state_flag(unsigned int flags, int type)
+{
+	if (type == USRQUOTA)
+		return flags;
+	return flags << _DQUOT_STATE_FLAGS;
+}
+
+static inline unsigned int dquot_generic_flag(unsigned int flags, int type)
+{
+	if (type == USRQUOTA)
+		return flags;
+	return flags >> _DQUOT_STATE_FLAGS;
+}
 
 
 struct quota_info {
 struct quota_info {
 	unsigned int flags;			/* Flags for diskquotas on this device */
 	unsigned int flags;			/* Flags for diskquotas on this device */

+ 76 - 20
include/linux/quotaops.h

@@ -24,12 +24,21 @@ void sync_dquots(struct super_block *sb, int type);
 
 
 int dquot_initialize(struct inode *inode, int type);
 int dquot_initialize(struct inode *inode, int type);
 int dquot_drop(struct inode *inode);
 int dquot_drop(struct inode *inode);
+int dquot_drop_locked(struct inode *inode);
+struct dquot *dqget(struct super_block *sb, unsigned int id, int type);
+void dqput(struct dquot *dquot);
+int dquot_is_cached(struct super_block *sb, unsigned int id, int type);
+int dquot_scan_active(struct super_block *sb,
+		      int (*fn)(struct dquot *dquot, unsigned long priv),
+		      unsigned long priv);
+struct dquot *dquot_alloc(struct super_block *sb, int type);
+void dquot_destroy(struct dquot *dquot);
 
 
 int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc);
 int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc);
-int dquot_alloc_inode(const struct inode *inode, unsigned long number);
+int dquot_alloc_inode(const struct inode *inode, qsize_t number);
 
 
 int dquot_free_space(struct inode *inode, qsize_t number);
 int dquot_free_space(struct inode *inode, qsize_t number);
-int dquot_free_inode(const struct inode *inode, unsigned long number);
+int dquot_free_inode(const struct inode *inode, qsize_t number);
 
 
 int dquot_transfer(struct inode *inode, struct iattr *iattr);
 int dquot_transfer(struct inode *inode, struct iattr *iattr);
 int dquot_commit(struct dquot *dquot);
 int dquot_commit(struct dquot *dquot);
@@ -40,11 +49,14 @@ int dquot_mark_dquot_dirty(struct dquot *dquot);
 
 
 int vfs_quota_on(struct super_block *sb, int type, int format_id,
 int vfs_quota_on(struct super_block *sb, int type, int format_id,
  	char *path, int remount);
  	char *path, int remount);
+int vfs_quota_enable(struct inode *inode, int type, int format_id,
+	unsigned int flags);
 int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
  	struct path *path);
  	struct path *path);
 int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
 int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
  	int format_id, int type);
  	int format_id, int type);
 int vfs_quota_off(struct super_block *sb, int type, int remount);
 int vfs_quota_off(struct super_block *sb, int type, int remount);
+int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags);
 int vfs_quota_sync(struct super_block *sb, int type);
 int vfs_quota_sync(struct super_block *sb, int type);
 int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii);
 int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii);
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii);
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii);
@@ -64,24 +76,22 @@ static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
  * Functions for checking status of quota
  * Functions for checking status of quota
  */
  */
 
 
-static inline int sb_has_quota_enabled(struct super_block *sb, int type)
+static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type)
 {
 {
-	if (type == USRQUOTA)
-		return sb_dqopt(sb)->flags & DQUOT_USR_ENABLED;
-	return sb_dqopt(sb)->flags & DQUOT_GRP_ENABLED;
+	return sb_dqopt(sb)->flags &
+				dquot_state_flag(DQUOT_USAGE_ENABLED, type);
 }
 }
 
 
-static inline int sb_any_quota_enabled(struct super_block *sb)
+static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type)
 {
 {
-	return sb_has_quota_enabled(sb, USRQUOTA) ||
-		sb_has_quota_enabled(sb, GRPQUOTA);
+	return sb_dqopt(sb)->flags &
+				dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
 }
 }
 
 
 static inline int sb_has_quota_suspended(struct super_block *sb, int type)
 static inline int sb_has_quota_suspended(struct super_block *sb, int type)
 {
 {
-	if (type == USRQUOTA)
-		return sb_dqopt(sb)->flags & DQUOT_USR_SUSPENDED;
-	return sb_dqopt(sb)->flags & DQUOT_GRP_SUSPENDED;
+	return sb_dqopt(sb)->flags &
+				dquot_state_flag(DQUOT_SUSPENDED, type);
 }
 }
 
 
 static inline int sb_any_quota_suspended(struct super_block *sb)
 static inline int sb_any_quota_suspended(struct super_block *sb)
@@ -90,6 +100,31 @@ static inline int sb_any_quota_suspended(struct super_block *sb)
 		sb_has_quota_suspended(sb, GRPQUOTA);
 		sb_has_quota_suspended(sb, GRPQUOTA);
 }
 }
 
 
+/* Does kernel know about any quota information for given sb + type? */
+static inline int sb_has_quota_loaded(struct super_block *sb, int type)
+{
+	/* Currently if anything is on, then quota usage is on as well */
+	return sb_has_quota_usage_enabled(sb, type);
+}
+
+static inline int sb_any_quota_loaded(struct super_block *sb)
+{
+	return sb_has_quota_loaded(sb, USRQUOTA) ||
+		sb_has_quota_loaded(sb, GRPQUOTA);
+}
+
+static inline int sb_has_quota_active(struct super_block *sb, int type)
+{
+	return sb_has_quota_loaded(sb, type) &&
+	       !sb_has_quota_suspended(sb, type);
+}
+
+static inline int sb_any_quota_active(struct super_block *sb)
+{
+	return sb_has_quota_active(sb, USRQUOTA) ||
+	       sb_has_quota_active(sb, GRPQUOTA);
+}
+
 /*
 /*
  * Operations supported for diskquotas.
  * Operations supported for diskquotas.
  */
  */
@@ -104,7 +139,7 @@ extern struct quotactl_ops vfs_quotactl_ops;
 static inline void vfs_dq_init(struct inode *inode)
 static inline void vfs_dq_init(struct inode *inode)
 {
 {
 	BUG_ON(!inode->i_sb);
 	BUG_ON(!inode->i_sb);
-	if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode))
+	if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode))
 		inode->i_sb->dq_op->initialize(inode, -1);
 		inode->i_sb->dq_op->initialize(inode, -1);
 }
 }
 
 
@@ -112,7 +147,7 @@ static inline void vfs_dq_init(struct inode *inode)
  * a transaction (deadlocks possible otherwise) */
  * a transaction (deadlocks possible otherwise) */
 static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr)
 static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr)
 {
 {
-	if (sb_any_quota_enabled(inode->i_sb)) {
+	if (sb_any_quota_active(inode->i_sb)) {
 		/* Used space is updated in alloc_space() */
 		/* Used space is updated in alloc_space() */
 		if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA)
 		if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA)
 			return 1;
 			return 1;
@@ -132,7 +167,7 @@ static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr)
 
 
 static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr)
 static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr)
 {
 {
-	if (sb_any_quota_enabled(inode->i_sb)) {
+	if (sb_any_quota_active(inode->i_sb)) {
 		/* Used space is updated in alloc_space() */
 		/* Used space is updated in alloc_space() */
 		if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA)
 		if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA)
 			return 1;
 			return 1;
@@ -152,7 +187,7 @@ static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr)
 
 
 static inline int vfs_dq_alloc_inode(struct inode *inode)
 static inline int vfs_dq_alloc_inode(struct inode *inode)
 {
 {
-	if (sb_any_quota_enabled(inode->i_sb)) {
+	if (sb_any_quota_active(inode->i_sb)) {
 		vfs_dq_init(inode);
 		vfs_dq_init(inode);
 		if (inode->i_sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA)
 		if (inode->i_sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA)
 			return 1;
 			return 1;
@@ -162,7 +197,7 @@ static inline int vfs_dq_alloc_inode(struct inode *inode)
 
 
 static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr)
 static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr)
 {
 {
-	if (sb_any_quota_enabled(inode->i_sb))
+	if (sb_any_quota_active(inode->i_sb))
 		inode->i_sb->dq_op->free_space(inode, nr);
 		inode->i_sb->dq_op->free_space(inode, nr);
 	else
 	else
 		inode_sub_bytes(inode, nr);
 		inode_sub_bytes(inode, nr);
@@ -176,7 +211,7 @@ static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr)
 
 
 static inline void vfs_dq_free_inode(struct inode *inode)
 static inline void vfs_dq_free_inode(struct inode *inode)
 {
 {
-	if (sb_any_quota_enabled(inode->i_sb))
+	if (sb_any_quota_active(inode->i_sb))
 		inode->i_sb->dq_op->free_inode(inode, 1);
 		inode->i_sb->dq_op->free_inode(inode, 1);
 }
 }
 
 
@@ -197,12 +232,12 @@ static inline int vfs_dq_off(struct super_block *sb, int remount)
 
 
 #else
 #else
 
 
-static inline int sb_has_quota_enabled(struct super_block *sb, int type)
+static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type)
 {
 {
 	return 0;
 	return 0;
 }
 }
 
 
-static inline int sb_any_quota_enabled(struct super_block *sb)
+static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type)
 {
 {
 	return 0;
 	return 0;
 }
 }
@@ -217,6 +252,27 @@ static inline int sb_any_quota_suspended(struct super_block *sb)
 	return 0;
 	return 0;
 }
 }
 
 
+/* Does kernel know about any quota information for given sb + type? */
+static inline int sb_has_quota_loaded(struct super_block *sb, int type)
+{
+	return 0;
+}
+
+static inline int sb_any_quota_loaded(struct super_block *sb)
+{
+	return 0;
+}
+
+static inline int sb_has_quota_active(struct super_block *sb, int type)
+{
+	return 0;
+}
+
+static inline int sb_any_quota_active(struct super_block *sb)
+{
+	return 0;
+}
+
 /*
 /*
  * NO-OP when quota not configured.
  * NO-OP when quota not configured.
  */
  */

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików