Browse Source

Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw

* git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw: (34 commits)
  [GFS2] Uncomment sprintf_symbol calling code
  [DLM] lowcomms style
  [GFS2] printk warning fixes
  [GFS2] Patch to fix mmap of stuffed files
  [GFS2] use lib/parser for parsing mount options
  [DLM] Lowcomms nodeid range & initialisation fixes
  [DLM] Fix dlm_lowcoms_stop hang
  [DLM] fix mode munging
  [GFS2] lockdump improvements
  [GFS2] Patch to detect corrupt number of dir entries in leaf and/or inode blocks
  [GFS2] bz 236008: Kernel gpf doing cat /debugfs/gfs2/xxx (lock dump)
  [DLM] fs/dlm/ast.c should #include "ast.h"
  [DLM] Consolidate transport protocols
  [DLM] Remove redundant assignment
  [GFS2] Fix bz 234168 (ignoring rgrp flags)
  [DLM] change lkid format
  [DLM] interface for purge (2/2)
  [DLM] add orphan purging code (1/2)
  [DLM] split create_message function
  [GFS2] Set drop_count to 0 (off) by default
  ...
Linus Torvalds 18 years ago
parent
commit
5cefcab3db

+ 7 - 24
fs/dlm/Kconfig

@@ -3,36 +3,19 @@ menu "Distributed Lock Manager"
 
 
 config DLM
 config DLM
 	tristate "Distributed Lock Manager (DLM)"
 	tristate "Distributed Lock Manager (DLM)"
-	depends on SYSFS && (IPV6 || IPV6=n)
+	depends on IPV6 || IPV6=n
 	select CONFIGFS_FS
 	select CONFIGFS_FS
-	select IP_SCTP if DLM_SCTP
+	select IP_SCTP
 	help
 	help
-	  A general purpose distributed lock manager for kernel or userspace
-	  applications.
-
-choice
-	prompt "Select DLM communications protocol"
-	depends on DLM
-	default DLM_TCP
-	help
-	  The DLM Can use TCP or SCTP for it's network communications.
-	  SCTP supports multi-homed operations whereas TCP doesn't.
-	  However, SCTP seems to have stability problems at the moment.
-
-config DLM_TCP
-	bool "TCP/IP"
-
-config DLM_SCTP
-	bool "SCTP"
-
-endchoice
+	A general purpose distributed lock manager for kernel or userspace
+	applications.
 
 
 config DLM_DEBUG
 config DLM_DEBUG
 	bool "DLM debugging"
 	bool "DLM debugging"
 	depends on DLM
 	depends on DLM
 	help
 	help
-	  Under the debugfs mount point, the name of each lockspace will
-	  appear as a file in the "dlm" directory.  The output is the
-	  list of resource and locks the local node knows about.
+	Under the debugfs mount point, the name of each lockspace will
+	appear as a file in the "dlm" directory.  The output is the
+	list of resource and locks the local node knows about.
 
 
 endmenu
 endmenu

+ 2 - 4
fs/dlm/Makefile

@@ -8,14 +8,12 @@ dlm-y :=			ast.o \
 				member.o \
 				member.o \
 				memory.o \
 				memory.o \
 				midcomms.o \
 				midcomms.o \
+				lowcomms.o \
 				rcom.o \
 				rcom.o \
 				recover.o \
 				recover.o \
 				recoverd.o \
 				recoverd.o \
 				requestqueue.o \
 				requestqueue.o \
 				user.o \
 				user.o \
-				util.o
+				util.o 
 dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
 dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
 
 
-dlm-$(CONFIG_DLM_TCP)   += lowcomms-tcp.o
-
-dlm-$(CONFIG_DLM_SCTP)  += lowcomms-sctp.o

+ 1 - 0
fs/dlm/ast.c

@@ -14,6 +14,7 @@
 #include "dlm_internal.h"
 #include "dlm_internal.h"
 #include "lock.h"
 #include "lock.h"
 #include "user.h"
 #include "user.h"
+#include "ast.h"
 
 
 #define WAKE_ASTS  0
 #define WAKE_ASTS  0
 
 

+ 8 - 2
fs/dlm/config.c

@@ -2,7 +2,7 @@
 *******************************************************************************
 *******************************************************************************
 **
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -89,6 +89,7 @@ struct cluster {
 	unsigned int cl_toss_secs;
 	unsigned int cl_toss_secs;
 	unsigned int cl_scan_secs;
 	unsigned int cl_scan_secs;
 	unsigned int cl_log_debug;
 	unsigned int cl_log_debug;
+	unsigned int cl_protocol;
 };
 };
 
 
 enum {
 enum {
@@ -101,6 +102,7 @@ enum {
 	CLUSTER_ATTR_TOSS_SECS,
 	CLUSTER_ATTR_TOSS_SECS,
 	CLUSTER_ATTR_SCAN_SECS,
 	CLUSTER_ATTR_SCAN_SECS,
 	CLUSTER_ATTR_LOG_DEBUG,
 	CLUSTER_ATTR_LOG_DEBUG,
+	CLUSTER_ATTR_PROTOCOL,
 };
 };
 
 
 struct cluster_attribute {
 struct cluster_attribute {
@@ -159,6 +161,7 @@ CLUSTER_ATTR(recover_timer, 1);
 CLUSTER_ATTR(toss_secs, 1);
 CLUSTER_ATTR(toss_secs, 1);
 CLUSTER_ATTR(scan_secs, 1);
 CLUSTER_ATTR(scan_secs, 1);
 CLUSTER_ATTR(log_debug, 0);
 CLUSTER_ATTR(log_debug, 0);
+CLUSTER_ATTR(protocol, 0);
 
 
 static struct configfs_attribute *cluster_attrs[] = {
 static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
 	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -170,6 +173,7 @@ static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
 	[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
 	[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
 	[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
 	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
 	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
+	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
 	NULL,
 	NULL,
 };
 };
 
 
@@ -904,6 +908,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_TOSS_SECS         10
 #define DEFAULT_TOSS_SECS         10
 #define DEFAULT_SCAN_SECS          5
 #define DEFAULT_SCAN_SECS          5
 #define DEFAULT_LOG_DEBUG          0
 #define DEFAULT_LOG_DEBUG          0
+#define DEFAULT_PROTOCOL           0
 
 
 struct dlm_config_info dlm_config = {
 struct dlm_config_info dlm_config = {
 	.ci_tcp_port = DEFAULT_TCP_PORT,
 	.ci_tcp_port = DEFAULT_TCP_PORT,
@@ -914,6 +919,7 @@ struct dlm_config_info dlm_config = {
 	.ci_recover_timer = DEFAULT_RECOVER_TIMER,
 	.ci_recover_timer = DEFAULT_RECOVER_TIMER,
 	.ci_toss_secs = DEFAULT_TOSS_SECS,
 	.ci_toss_secs = DEFAULT_TOSS_SECS,
 	.ci_scan_secs = DEFAULT_SCAN_SECS,
 	.ci_scan_secs = DEFAULT_SCAN_SECS,
-	.ci_log_debug = DEFAULT_LOG_DEBUG
+	.ci_log_debug = DEFAULT_LOG_DEBUG,
+	.ci_protocol = DEFAULT_PROTOCOL
 };
 };
 
 

+ 2 - 1
fs/dlm/config.h

@@ -2,7 +2,7 @@
 *******************************************************************************
 *******************************************************************************
 **
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -26,6 +26,7 @@ struct dlm_config_info {
 	int ci_toss_secs;
 	int ci_toss_secs;
 	int ci_scan_secs;
 	int ci_scan_secs;
 	int ci_log_debug;
 	int ci_log_debug;
+	int ci_protocol;
 };
 };
 
 
 extern struct dlm_config_info dlm_config;
 extern struct dlm_config_info dlm_config;

+ 9 - 2
fs/dlm/dlm_internal.h

@@ -2,7 +2,7 @@
 *******************************************************************************
 *******************************************************************************
 **
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -210,6 +210,9 @@ struct dlm_args {
 #define DLM_IFL_MSTCPY		0x00010000
 #define DLM_IFL_MSTCPY		0x00010000
 #define DLM_IFL_RESEND		0x00020000
 #define DLM_IFL_RESEND		0x00020000
 #define DLM_IFL_DEAD		0x00040000
 #define DLM_IFL_DEAD		0x00040000
+#define DLM_IFL_OVERLAP_UNLOCK  0x00080000
+#define DLM_IFL_OVERLAP_CANCEL  0x00100000
+#define DLM_IFL_ENDOFLIFE	0x00200000
 #define DLM_IFL_USER		0x00000001
 #define DLM_IFL_USER		0x00000001
 #define DLM_IFL_ORPHAN		0x00000002
 #define DLM_IFL_ORPHAN		0x00000002
 
 
@@ -230,8 +233,8 @@ struct dlm_lkb {
 	int8_t			lkb_grmode;	/* granted lock mode */
 	int8_t			lkb_grmode;	/* granted lock mode */
 	int8_t			lkb_bastmode;	/* requested mode */
 	int8_t			lkb_bastmode;	/* requested mode */
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
-
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
+	int8_t			lkb_wait_count;
 	int8_t			lkb_ast_type;	/* type of ast queued for */
 	int8_t			lkb_ast_type;	/* type of ast queued for */
 
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
@@ -339,6 +342,7 @@ struct dlm_header {
 #define DLM_MSG_LOOKUP		11
 #define DLM_MSG_LOOKUP		11
 #define DLM_MSG_REMOVE		12
 #define DLM_MSG_REMOVE		12
 #define DLM_MSG_LOOKUP_REPLY	13
 #define DLM_MSG_LOOKUP_REPLY	13
+#define DLM_MSG_PURGE		14
 
 
 struct dlm_message {
 struct dlm_message {
 	struct dlm_header	m_header;
 	struct dlm_header	m_header;
@@ -440,6 +444,9 @@ struct dlm_ls {
 	struct mutex		ls_waiters_mutex;
 	struct mutex		ls_waiters_mutex;
 	struct list_head	ls_waiters;	/* lkbs needing a reply */
 	struct list_head	ls_waiters;	/* lkbs needing a reply */
 
 
+	struct mutex		ls_orphans_mutex;
+	struct list_head	ls_orphans;
+
 	struct list_head	ls_nodes;	/* current nodes in ls */
 	struct list_head	ls_nodes;	/* current nodes in ls */
 	struct list_head	ls_nodes_gone;	/* dead node list, recovery */
 	struct list_head	ls_nodes_gone;	/* dead node list, recovery */
 	int			ls_num_nodes;	/* number of nodes in ls */
 	int			ls_num_nodes;	/* number of nodes in ls */

File diff suppressed because it is too large
+ 472 - 112
fs/dlm/lock.c


+ 2 - 0
fs/dlm/lock.h

@@ -41,6 +41,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	uint32_t flags, uint32_t lkid, char *lvb_in);
 	uint32_t flags, uint32_t lkid, char *lvb_in);
 int dlm_user_cancel(struct dlm_ls *ls,  struct dlm_user_args *ua_tmp,
 int dlm_user_cancel(struct dlm_ls *ls,  struct dlm_user_args *ua_tmp,
 	uint32_t flags, uint32_t lkid);
 	uint32_t flags, uint32_t lkid);
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+	int nodeid, int pid);
 void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
 void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
 
 
 static inline int is_master(struct dlm_rsb *r)
 static inline int is_master(struct dlm_rsb *r)

+ 3 - 1
fs/dlm/lockspace.c

@@ -2,7 +2,7 @@
 *******************************************************************************
 *******************************************************************************
 **
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -459,6 +459,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 
 
 	INIT_LIST_HEAD(&ls->ls_waiters);
 	INIT_LIST_HEAD(&ls->ls_waiters);
 	mutex_init(&ls->ls_waiters_mutex);
 	mutex_init(&ls->ls_waiters_mutex);
+	INIT_LIST_HEAD(&ls->ls_orphans);
+	mutex_init(&ls->ls_orphans_mutex);
 
 
 	INIT_LIST_HEAD(&ls->ls_nodes);
 	INIT_LIST_HEAD(&ls->ls_nodes);
 	INIT_LIST_HEAD(&ls->ls_nodes_gone);
 	INIT_LIST_HEAD(&ls->ls_nodes_gone);

+ 0 - 1210
fs/dlm/lowcomms-sctp.c

@@ -1,1210 +0,0 @@
-/******************************************************************************
-*******************************************************************************
-**
-**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
-**
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
-**
-*******************************************************************************
-******************************************************************************/
-
-/*
- * lowcomms.c
- *
- * This is the "low-level" comms layer.
- *
- * It is responsible for sending/receiving messages
- * from other nodes in the cluster.
- *
- * Cluster nodes are referred to by their nodeids. nodeids are
- * simply 32 bit numbers to the locking module - if they need to
- * be expanded for the cluster infrastructure then that is it's
- * responsibility. It is this layer's
- * responsibility to resolve these into IP address or
- * whatever it needs for inter-node communication.
- *
- * The comms level is two kernel threads that deal mainly with
- * the receiving of messages from other nodes and passing them
- * up to the mid-level comms layer (which understands the
- * message format) for execution by the locking core, and
- * a send thread which does all the setting up of connections
- * to remote nodes and the sending of data. Threads are not allowed
- * to send their own data because it may cause them to wait in times
- * of high load. Also, this way, the sending thread can collect together
- * messages bound for one node and send them in one block.
- *
- * I don't see any problem with the recv thread executing the locking
- * code on behalf of remote processes as the locking code is
- * short, efficient and never (well, hardly ever) waits.
- *
- */
-
-#include <asm/ioctls.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-#include <net/sctp/user.h>
-#include <linux/pagemap.h>
-#include <linux/socket.h>
-#include <linux/idr.h>
-
-#include "dlm_internal.h"
-#include "lowcomms.h"
-#include "config.h"
-#include "midcomms.h"
-
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
-static int			dlm_local_count;
-static int			dlm_local_nodeid;
-
-/* One of these per connected node */
-
-#define NI_INIT_PENDING 1
-#define NI_WRITE_PENDING 2
-
-struct nodeinfo {
-	spinlock_t		lock;
-	sctp_assoc_t		assoc_id;
-	unsigned long		flags;
-	struct list_head	write_list; /* nodes with pending writes */
-	struct list_head	writequeue; /* outgoing writequeue_entries */
-	spinlock_t		writequeue_lock;
-	int			nodeid;
-	struct work_struct      swork; /* Send workqueue */
-	struct work_struct      lwork; /* Locking workqueue */
-};
-
-static DEFINE_IDR(nodeinfo_idr);
-static DECLARE_RWSEM(nodeinfo_lock);
-static int max_nodeid;
-
-struct cbuf {
-	unsigned int base;
-	unsigned int len;
-	unsigned int mask;
-};
-
-/* Just the one of these, now. But this struct keeps
-   the connection-specific variables together */
-
-#define CF_READ_PENDING 1
-
-struct connection {
-	struct socket           *sock;
-	unsigned long		flags;
-	struct page             *rx_page;
-	atomic_t		waiting_requests;
-	struct cbuf		cb;
-	int                     eagain_flag;
-	struct work_struct      work; /* Send workqueue */
-};
-
-/* An entry waiting to be sent */
-
-struct writequeue_entry {
-	struct list_head	list;
-	struct page             *page;
-	int			offset;
-	int			len;
-	int			end;
-	int			users;
-	struct nodeinfo         *ni;
-};
-
-static void cbuf_add(struct cbuf *cb, int n)
-{
-	cb->len += n;
-}
-
-static int cbuf_data(struct cbuf *cb)
-{
-	return ((cb->base + cb->len) & cb->mask);
-}
-
-static void cbuf_init(struct cbuf *cb, int size)
-{
-	cb->base = cb->len = 0;
-	cb->mask = size-1;
-}
-
-static void cbuf_eat(struct cbuf *cb, int n)
-{
-	cb->len  -= n;
-	cb->base += n;
-	cb->base &= cb->mask;
-}
-
-/* List of nodes which have writes pending */
-static LIST_HEAD(write_nodes);
-static DEFINE_SPINLOCK(write_nodes_lock);
-
-
-/* Maximum number of incoming messages to process before
- * doing a schedule()
- */
-#define MAX_RX_MSG_COUNT 25
-
-/* Work queues */
-static struct workqueue_struct *recv_workqueue;
-static struct workqueue_struct *send_workqueue;
-static struct workqueue_struct *lock_workqueue;
-
-/* The SCTP connection */
-static struct connection sctp_con;
-
-static void process_send_sockets(struct work_struct *work);
-static void process_recv_sockets(struct work_struct *work);
-static void process_lock_request(struct work_struct *work);
-
-static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
-{
-	struct sockaddr_storage addr;
-	int error;
-
-	if (!dlm_local_count)
-		return -1;
-
-	error = dlm_nodeid_to_addr(nodeid, &addr);
-	if (error)
-		return error;
-
-	if (dlm_local_addr[0]->ss_family == AF_INET) {
-		struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
-		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
-		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
-	} else {
-		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
-		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
-		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
-		       sizeof(in6->sin6_addr));
-	}
-
-	return 0;
-}
-
-/* If alloc is 0 here we will not attempt to allocate a new
-   nodeinfo struct */
-static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
-{
-	struct nodeinfo *ni;
-	int r;
-	int n;
-
-	down_read(&nodeinfo_lock);
-	ni = idr_find(&nodeinfo_idr, nodeid);
-	up_read(&nodeinfo_lock);
-
-	if (ni || !alloc)
-		return ni;
-
-	down_write(&nodeinfo_lock);
-
-	ni = idr_find(&nodeinfo_idr, nodeid);
-	if (ni)
-		goto out_up;
-
-	r = idr_pre_get(&nodeinfo_idr, alloc);
-	if (!r)
-		goto out_up;
-
-	ni = kmalloc(sizeof(struct nodeinfo), alloc);
-	if (!ni)
-		goto out_up;
-
-	r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
-	if (r) {
-		kfree(ni);
-		ni = NULL;
-		goto out_up;
-	}
-	if (n != nodeid) {
-		idr_remove(&nodeinfo_idr, n);
-		kfree(ni);
-		ni = NULL;
-		goto out_up;
-	}
-	memset(ni, 0, sizeof(struct nodeinfo));
-	spin_lock_init(&ni->lock);
-	INIT_LIST_HEAD(&ni->writequeue);
-	spin_lock_init(&ni->writequeue_lock);
-	INIT_WORK(&ni->lwork, process_lock_request);
-	INIT_WORK(&ni->swork, process_send_sockets);
-	ni->nodeid = nodeid;
-
-	if (nodeid > max_nodeid)
-		max_nodeid = nodeid;
-out_up:
-	up_write(&nodeinfo_lock);
-
-	return ni;
-}
-
-/* Don't call this too often... */
-static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
-{
-	int i;
-	struct nodeinfo *ni;
-
-	for (i=1; i<=max_nodeid; i++) {
-		ni = nodeid2nodeinfo(i, 0);
-		if (ni && ni->assoc_id == assoc)
-			return ni;
-	}
-	return NULL;
-}
-
-/* Data or notification available on socket */
-static void lowcomms_data_ready(struct sock *sk, int count_unused)
-{
-	if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
-		queue_work(recv_workqueue, &sctp_con.work);
-}
-
-
-/* Add the port number to an IP6 or 4 sockaddr and return the address length.
-   Also padd out the struct with zeros to make comparisons meaningful */
-
-static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
-			  int *addr_len)
-{
-	struct sockaddr_in *local4_addr;
-	struct sockaddr_in6 *local6_addr;
-
-	if (!dlm_local_count)
-		return;
-
-	if (!port) {
-		if (dlm_local_addr[0]->ss_family == AF_INET) {
-			local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
-			port = be16_to_cpu(local4_addr->sin_port);
-		} else {
-			local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
-			port = be16_to_cpu(local6_addr->sin6_port);
-		}
-	}
-
-	saddr->ss_family = dlm_local_addr[0]->ss_family;
-	if (dlm_local_addr[0]->ss_family == AF_INET) {
-		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
-		in4_addr->sin_port = cpu_to_be16(port);
-		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
-		memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
-		       sizeof(struct sockaddr_in));
-		*addr_len = sizeof(struct sockaddr_in);
-	} else {
-		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
-		in6_addr->sin6_port = cpu_to_be16(port);
-		memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
-		       sizeof(struct sockaddr_in6));
-		*addr_len = sizeof(struct sockaddr_in6);
-	}
-}
-
-/* Close the connection and tidy up */
-static void close_connection(void)
-{
-	if (sctp_con.sock) {
-		sock_release(sctp_con.sock);
-		sctp_con.sock = NULL;
-	}
-
-	if (sctp_con.rx_page) {
-		__free_page(sctp_con.rx_page);
-		sctp_con.rx_page = NULL;
-	}
-}
-
-/* We only send shutdown messages to nodes that are not part of the cluster */
-static void send_shutdown(sctp_assoc_t associd)
-{
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct msghdr outmessage;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int ret;
-
-	outmessage.msg_name = NULL;
-	outmessage.msg_namelen = 0;
-	outmessage.msg_control = outcmsg;
-	outmessage.msg_controllen = sizeof(outcmsg);
-	outmessage.msg_flags = MSG_EOR;
-
-	cmsg = CMSG_FIRSTHDR(&outmessage);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	outmessage.msg_controllen = cmsg->cmsg_len;
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-
-	sinfo->sinfo_flags |= MSG_EOF;
-	sinfo->sinfo_assoc_id = associd;
-
-	ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
-
-	if (ret != 0)
-		log_print("send EOF to node failed: %d", ret);
-}
-
-
-/* INIT failed but we don't know which node...
-   restart INIT on all pending nodes */
-static void init_failed(void)
-{
-	int i;
-	struct nodeinfo *ni;
-
-	for (i=1; i<=max_nodeid; i++) {
-		ni = nodeid2nodeinfo(i, 0);
-		if (!ni)
-			continue;
-
-		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
-			ni->assoc_id = 0;
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-	}
-}
-
-/* Something happened to an association */
-static void process_sctp_notification(struct msghdr *msg, char *buf)
-{
-	union sctp_notification *sn = (union sctp_notification *)buf;
-
-	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
-		switch (sn->sn_assoc_change.sac_state) {
-
-		case SCTP_COMM_UP:
-		case SCTP_RESTART:
-		{
-			/* Check that the new node is in the lockspace */
-			struct sctp_prim prim;
-			mm_segment_t fs;
-			int nodeid;
-			int prim_len, ret;
-			int addr_len;
-			struct nodeinfo *ni;
-
-			/* This seems to happen when we received a connection
-			 * too early... or something...  anyway, it happens but
-			 * we always seem to get a real message too, see
-			 * receive_from_sock */
-
-			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
-				log_print("COMM_UP for invalid assoc ID %d",
-					  (int)sn->sn_assoc_change.sac_assoc_id);
-				init_failed();
-				return;
-			}
-			memset(&prim, 0, sizeof(struct sctp_prim));
-			prim_len = sizeof(struct sctp_prim);
-			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
-
-			fs = get_fs();
-			set_fs(get_ds());
-			ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
-							     IPPROTO_SCTP,
-							     SCTP_PRIMARY_ADDR,
-							     (char*)&prim,
-							     &prim_len);
-			set_fs(fs);
-			if (ret < 0) {
-				struct nodeinfo *ni;
-
-				log_print("getsockopt/sctp_primary_addr on "
-					  "new assoc %d failed : %d",
-					  (int)sn->sn_assoc_change.sac_assoc_id,
-					  ret);
-
-				/* Retry INIT later */
-				ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
-				if (ni)
-					clear_bit(NI_INIT_PENDING, &ni->flags);
-				return;
-			}
-			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
-			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
-				log_print("reject connect from unknown addr");
-				send_shutdown(prim.ssp_assoc_id);
-				return;
-			}
-
-			ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
-			if (!ni)
-				return;
-
-			/* Save the assoc ID */
-			ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
-
-			log_print("got new/restarted association %d nodeid %d",
-				  (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
-
-			/* Send any pending writes */
-			clear_bit(NI_INIT_PENDING, &ni->flags);
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-		break;
-
-		case SCTP_COMM_LOST:
-		case SCTP_SHUTDOWN_COMP:
-		{
-			struct nodeinfo *ni;
-
-			ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
-			if (ni) {
-				spin_lock(&ni->lock);
-				ni->assoc_id = 0;
-				spin_unlock(&ni->lock);
-			}
-		}
-		break;
-
-		/* We don't know which INIT failed, so clear the PENDING flags
-		 * on them all.  if assoc_id is zero then it will then try
-		 * again */
-
-		case SCTP_CANT_STR_ASSOC:
-		{
-			log_print("Can't start SCTP association - retrying");
-			init_failed();
-		}
-		break;
-
-		default:
-			log_print("unexpected SCTP assoc change id=%d state=%d",
-				  (int)sn->sn_assoc_change.sac_assoc_id,
-				  sn->sn_assoc_change.sac_state);
-		}
-	}
-}
-
-/* Data received from remote end */
-static int receive_from_sock(void)
-{
-	int ret = 0;
-	struct msghdr msg;
-	struct kvec iov[2];
-	unsigned len;
-	int r;
-	struct sctp_sndrcvinfo *sinfo;
-	struct cmsghdr *cmsg;
-	struct nodeinfo *ni;
-
-	/* These two are marginally too big for stack allocation, but this
-	 * function is (currently) only called by dlm_recvd so static should be
-	 * OK.
-	 */
-	static struct sockaddr_storage msgname;
-	static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-
-	if (sctp_con.sock == NULL)
-		goto out;
-
-	if (sctp_con.rx_page == NULL) {
-		/*
-		 * This doesn't need to be atomic, but I think it should
-		 * improve performance if it is.
-		 */
-		sctp_con.rx_page = alloc_page(GFP_ATOMIC);
-		if (sctp_con.rx_page == NULL)
-			goto out_resched;
-		cbuf_init(&sctp_con.cb, PAGE_CACHE_SIZE);
-	}
-
-	memset(&incmsg, 0, sizeof(incmsg));
-	memset(&msgname, 0, sizeof(msgname));
-
-	msg.msg_name = &msgname;
-	msg.msg_namelen = sizeof(msgname);
-	msg.msg_flags = 0;
-	msg.msg_control = incmsg;
-	msg.msg_controllen = sizeof(incmsg);
-	msg.msg_iovlen = 1;
-
-	/* I don't see why this circular buffer stuff is necessary for SCTP
-	 * which is a packet-based protocol, but the whole thing breaks under
-	 * load without it! The overhead is minimal (and is in the TCP lowcomms
-	 * anyway, of course) so I'll leave it in until I can figure out what's
-	 * really happening.
-	 */
-
-	/*
-	 * iov[0] is the bit of the circular buffer between the current end
-	 * point (cb.base + cb.len) and the end of the buffer.
-	 */
-	iov[0].iov_len = sctp_con.cb.base - cbuf_data(&sctp_con.cb);
-	iov[0].iov_base = page_address(sctp_con.rx_page) +
-		cbuf_data(&sctp_con.cb);
-	iov[1].iov_len = 0;
-
-	/*
-	 * iov[1] is the bit of the circular buffer between the start of the
-	 * buffer and the start of the currently used section (cb.base)
-	 */
-	if (cbuf_data(&sctp_con.cb) >= sctp_con.cb.base) {
-		iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&sctp_con.cb);
-		iov[1].iov_len = sctp_con.cb.base;
-		iov[1].iov_base = page_address(sctp_con.rx_page);
-		msg.msg_iovlen = 2;
-	}
-	len = iov[0].iov_len + iov[1].iov_len;
-
-	r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
-				 MSG_NOSIGNAL | MSG_DONTWAIT);
-	if (ret <= 0)
-		goto out_close;
-
-	msg.msg_control = incmsg;
-	msg.msg_controllen = sizeof(incmsg);
-	cmsg = CMSG_FIRSTHDR(&msg);
-	sinfo = CMSG_DATA(cmsg);
-
-	if (msg.msg_flags & MSG_NOTIFICATION) {
-		process_sctp_notification(&msg, page_address(sctp_con.rx_page));
-		return 0;
-	}
-
-	/* Is this a new association ? */
-	ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
-	if (ni) {
-		ni->assoc_id = sinfo->sinfo_assoc_id;
-		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
-
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-	}
-
-	/* INIT sends a message with length of 1 - ignore it */
-	if (r == 1)
-		return 0;
-
-	cbuf_add(&sctp_con.cb, ret);
-	// PJC: TODO: Add to node's workqueue....can we ??
-	ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
-					  page_address(sctp_con.rx_page),
-					  sctp_con.cb.base, sctp_con.cb.len,
-					  PAGE_CACHE_SIZE);
-	if (ret < 0)
-		goto out_close;
-	cbuf_eat(&sctp_con.cb, ret);
-
-out:
-	ret = 0;
-	goto out_ret;
-
-out_resched:
-	lowcomms_data_ready(sctp_con.sock->sk, 0);
-	ret = 0;
-	cond_resched();
-	goto out_ret;
-
-out_close:
-	if (ret != -EAGAIN)
-		log_print("error reading from sctp socket: %d", ret);
-out_ret:
-	return ret;
-}
-
-/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
-static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
-{
-	mm_segment_t fs;
-	int result = 0;
-
-	fs = get_fs();
-	set_fs(get_ds());
-	if (num == 1)
-		result = sctp_con.sock->ops->bind(sctp_con.sock,
-						  (struct sockaddr *) addr,
-						  addr_len);
-	else
-		result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
-							SCTP_SOCKOPT_BINDX_ADD,
-							(char *)addr, addr_len);
-	set_fs(fs);
-
-	if (result < 0)
-		log_print("Can't bind to port %d addr number %d",
-			  dlm_config.ci_tcp_port, num);
-
-	return result;
-}
-
-static void init_local(void)
-{
-	struct sockaddr_storage sas, *addr;
-	int i;
-
-	dlm_local_nodeid = dlm_our_nodeid();
-
-	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
-		if (dlm_our_addr(&sas, i))
-			break;
-
-		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
-		if (!addr)
-			break;
-		memcpy(addr, &sas, sizeof(*addr));
-		dlm_local_addr[dlm_local_count++] = addr;
-	}
-}
-
-/* Initialise SCTP socket and bind to all interfaces */
-static int init_sock(void)
-{
-	mm_segment_t fs;
-	struct socket *sock = NULL;
-	struct sockaddr_storage localaddr;
-	struct sctp_event_subscribe subscribe;
-	int result = -EINVAL, num = 1, i, addr_len;
-
-	if (!dlm_local_count) {
-		init_local();
-		if (!dlm_local_count) {
-			log_print("no local IP address has been set");
-			goto out;
-		}
-	}
-
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
-				  IPPROTO_SCTP, &sock);
-	if (result < 0) {
-		log_print("Can't create comms socket, check SCTP is loaded");
-		goto out;
-	}
-
-	/* Listen for events */
-	memset(&subscribe, 0, sizeof(subscribe));
-	subscribe.sctp_data_io_event = 1;
-	subscribe.sctp_association_event = 1;
-	subscribe.sctp_send_failure_event = 1;
-	subscribe.sctp_shutdown_event = 1;
-	subscribe.sctp_partial_delivery_event = 1;
-
-	fs = get_fs();
-	set_fs(get_ds());
-	result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
-				       (char *)&subscribe, sizeof(subscribe));
-	set_fs(fs);
-
-	if (result < 0) {
-		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
-			  result);
-		goto create_delsock;
-	}
-
-	/* Init con struct */
-	sock->sk->sk_user_data = &sctp_con;
-	sctp_con.sock = sock;
-	sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
-
-	/* Bind to all interfaces. */
-	for (i = 0; i < dlm_local_count; i++) {
-		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
-		make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
-
-		result = add_bind_addr(&localaddr, addr_len, num);
-		if (result)
-			goto create_delsock;
-		++num;
-	}
-
-	result = sock->ops->listen(sock, 5);
-	if (result < 0) {
-		log_print("Can't set socket listening");
-		goto create_delsock;
-	}
-
-	return 0;
-
-create_delsock:
-	sock_release(sock);
-	sctp_con.sock = NULL;
-out:
-	return result;
-}
-
-
-static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
-{
-	struct writequeue_entry *entry;
-
-	entry = kmalloc(sizeof(struct writequeue_entry), allocation);
-	if (!entry)
-		return NULL;
-
-	entry->page = alloc_page(allocation);
-	if (!entry->page) {
-		kfree(entry);
-		return NULL;
-	}
-
-	entry->offset = 0;
-	entry->len = 0;
-	entry->end = 0;
-	entry->users = 0;
-
-	return entry;
-}
-
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
-{
-	struct writequeue_entry *e;
-	int offset = 0;
-	int users = 0;
-	struct nodeinfo *ni;
-
-	ni = nodeid2nodeinfo(nodeid, allocation);
-	if (!ni)
-		return NULL;
-
-	spin_lock(&ni->writequeue_lock);
-	e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
-	if ((&e->list == &ni->writequeue) ||
-	    (PAGE_CACHE_SIZE - e->end < len)) {
-		e = NULL;
-	} else {
-		offset = e->end;
-		e->end += len;
-		users = e->users++;
-	}
-	spin_unlock(&ni->writequeue_lock);
-
-	if (e) {
-	got_one:
-		if (users == 0)
-			kmap(e->page);
-		*ppc = page_address(e->page) + offset;
-		return e;
-	}
-
-	e = new_writequeue_entry(allocation);
-	if (e) {
-		spin_lock(&ni->writequeue_lock);
-		offset = e->end;
-		e->end += len;
-		e->ni = ni;
-		users = e->users++;
-		list_add_tail(&e->list, &ni->writequeue);
-		spin_unlock(&ni->writequeue_lock);
-		goto got_one;
-	}
-	return NULL;
-}
-
-void dlm_lowcomms_commit_buffer(void *arg)
-{
-	struct writequeue_entry *e = (struct writequeue_entry *) arg;
-	int users;
-	struct nodeinfo *ni = e->ni;
-
-	spin_lock(&ni->writequeue_lock);
-	users = --e->users;
-	if (users)
-		goto out;
-	e->len = e->end - e->offset;
-	kunmap(e->page);
-	spin_unlock(&ni->writequeue_lock);
-
-	if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-		spin_lock_bh(&write_nodes_lock);
-		list_add_tail(&ni->write_list, &write_nodes);
-		spin_unlock_bh(&write_nodes_lock);
-
-		queue_work(send_workqueue, &ni->swork);
-	}
-	return;
-
-out:
-	spin_unlock(&ni->writequeue_lock);
-	return;
-}
-
-static void free_entry(struct writequeue_entry *e)
-{
-	__free_page(e->page);
-	kfree(e);
-}
-
-/* Initiate an SCTP association. In theory we could just use sendmsg() on
-   the first IP address and it should work, but this allows us to set up the
-   association before sending any valuable data that we can't afford to lose.
-   It also keeps the send path clean as it can now always use the association ID */
-static void initiate_association(int nodeid)
-{
-	struct sockaddr_storage rem_addr;
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct msghdr outmessage;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int ret;
-	int addrlen;
-	char buf[1];
-	struct kvec iov[1];
-	struct nodeinfo *ni;
-
-	log_print("Initiating association with node %d", nodeid);
-
-	ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
-	if (!ni)
-		return;
-
-	if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
-		log_print("no address for nodeid %d", nodeid);
-		return;
-	}
-
-	make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
-
-	outmessage.msg_name = &rem_addr;
-	outmessage.msg_namelen = addrlen;
-	outmessage.msg_control = outcmsg;
-	outmessage.msg_controllen = sizeof(outcmsg);
-	outmessage.msg_flags = MSG_EOR;
-
-	iov[0].iov_base = buf;
-	iov[0].iov_len = 1;
-
-	/* Real INIT messages seem to cause trouble. Just send a 1 byte message
-	   we can afford to lose */
-	cmsg = CMSG_FIRSTHDR(&outmessage);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
-
-	outmessage.msg_controllen = cmsg->cmsg_len;
-	ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
-	if (ret < 0) {
-		log_print("send INIT to node failed: %d", ret);
-		/* Try again later */
-		clear_bit(NI_INIT_PENDING, &ni->flags);
-	}
-}
-
-/* Send a message */
-static void send_to_sock(struct nodeinfo *ni)
-{
-	int ret = 0;
-	struct writequeue_entry *e;
-	int len, offset;
-	struct msghdr outmsg;
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	struct kvec iov;
-
-	/* See if we need to init an association before we start
-	   sending precious messages */
-	spin_lock(&ni->lock);
-	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
-		spin_unlock(&ni->lock);
-		initiate_association(ni->nodeid);
-		return;
-	}
-	spin_unlock(&ni->lock);
-
-	outmsg.msg_name = NULL; /* We use assoc_id */
-	outmsg.msg_namelen = 0;
-	outmsg.msg_control = outcmsg;
-	outmsg.msg_controllen = sizeof(outcmsg);
-	outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
-
-	cmsg = CMSG_FIRSTHDR(&outmsg);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
-	sinfo->sinfo_assoc_id = ni->assoc_id;
-	outmsg.msg_controllen = cmsg->cmsg_len;
-
-	spin_lock(&ni->writequeue_lock);
-	for (;;) {
-		if (list_empty(&ni->writequeue))
-			break;
-		e = list_entry(ni->writequeue.next, struct writequeue_entry,
-			       list);
-		len = e->len;
-		offset = e->offset;
-		BUG_ON(len == 0 && e->users == 0);
-		spin_unlock(&ni->writequeue_lock);
-		kmap(e->page);
-
-		ret = 0;
-		if (len) {
-			iov.iov_base = page_address(e->page)+offset;
-			iov.iov_len = len;
-
-			ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
-					     len);
-			if (ret == -EAGAIN) {
-				sctp_con.eagain_flag = 1;
-				goto out;
-			} else if (ret < 0)
-				goto send_error;
-		} else {
-			/* Don't starve people filling buffers */
-			cond_resched();
-		}
-
-		spin_lock(&ni->writequeue_lock);
-		e->offset += ret;
-		e->len -= ret;
-
-		if (e->len == 0 && e->users == 0) {
-			list_del(&e->list);
-			kunmap(e->page);
-			free_entry(e);
-			continue;
-		}
-	}
-	spin_unlock(&ni->writequeue_lock);
-out:
-	return;
-
-send_error:
-	log_print("Error sending to node %d %d", ni->nodeid, ret);
-	spin_lock(&ni->lock);
-	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
-		ni->assoc_id = 0;
-		spin_unlock(&ni->lock);
-		initiate_association(ni->nodeid);
-	} else
-		spin_unlock(&ni->lock);
-
-	return;
-}
-
-/* Try to send any messages that are pending */
-static void process_output_queue(void)
-{
-	struct list_head *list;
-	struct list_head *temp;
-
-	spin_lock_bh(&write_nodes_lock);
-	list_for_each_safe(list, temp, &write_nodes) {
-		struct nodeinfo *ni =
-			list_entry(list, struct nodeinfo, write_list);
-		clear_bit(NI_WRITE_PENDING, &ni->flags);
-		list_del(&ni->write_list);
-
-		spin_unlock_bh(&write_nodes_lock);
-
-		send_to_sock(ni);
-		spin_lock_bh(&write_nodes_lock);
-	}
-	spin_unlock_bh(&write_nodes_lock);
-}
-
-/* Called after we've had -EAGAIN and been woken up */
-static void refill_write_queue(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-
-		if (ni) {
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-			}
-		}
-	}
-}
-
-static void clean_one_writequeue(struct nodeinfo *ni)
-{
-	struct list_head *list;
-	struct list_head *temp;
-
-	spin_lock(&ni->writequeue_lock);
-	list_for_each_safe(list, temp, &ni->writequeue) {
-		struct writequeue_entry *e =
-			list_entry(list, struct writequeue_entry, list);
-		list_del(&e->list);
-		free_entry(e);
-	}
-	spin_unlock(&ni->writequeue_lock);
-}
-
-static void clean_writequeues(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-		if (ni)
-			clean_one_writequeue(ni);
-	}
-}
-
-
-static void dealloc_nodeinfo(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-		if (ni) {
-			idr_remove(&nodeinfo_idr, i);
-			kfree(ni);
-		}
-	}
-}
-
-int dlm_lowcomms_close(int nodeid)
-{
-	struct nodeinfo *ni;
-
-	ni = nodeid2nodeinfo(nodeid, 0);
-	if (!ni)
-		return -1;
-
-	spin_lock(&ni->lock);
-	if (ni->assoc_id) {
-		ni->assoc_id = 0;
-		/* Don't send shutdown here, sctp will just queue it
-		   till the node comes back up! */
-	}
-	spin_unlock(&ni->lock);
-
-	clean_one_writequeue(ni);
-	clear_bit(NI_INIT_PENDING, &ni->flags);
-	return 0;
-}
-
-// PJC: The work queue function for receiving.
-static void process_recv_sockets(struct work_struct *work)
-{
-	if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
-		int ret;
-		int count = 0;
-
-		do {
-			ret = receive_from_sock();
-
-			/* Don't starve out everyone else */
-			if (++count >= MAX_RX_MSG_COUNT) {
-				cond_resched();
-				count = 0;
-			}
-		} while (!kthread_should_stop() && ret >=0);
-	}
-	cond_resched();
-}
-
-// PJC: the work queue function for sending
-static void process_send_sockets(struct work_struct *work)
-{
-	if (sctp_con.eagain_flag) {
-		sctp_con.eagain_flag = 0;
-		refill_write_queue();
-	}
-	process_output_queue();
-}
-
-// PJC: Process lock requests from a particular node.
-// TODO: can we optimise this out on UP ??
-static void process_lock_request(struct work_struct *work)
-{
-}
-
-static void daemons_stop(void)
-{
-	destroy_workqueue(recv_workqueue);
-	destroy_workqueue(send_workqueue);
-	destroy_workqueue(lock_workqueue);
-}
-
-static int daemons_start(void)
-{
-	int error;
-	recv_workqueue = create_workqueue("dlm_recv");
-	error = IS_ERR(recv_workqueue);
-	if (error) {
-		log_print("can't start dlm_recv %d", error);
-		return error;
-	}
-
-	send_workqueue = create_singlethread_workqueue("dlm_send");
-	error = IS_ERR(send_workqueue);
-	if (error) {
-		log_print("can't start dlm_send %d", error);
-		destroy_workqueue(recv_workqueue);
-		return error;
-	}
-
-	lock_workqueue = create_workqueue("dlm_rlock");
-	error = IS_ERR(lock_workqueue);
-	if (error) {
-		log_print("can't start dlm_rlock %d", error);
-		destroy_workqueue(send_workqueue);
-		destroy_workqueue(recv_workqueue);
-		return error;
-	}
-
-	return 0;
-}
-
-/*
- * This is quite likely to sleep...
- */
-int dlm_lowcomms_start(void)
-{
-	int error;
-
-	INIT_WORK(&sctp_con.work, process_recv_sockets);
-
-	error = init_sock();
-	if (error)
-		goto fail_sock;
-	error = daemons_start();
-	if (error)
-		goto fail_sock;
-	return 0;
-
-fail_sock:
-	close_connection();
-	return error;
-}
-
-void dlm_lowcomms_stop(void)
-{
-	int i;
-
-	sctp_con.flags = 0x7;
-	daemons_stop();
-	clean_writequeues();
-	close_connection();
-	dealloc_nodeinfo();
-	max_nodeid = 0;
-
-	dlm_local_count = 0;
-	dlm_local_nodeid = 0;
-
-	for (i = 0; i < dlm_local_count; i++)
-		kfree(dlm_local_addr[i]);
-}

File diff suppressed because it is too large
+ 585 - 108
fs/dlm/lowcomms.c


+ 107 - 56
fs/dlm/user.c

@@ -1,5 +1,5 @@
 /*
 /*
- * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2007 Red Hat, Inc.  All rights reserved.
  *
  *
  * This copyrighted material is made available to anyone wishing to use,
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@ struct dlm_write_request32 {
 	union  {
 	union  {
 		struct dlm_lock_params32 lock;
 		struct dlm_lock_params32 lock;
 		struct dlm_lspace_params lspace;
 		struct dlm_lspace_params lspace;
+		struct dlm_purge_params purge;
 	} i;
 	} i;
 };
 };
 
 
@@ -92,6 +93,9 @@ static void compat_input(struct dlm_write_request *kb,
 		kb->i.lspace.flags = kb32->i.lspace.flags;
 		kb->i.lspace.flags = kb32->i.lspace.flags;
 		kb->i.lspace.minor = kb32->i.lspace.minor;
 		kb->i.lspace.minor = kb32->i.lspace.minor;
 		strcpy(kb->i.lspace.name, kb32->i.lspace.name);
 		strcpy(kb->i.lspace.name, kb32->i.lspace.name);
+	} else if (kb->cmd == DLM_USER_PURGE) {
+		kb->i.purge.nodeid = kb32->i.purge.nodeid;
+		kb->i.purge.pid = kb32->i.purge.pid;
 	} else {
 	} else {
 		kb->i.lock.mode = kb32->i.lock.mode;
 		kb->i.lock.mode = kb32->i.lock.mode;
 		kb->i.lock.namelen = kb32->i.lock.namelen;
 		kb->i.lock.namelen = kb32->i.lock.namelen;
@@ -111,8 +115,6 @@ static void compat_input(struct dlm_write_request *kb,
 static void compat_output(struct dlm_lock_result *res,
 static void compat_output(struct dlm_lock_result *res,
 			  struct dlm_lock_result32 *res32)
 			  struct dlm_lock_result32 *res32)
 {
 {
-	res32->length = res->length - (sizeof(struct dlm_lock_result) -
-				       sizeof(struct dlm_lock_result32));
 	res32->user_astaddr = (__u32)(long)res->user_astaddr;
 	res32->user_astaddr = (__u32)(long)res->user_astaddr;
 	res32->user_astparam = (__u32)(long)res->user_astparam;
 	res32->user_astparam = (__u32)(long)res->user_astparam;
 	res32->user_lksb = (__u32)(long)res->user_lksb;
 	res32->user_lksb = (__u32)(long)res->user_lksb;
@@ -128,35 +130,30 @@ static void compat_output(struct dlm_lock_result *res,
 }
 }
 #endif
 #endif
 
 
+/* we could possibly check if the cancel of an orphan has resulted in the lkb
+   being removed and then remove that lkb from the orphans list and free it */
 
 
 void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 {
 {
 	struct dlm_ls *ls;
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
 	struct dlm_user_args *ua;
 	struct dlm_user_proc *proc;
 	struct dlm_user_proc *proc;
-	int remove_ownqueue = 0;
+	int eol = 0, ast_type;
 
 
-	/* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
-	   lkb before dealing with it.  We need to check this
-	   flag before taking ls_clear_proc_locks mutex because if
-	   it's set, dlm_clear_proc_locks() holds the mutex. */
-
-	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
-		/* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
 		return;
 		return;
-	}
 
 
 	ls = lkb->lkb_resource->res_ls;
 	ls = lkb->lkb_resource->res_ls;
 	mutex_lock(&ls->ls_clear_proc_locks);
 	mutex_lock(&ls->ls_clear_proc_locks);
 
 
 	/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
 	/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
 	   can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
 	   can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
-	   lkb->ua so we can't try to use it. */
+	   lkb->ua so we can't try to use it.  This second check is necessary
+	   for cases where a completion ast is received for an operation that
+	   began before clear_proc_locks did its cancel/unlock. */
 
 
-	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
-		/* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
 		goto out;
 		goto out;
-	}
 
 
 	DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
 	DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
 	ua = (struct dlm_user_args *)lkb->lkb_astparam;
 	ua = (struct dlm_user_args *)lkb->lkb_astparam;
@@ -166,28 +163,42 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 		goto out;
 		goto out;
 
 
 	spin_lock(&proc->asts_spin);
 	spin_lock(&proc->asts_spin);
-	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+
+	ast_type = lkb->lkb_ast_type;
+	lkb->lkb_ast_type |= type;
+
+	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
-		lkb->lkb_ast_type |= type;
 		wake_up_interruptible(&proc->wait);
 		wake_up_interruptible(&proc->wait);
 	}
 	}
-
-	/* noqueue requests that fail may need to be removed from the
-	   proc's locks list, there should be a better way of detecting
-	   this situation than checking all these things... */
-
-	if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
-	    ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
-		remove_ownqueue = 1;
-
-	/* unlocks or cancels of waiting requests need to be removed from the
-	   proc's unlocking list, again there must be a better way...  */
-
-	if (ua->lksb.sb_status == -DLM_EUNLOCK ||
+	if (type == AST_COMP && (ast_type & AST_COMP))
+		log_debug(ls, "ast overlap %x status %x %x",
+			  lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
+
+	/* Figure out if this lock is at the end of its life and no longer
+	   available for the application to use.  The lkb still exists until
+	   the final ast is read.  A lock becomes EOL in three situations:
+	     1. a noqueue request fails with EAGAIN
+	     2. an unlock completes with EUNLOCK
+	     3. a cancel of a waiting request completes with ECANCEL
+	   An EOL lock needs to be removed from the process's list of locks.
+	   And we can't allow any new operation on an EOL lock.  This is
+	   not related to the lifetime of the lkb struct which is managed
+	   entirely by refcount. */
+
+	if (type == AST_COMP &&
+	    lkb->lkb_grmode == DLM_LOCK_IV &&
+	    ua->lksb.sb_status == -EAGAIN)
+		eol = 1;
+	else if (ua->lksb.sb_status == -DLM_EUNLOCK ||
 	    (ua->lksb.sb_status == -DLM_ECANCEL &&
 	    (ua->lksb.sb_status == -DLM_ECANCEL &&
 	     lkb->lkb_grmode == DLM_LOCK_IV))
 	     lkb->lkb_grmode == DLM_LOCK_IV))
-		remove_ownqueue = 1;
+		eol = 1;
+	if (eol) {
+		lkb->lkb_ast_type &= ~AST_BAST;
+		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
+	}
 
 
 	/* We want to copy the lvb to userspace when the completion
 	/* We want to copy the lvb to userspace when the completion
 	   ast is read if the status is 0, the lock has an lvb and
 	   ast is read if the status is 0, the lock has an lvb and
@@ -204,11 +215,13 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 
 
 	spin_unlock(&proc->asts_spin);
 	spin_unlock(&proc->asts_spin);
 
 
-	if (remove_ownqueue) {
+	if (eol) {
 		spin_lock(&ua->proc->locks_spin);
 		spin_lock(&ua->proc->locks_spin);
-		list_del_init(&lkb->lkb_ownqueue);
+		if (!list_empty(&lkb->lkb_ownqueue)) {
+			list_del_init(&lkb->lkb_ownqueue);
+			dlm_put_lkb(lkb);
+		}
 		spin_unlock(&ua->proc->locks_spin);
 		spin_unlock(&ua->proc->locks_spin);
-		dlm_put_lkb(lkb);
 	}
 	}
  out:
  out:
 	mutex_unlock(&ls->ls_clear_proc_locks);
 	mutex_unlock(&ls->ls_clear_proc_locks);
@@ -286,47 +299,71 @@ static int device_user_unlock(struct dlm_user_proc *proc,
 	return error;
 	return error;
 }
 }
 
 
-static int device_create_lockspace(struct dlm_lspace_params *params)
+static int create_misc_device(struct dlm_ls *ls, char *name)
 {
 {
-	dlm_lockspace_t *lockspace;
-	struct dlm_ls *ls;
 	int error, len;
 	int error, len;
 
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	error = dlm_new_lockspace(params->name, strlen(params->name),
-				  &lockspace, 0, DLM_USER_LVB_LEN);
-	if (error)
-		return error;
-
-	ls = dlm_find_lockspace_local(lockspace);
-	if (!ls)
-		return -ENOENT;
-
 	error = -ENOMEM;
 	error = -ENOMEM;
-	len = strlen(params->name) + strlen(name_prefix) + 2;
+	len = strlen(name) + strlen(name_prefix) + 2;
 	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
 	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
 	if (!ls->ls_device.name)
 	if (!ls->ls_device.name)
 		goto fail;
 		goto fail;
+
 	snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
 	snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
-		 params->name);
+		 name);
 	ls->ls_device.fops = &device_fops;
 	ls->ls_device.fops = &device_fops;
 	ls->ls_device.minor = MISC_DYNAMIC_MINOR;
 	ls->ls_device.minor = MISC_DYNAMIC_MINOR;
 
 
 	error = misc_register(&ls->ls_device);
 	error = misc_register(&ls->ls_device);
 	if (error) {
 	if (error) {
 		kfree(ls->ls_device.name);
 		kfree(ls->ls_device.name);
-		goto fail;
 	}
 	}
+fail:
+	return error;
+}
+
+static int device_user_purge(struct dlm_user_proc *proc,
+			     struct dlm_purge_params *params)
+{
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = dlm_user_purge(ls, proc, params->nodeid, params->pid);
 
 
-	error = ls->ls_device.minor;
 	dlm_put_lockspace(ls);
 	dlm_put_lockspace(ls);
 	return error;
 	return error;
+}
+
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error;
 
 
- fail:
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	error = dlm_new_lockspace(params->name, strlen(params->name),
+				  &lockspace, 0, DLM_USER_LVB_LEN);
+	if (error)
+		return error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = create_misc_device(ls, params->name);
 	dlm_put_lockspace(ls);
 	dlm_put_lockspace(ls);
-	dlm_release_lockspace(lockspace, 0);
+
+	if (error)
+		dlm_release_lockspace(lockspace, 0);
+	else
+		error = ls->ls_device.minor;
+
 	return error;
 	return error;
 }
 }
 
 
@@ -343,6 +380,10 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
 	if (!ls)
 	if (!ls)
 		return -ENOENT;
 		return -ENOENT;
 
 
+	/* Deregister the misc device first, so we don't have
+	 * a device that's not attached to a lockspace. If
+	 * dlm_release_lockspace fails then we can recreate it
+	 */
 	error = misc_deregister(&ls->ls_device);
 	error = misc_deregister(&ls->ls_device);
 	if (error) {
 	if (error) {
 		dlm_put_lockspace(ls);
 		dlm_put_lockspace(ls);
@@ -361,6 +402,8 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
 
 
 	dlm_put_lockspace(ls);
 	dlm_put_lockspace(ls);
 	error = dlm_release_lockspace(lockspace, force);
 	error = dlm_release_lockspace(lockspace, force);
+	if (error)
+		create_misc_device(ls, ls->ls_name);
  out:
  out:
 	return error;
 	return error;
 }
 }
@@ -497,6 +540,14 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 		error = device_remove_lockspace(&kbuf->i.lspace);
 		error = device_remove_lockspace(&kbuf->i.lspace);
 		break;
 		break;
 
 
+	case DLM_USER_PURGE:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_sig;
+		}
+		error = device_user_purge(proc, &kbuf->i.purge);
+		break;
+
 	default:
 	default:
 		log_print("Unknown command passed to DLM device : %d\n",
 		log_print("Unknown command passed to DLM device : %d\n",
 			  kbuf->cmd);
 			  kbuf->cmd);

+ 33 - 5
fs/gfs2/dir.c

@@ -1262,9 +1262,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 			      u64 leaf_no)
 			      u64 leaf_no)
 {
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *bh;
 	struct buffer_head *bh;
 	struct gfs2_leaf *lf;
 	struct gfs2_leaf *lf;
-	unsigned entries = 0;
+	unsigned entries = 0, entries2 = 0;
 	unsigned leaves = 0;
 	unsigned leaves = 0;
 	const struct gfs2_dirent **darr, *dent;
 	const struct gfs2_dirent **darr, *dent;
 	struct dirent_gather g;
 	struct dirent_gather g;
@@ -1290,7 +1291,13 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		return 0;
 		return 0;
 
 
 	error = -ENOMEM;
 	error = -ENOMEM;
-	larr = vmalloc((leaves + entries) * sizeof(void *));
+	/*
+	 * The extra 99 entries are not normally used, but are a buffer
+	 * zone in case the number of entries in the leaf is corrupt.
+	 * 99 is the maximum number of entries that can fit in a single
+	 * leaf block.
+	 */
+	larr = vmalloc((leaves + entries + 99) * sizeof(void *));
 	if (!larr)
 	if (!larr)
 		goto out;
 		goto out;
 	darr = (const struct gfs2_dirent **)(larr + leaves);
 	darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1305,10 +1312,20 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lfn = be64_to_cpu(lf->lf_next);
 		lfn = be64_to_cpu(lf->lf_next);
 		if (lf->lf_entries) {
 		if (lf->lf_entries) {
+			entries2 += be16_to_cpu(lf->lf_entries);
 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 						gfs2_dirent_gather, NULL, &g);
 						gfs2_dirent_gather, NULL, &g);
 			error = PTR_ERR(dent);
 			error = PTR_ERR(dent);
-			if (IS_ERR(dent)) {
+			if (IS_ERR(dent))
+				goto out_kfree;
+			if (entries2 != g.offset) {
+				fs_warn(sdp, "Number of entries corrupt in dir "
+						"leaf %llu, entries2 (%u) != "
+						"g.offset (%u)\n",
+					(unsigned long long)bh->b_blocknr,
+					entries2, g.offset);
+					
+				error = -EIO;
 				goto out_kfree;
 				goto out_kfree;
 			}
 			}
 			error = 0;
 			error = 0;
@@ -1318,6 +1335,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		}
 		}
 	} while(lfn);
 	} while(lfn);
 
 
+	BUG_ON(entries2 != entries);
 	error = do_filldir_main(ip, offset, opaque, filldir, darr,
 	error = do_filldir_main(ip, offset, opaque, filldir, darr,
 				entries, copied);
 				entries, copied);
 out_kfree:
 out_kfree:
@@ -1401,6 +1419,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 		  filldir_t filldir)
 		  filldir_t filldir)
 {
 {
 	struct gfs2_inode *dip = GFS2_I(inode);
 	struct gfs2_inode *dip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct dirent_gather g;
 	struct dirent_gather g;
 	const struct gfs2_dirent **darr, *dent;
 	const struct gfs2_dirent **darr, *dent;
 	struct buffer_head *dibh;
 	struct buffer_head *dibh;
@@ -1423,8 +1442,8 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 		return error;
 		return error;
 
 
 	error = -ENOMEM;
 	error = -ENOMEM;
-	darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
-		       GFP_KERNEL);
+	/* 96 is max number of dirents which can be stuffed into an inode */
+	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL);
 	if (darr) {
 	if (darr) {
 		g.pdent = darr;
 		g.pdent = darr;
 		g.offset = 0;
 		g.offset = 0;
@@ -1434,6 +1453,15 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 			error = PTR_ERR(dent);
 			error = PTR_ERR(dent);
 			goto out;
 			goto out;
 		}
 		}
+		if (dip->i_di.di_entries != g.offset) {
+			fs_warn(sdp, "Number of entries corrupt in dir %llu, "
+				"ip->i_di.di_entries (%u) != g.offset (%u)\n",
+				(unsigned long long)dip->i_num.no_addr,
+				dip->i_di.di_entries,
+				g.offset);
+			error = -EIO;
+			goto out;
+		}
 		error = do_filldir_main(dip, offset, opaque, filldir, darr,
 		error = do_filldir_main(dip, offset, opaque, filldir, darr,
 					dip->i_di.di_entries, &copied);
 					dip->i_di.di_entries, &copied);
 out:
 out:

+ 373 - 246
fs/gfs2/glock.c

@@ -23,6 +23,10 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/rwsem.h>
 #include <linux/rwsem.h>
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
 
 
 #include "gfs2.h"
 #include "gfs2.h"
 #include "incore.h"
 #include "incore.h"
@@ -40,20 +44,30 @@ struct gfs2_gl_hash_bucket {
         struct hlist_head hb_list;
         struct hlist_head hb_list;
 };
 };
 
 
+struct glock_iter {
+	int hash;                     /* hash bucket index         */
+	struct gfs2_sbd *sdp;         /* incore superblock         */
+	struct gfs2_glock *gl;        /* current glock struct      */
+	struct hlist_head *hb_list;   /* current hash bucket ptr   */
+	struct seq_file *seq;         /* sequence file for debugfs */
+	char string[512];             /* scratch space             */
+};
+
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 
 
 static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
 static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct gfs2_glock *gl);
-static int dump_inode(struct gfs2_inode *ip);
-static void gfs2_glock_xmote_th(struct gfs2_holder *gh);
+static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
+static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
 static void gfs2_glock_drop_th(struct gfs2_glock *gl);
 static void gfs2_glock_drop_th(struct gfs2_glock *gl);
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
+static struct dentry *gfs2_root;
 
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
 #define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
 #define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
 
 
 static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
 static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+static struct dentry *gfs2_root;
 
 
 /*
 /*
  * Despite what you might think, the numbers below are not arbitrary :-)
  * Despite what you might think, the numbers below are not arbitrary :-)
@@ -202,7 +216,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 		gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
 		gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
 		gfs2_assert(sdp, list_empty(&gl->gl_holders));
 		gfs2_assert(sdp, list_empty(&gl->gl_holders));
 		gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
 		gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
-		gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
 		gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
 		gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
 		glock_free(gl);
 		glock_free(gl);
 		rv = 1;
 		rv = 1;
@@ -303,7 +316,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	atomic_set(&gl->gl_ref, 1);
 	atomic_set(&gl->gl_ref, 1);
 	gl->gl_state = LM_ST_UNLOCKED;
 	gl->gl_state = LM_ST_UNLOCKED;
 	gl->gl_hash = hash;
 	gl->gl_hash = hash;
-	gl->gl_owner = NULL;
+	gl->gl_owner_pid = 0;
 	gl->gl_ip = 0;
 	gl->gl_ip = 0;
 	gl->gl_ops = glops;
 	gl->gl_ops = glops;
 	gl->gl_req_gh = NULL;
 	gl->gl_req_gh = NULL;
@@ -367,7 +380,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
 	INIT_LIST_HEAD(&gh->gh_list);
 	INIT_LIST_HEAD(&gh->gh_list);
 	gh->gh_gl = gl;
 	gh->gh_gl = gl;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
-	gh->gh_owner = current;
+	gh->gh_owner_pid = current->pid;
 	gh->gh_state = state;
 	gh->gh_state = state;
 	gh->gh_flags = flags;
 	gh->gh_flags = flags;
 	gh->gh_error = 0;
 	gh->gh_error = 0;
@@ -389,7 +402,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
 {
 {
 	gh->gh_state = state;
 	gh->gh_state = state;
 	gh->gh_flags = flags;
 	gh->gh_flags = flags;
-	gh->gh_iflags &= 1 << HIF_ALLOCED;
+	gh->gh_iflags = 0;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
 }
 }
 
 
@@ -406,54 +419,8 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
 	gh->gh_ip = 0;
 	gh->gh_ip = 0;
 }
 }
 
 
-/**
- * gfs2_holder_get - get a struct gfs2_holder structure
- * @gl: the glock
- * @state: the state we're requesting
- * @flags: the modifier flags
- * @gfp_flags:
- *
- * Figure out how big an impact this function has.  Either:
- * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
- * 2) Leave it like it is
- *
- * Returns: the holder structure, NULL on ENOMEM
- */
-
-static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
-					   unsigned int state,
-					   int flags, gfp_t gfp_flags)
-{
-	struct gfs2_holder *gh;
-
-	gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
-	if (!gh)
-		return NULL;
-
-	gfs2_holder_init(gl, state, flags, gh);
-	set_bit(HIF_ALLOCED, &gh->gh_iflags);
-	gh->gh_ip = (unsigned long)__builtin_return_address(0);
-	return gh;
-}
-
-/**
- * gfs2_holder_put - get rid of a struct gfs2_holder structure
- * @gh: the holder structure
- *
- */
-
-static void gfs2_holder_put(struct gfs2_holder *gh)
+static void gfs2_holder_wake(struct gfs2_holder *gh)
 {
 {
-	gfs2_holder_uninit(gh);
-	kfree(gh);
-}
-
-static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh)
-{
-	if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) {
-		gfs2_holder_put(gh);
-		return;
-	}
 	clear_bit(HIF_WAIT, &gh->gh_iflags);
 	clear_bit(HIF_WAIT, &gh->gh_iflags);
 	smp_mb();
 	smp_mb();
 	wake_up_bit(&gh->gh_iflags, HIF_WAIT);
 	wake_up_bit(&gh->gh_iflags, HIF_WAIT);
@@ -519,7 +486,7 @@ static int rq_promote(struct gfs2_holder *gh)
 				gfs2_reclaim_glock(sdp);
 				gfs2_reclaim_glock(sdp);
 			}
 			}
 
 
-			gfs2_glock_xmote_th(gh);
+			gfs2_glock_xmote_th(gh->gh_gl, gh);
 			spin_lock(&gl->gl_spin);
 			spin_lock(&gl->gl_spin);
 		}
 		}
 		return 1;
 		return 1;
@@ -542,7 +509,7 @@ static int rq_promote(struct gfs2_holder *gh)
 	gh->gh_error = 0;
 	gh->gh_error = 0;
 	set_bit(HIF_HOLDER, &gh->gh_iflags);
 	set_bit(HIF_HOLDER, &gh->gh_iflags);
 
 
-	gfs2_holder_dispose_or_wake(gh);
+	gfs2_holder_wake(gh);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -554,32 +521,24 @@ static int rq_promote(struct gfs2_holder *gh)
  * Returns: 1 if the queue is blocked
  * Returns: 1 if the queue is blocked
  */
  */
 
 
-static int rq_demote(struct gfs2_holder *gh)
+static int rq_demote(struct gfs2_glock *gl)
 {
 {
-	struct gfs2_glock *gl = gh->gh_gl;
-
 	if (!list_empty(&gl->gl_holders))
 	if (!list_empty(&gl->gl_holders))
 		return 1;
 		return 1;
 
 
-	if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
-		list_del_init(&gh->gh_list);
-		gh->gh_error = 0;
-		spin_unlock(&gl->gl_spin);
-		gfs2_holder_dispose_or_wake(gh);
-		spin_lock(&gl->gl_spin);
-	} else {
-		gl->gl_req_gh = gh;
-		set_bit(GLF_LOCK, &gl->gl_flags);
-		spin_unlock(&gl->gl_spin);
-
-		if (gh->gh_state == LM_ST_UNLOCKED ||
-		    gl->gl_state != LM_ST_EXCLUSIVE)
-			gfs2_glock_drop_th(gl);
-		else
-			gfs2_glock_xmote_th(gh);
-
-		spin_lock(&gl->gl_spin);
+	if (gl->gl_state == gl->gl_demote_state ||
+	    gl->gl_state == LM_ST_UNLOCKED) {
+		clear_bit(GLF_DEMOTE, &gl->gl_flags);
+		return 0;
 	}
 	}
+	set_bit(GLF_LOCK, &gl->gl_flags);
+	spin_unlock(&gl->gl_spin);
+	if (gl->gl_demote_state == LM_ST_UNLOCKED ||
+	    gl->gl_state != LM_ST_EXCLUSIVE)
+		gfs2_glock_drop_th(gl);
+	else
+		gfs2_glock_xmote_th(gl, NULL);
+	spin_lock(&gl->gl_spin);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -607,16 +566,8 @@ static void run_queue(struct gfs2_glock *gl)
 			else
 			else
 				gfs2_assert_warn(gl->gl_sbd, 0);
 				gfs2_assert_warn(gl->gl_sbd, 0);
 
 
-		} else if (!list_empty(&gl->gl_waiters2) &&
-			   !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
-			gh = list_entry(gl->gl_waiters2.next,
-					struct gfs2_holder, gh_list);
-
-			if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
-				blocked = rq_demote(gh);
-			else
-				gfs2_assert_warn(gl->gl_sbd, 0);
-
+		} else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+			blocked = rq_demote(gl);
 		} else if (!list_empty(&gl->gl_waiters3)) {
 		} else if (!list_empty(&gl->gl_waiters3)) {
 			gh = list_entry(gl->gl_waiters3.next,
 			gh = list_entry(gl->gl_waiters3.next,
 					struct gfs2_holder, gh_list);
 					struct gfs2_holder, gh_list);
@@ -654,7 +605,7 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 		list_add_tail(&gh.gh_list, &gl->gl_waiters1);
 		list_add_tail(&gh.gh_list, &gl->gl_waiters1);
 	} else {
 	} else {
-		gl->gl_owner = current;
+		gl->gl_owner_pid = current->pid;
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
 		clear_bit(HIF_WAIT, &gh.gh_iflags);
 		clear_bit(HIF_WAIT, &gh.gh_iflags);
 		smp_mb();
 		smp_mb();
@@ -681,7 +632,7 @@ static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 		acquired = 0;
 		acquired = 0;
 	} else {
 	} else {
-		gl->gl_owner = current;
+		gl->gl_owner_pid = current->pid;
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
 	}
 	}
 	spin_unlock(&gl->gl_spin);
 	spin_unlock(&gl->gl_spin);
@@ -699,7 +650,7 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
 {
 {
 	spin_lock(&gl->gl_spin);
 	spin_lock(&gl->gl_spin);
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	gl->gl_owner = NULL;
+	gl->gl_owner_pid = 0;
 	gl->gl_ip = 0;
 	gl->gl_ip = 0;
 	run_queue(gl);
 	run_queue(gl);
 	BUG_ON(!spin_is_locked(&gl->gl_spin));
 	BUG_ON(!spin_is_locked(&gl->gl_spin));
@@ -707,50 +658,24 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
 }
 }
 
 
 /**
 /**
- * handle_callback - add a demote request to a lock's queue
+ * handle_callback - process a demote request
  * @gl: the glock
  * @gl: the glock
  * @state: the state the caller wants us to change to
  * @state: the state the caller wants us to change to
  *
  *
- * Note: This may fail sliently if we are out of memory.
+ * There are only two requests that we are going to see in actual
+ * practise: LM_ST_SHARED and LM_ST_UNLOCKED
  */
  */
 
 
 static void handle_callback(struct gfs2_glock *gl, unsigned int state)
 static void handle_callback(struct gfs2_glock *gl, unsigned int state)
 {
 {
-	struct gfs2_holder *gh, *new_gh = NULL;
-
-restart:
 	spin_lock(&gl->gl_spin);
 	spin_lock(&gl->gl_spin);
-
-	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
-		if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
-		    gl->gl_req_gh != gh) {
-			if (gh->gh_state != state)
-				gh->gh_state = LM_ST_UNLOCKED;
-			goto out;
-		}
-	}
-
-	if (new_gh) {
-		list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
-		new_gh = NULL;
-	} else {
-		spin_unlock(&gl->gl_spin);
-
-		new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_NOFS);
-		if (!new_gh)
-			return;
-		set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
-		set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
-		set_bit(HIF_WAIT, &new_gh->gh_iflags);
-
-		goto restart;
+	if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) {
+		gl->gl_demote_state = state;
+		gl->gl_demote_time = jiffies;
+	} else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
+		gl->gl_demote_state = state;
 	}
 	}
-
-out:
 	spin_unlock(&gl->gl_spin);
 	spin_unlock(&gl->gl_spin);
-
-	if (new_gh)
-		gfs2_holder_put(new_gh);
 }
 }
 
 
 /**
 /**
@@ -810,56 +735,37 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 
 
 	/*  Deal with each possible exit condition  */
 	/*  Deal with each possible exit condition  */
 
 
-	if (!gh)
+	if (!gh) {
 		gl->gl_stamp = jiffies;
 		gl->gl_stamp = jiffies;
-	else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+		if (ret & LM_OUT_CANCELED)
+			op_done = 0;
+		else
+			clear_bit(GLF_DEMOTE, &gl->gl_flags);
+	} else {
 		spin_lock(&gl->gl_spin);
 		spin_lock(&gl->gl_spin);
 		list_del_init(&gh->gh_list);
 		list_del_init(&gh->gh_list);
 		gh->gh_error = -EIO;
 		gh->gh_error = -EIO;
-		spin_unlock(&gl->gl_spin);
-	} else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
-		if (gl->gl_state == gh->gh_state ||
-		    gl->gl_state == LM_ST_UNLOCKED) {
+		if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 
+			goto out;
+		gh->gh_error = GLR_CANCELED;
+		if (ret & LM_OUT_CANCELED) 
+			goto out;
+		if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+			list_add_tail(&gh->gh_list, &gl->gl_holders);
 			gh->gh_error = 0;
 			gh->gh_error = 0;
-		} else {
-			if (gfs2_assert_warn(sdp, gh->gh_flags &
-					(LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
-				fs_warn(sdp, "ret = 0x%.8X\n", ret);
-			gh->gh_error = GLR_TRYFAILED;
+			set_bit(HIF_HOLDER, &gh->gh_iflags);
+			set_bit(HIF_FIRST, &gh->gh_iflags);
+			op_done = 0;
+			goto out;
 		}
 		}
-		spin_unlock(&gl->gl_spin);
-
-		if (ret & LM_OUT_CANCELED)
-			handle_callback(gl, LM_ST_UNLOCKED);
-
-	} else if (ret & LM_OUT_CANCELED) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
-		gh->gh_error = GLR_CANCELED;
-		spin_unlock(&gl->gl_spin);
-
-	} else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
-		spin_lock(&gl->gl_spin);
-		list_move_tail(&gh->gh_list, &gl->gl_holders);
-		gh->gh_error = 0;
-		set_bit(HIF_HOLDER, &gh->gh_iflags);
-		spin_unlock(&gl->gl_spin);
-
-		set_bit(HIF_FIRST, &gh->gh_iflags);
-
-		op_done = 0;
-
-	} else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
 		gh->gh_error = GLR_TRYFAILED;
 		gh->gh_error = GLR_TRYFAILED;
-		spin_unlock(&gl->gl_spin);
-
-	} else {
+		if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+			goto out;
+		gh->gh_error = -EINVAL;
 		if (gfs2_assert_withdraw(sdp, 0) == -1)
 		if (gfs2_assert_withdraw(sdp, 0) == -1)
 			fs_err(sdp, "ret = 0x%.8X\n", ret);
 			fs_err(sdp, "ret = 0x%.8X\n", ret);
+out:
+		spin_unlock(&gl->gl_spin);
 	}
 	}
 
 
 	if (glops->go_xmote_bh)
 	if (glops->go_xmote_bh)
@@ -877,7 +783,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_glock_put(gl);
 	gfs2_glock_put(gl);
 
 
 	if (gh)
 	if (gh)
-		gfs2_holder_dispose_or_wake(gh);
+		gfs2_holder_wake(gh);
 }
 }
 
 
 /**
 /**
@@ -888,12 +794,11 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
  *
  *
  */
  */
 
 
-void gfs2_glock_xmote_th(struct gfs2_holder *gh)
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
 {
-	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	int flags = gh->gh_flags;
-	unsigned state = gh->gh_state;
+	int flags = gh ? gh->gh_flags : 0;
+	unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
 	int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
 				 LM_FLAG_NOEXP | LM_FLAG_ANY |
 				 LM_FLAG_NOEXP | LM_FLAG_ANY |
@@ -943,6 +848,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_assert_warn(sdp, !ret);
 	gfs2_assert_warn(sdp, !ret);
 
 
 	state_change(gl, LM_ST_UNLOCKED);
 	state_change(gl, LM_ST_UNLOCKED);
+	clear_bit(GLF_DEMOTE, &gl->gl_flags);
 
 
 	if (glops->go_inval)
 	if (glops->go_inval)
 		glops->go_inval(gl, DIO_METADATA);
 		glops->go_inval(gl, DIO_METADATA);
@@ -964,7 +870,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_glock_put(gl);
 	gfs2_glock_put(gl);
 
 
 	if (gh)
 	if (gh)
-		gfs2_holder_dispose_or_wake(gh);
+		gfs2_holder_wake(gh);
 }
 }
 
 
 /**
 /**
@@ -1097,18 +1003,32 @@ static int glock_wait_internal(struct gfs2_holder *gh)
 }
 }
 
 
 static inline struct gfs2_holder *
 static inline struct gfs2_holder *
-find_holder_by_owner(struct list_head *head, struct task_struct *owner)
+find_holder_by_owner(struct list_head *head, pid_t pid)
 {
 {
 	struct gfs2_holder *gh;
 	struct gfs2_holder *gh;
 
 
 	list_for_each_entry(gh, head, gh_list) {
 	list_for_each_entry(gh, head, gh_list) {
-		if (gh->gh_owner == owner)
+		if (gh->gh_owner_pid == pid)
 			return gh;
 			return gh;
 	}
 	}
 
 
 	return NULL;
 	return NULL;
 }
 }
 
 
+static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	if (gi) {
+		vsprintf(gi->string, fmt, args);
+		seq_printf(gi->seq, gi->string);
+	}
+	else
+		vprintk(fmt, args);
+	va_end(args);
+}
+
 /**
 /**
  * add_to_queue - Add a holder to the wait queue (but look for recursion)
  * add_to_queue - Add a holder to the wait queue (but look for recursion)
  * @gh: the holder structure to add
  * @gh: the holder structure to add
@@ -1120,24 +1040,24 @@ static void add_to_queue(struct gfs2_holder *gh)
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_holder *existing;
 	struct gfs2_holder *existing;
 
 
-	BUG_ON(!gh->gh_owner);
+	BUG_ON(!gh->gh_owner_pid);
 	if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
 	if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
 		BUG();
 		BUG();
 
 
-	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
+	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner_pid);
 	if (existing) {
 	if (existing) {
 		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
 		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
-		printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
+		printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
 		printk(KERN_INFO "lock type : %d lock state : %d\n",
 		printk(KERN_INFO "lock type : %d lock state : %d\n",
 				existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
 				existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
 		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
 		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
-		printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
+		printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
 		printk(KERN_INFO "lock type : %d lock state : %d\n",
 		printk(KERN_INFO "lock type : %d lock state : %d\n",
 				gl->gl_name.ln_type, gl->gl_state);
 				gl->gl_name.ln_type, gl->gl_state);
 		BUG();
 		BUG();
 	}
 	}
 
 
-	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
+	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner_pid);
 	if (existing) {
 	if (existing) {
 		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
 		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
 		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
 		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
@@ -1267,9 +1187,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		if (glops->go_unlock)
 		if (glops->go_unlock)
 			glops->go_unlock(gh);
 			glops->go_unlock(gh);
 
 
-		gl->gl_stamp = jiffies;
-
 		spin_lock(&gl->gl_spin);
 		spin_lock(&gl->gl_spin);
+		gl->gl_stamp = jiffies;
 	}
 	}
 
 
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 	clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1841,6 +1760,15 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
  *  Diagnostic routines to help debug distributed deadlock
  *  Diagnostic routines to help debug distributed deadlock
  */
  */
 
 
+static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
+                              unsigned long address)
+{
+	char buffer[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(buffer, address);
+	print_dbg(gi, fmt, buffer);
+}
+
 /**
 /**
  * dump_holder - print information about a glock holder
  * dump_holder - print information about a glock holder
  * @str: a string naming the type of holder
  * @str: a string naming the type of holder
@@ -1849,31 +1777,37 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
  */
 
 
-static int dump_holder(char *str, struct gfs2_holder *gh)
+static int dump_holder(struct glock_iter *gi, char *str,
+		       struct gfs2_holder *gh)
 {
 {
 	unsigned int x;
 	unsigned int x;
-	int error = -ENOBUFS;
-
-	printk(KERN_INFO "  %s\n", str);
-	printk(KERN_INFO "    owner = %ld\n",
-		   (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
-	printk(KERN_INFO "    gh_state = %u\n", gh->gh_state);
-	printk(KERN_INFO "    gh_flags =");
+	struct task_struct *gh_owner;
+
+	print_dbg(gi, "  %s\n", str);
+	if (gh->gh_owner_pid) {
+		print_dbg(gi, "    owner = %ld ", (long)gh->gh_owner_pid);
+		gh_owner = find_task_by_pid(gh->gh_owner_pid);
+		if (gh_owner)
+			print_dbg(gi, "(%s)\n", gh_owner->comm);
+		else
+			print_dbg(gi, "(ended)\n");
+	} else
+		print_dbg(gi, "    owner = -1\n");
+	print_dbg(gi, "    gh_state = %u\n", gh->gh_state);
+	print_dbg(gi, "    gh_flags =");
 	for (x = 0; x < 32; x++)
 	for (x = 0; x < 32; x++)
 		if (gh->gh_flags & (1 << x))
 		if (gh->gh_flags & (1 << x))
-			printk(" %u", x);
-	printk(" \n");
-	printk(KERN_INFO "    error = %d\n", gh->gh_error);
-	printk(KERN_INFO "    gh_iflags =");
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+	print_dbg(gi, "    error = %d\n", gh->gh_error);
+	print_dbg(gi, "    gh_iflags =");
 	for (x = 0; x < 32; x++)
 	for (x = 0; x < 32; x++)
 		if (test_bit(x, &gh->gh_iflags))
 		if (test_bit(x, &gh->gh_iflags))
-			printk(" %u", x);
-	printk(" \n");
-	print_symbol(KERN_INFO "    initialized at: %s\n", gh->gh_ip);
-
-	error = 0;
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+        gfs2_print_symbol(gi, "    initialized at: %s\n", gh->gh_ip);
 
 
-	return error;
+	return 0;
 }
 }
 
 
 /**
 /**
@@ -1883,25 +1817,20 @@ static int dump_holder(char *str, struct gfs2_holder *gh)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
  */
 
 
-static int dump_inode(struct gfs2_inode *ip)
+static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
 {
 {
 	unsigned int x;
 	unsigned int x;
-	int error = -ENOBUFS;
 
 
-	printk(KERN_INFO "  Inode:\n");
-	printk(KERN_INFO "    num = %llu %llu\n",
-		    (unsigned long long)ip->i_num.no_formal_ino,
-		    (unsigned long long)ip->i_num.no_addr);
-	printk(KERN_INFO "    type = %u\n", IF2DT(ip->i_inode.i_mode));
-	printk(KERN_INFO "    i_flags =");
+	print_dbg(gi, "  Inode:\n");
+	print_dbg(gi, "    num = %llu/%llu\n",
+		    ip->i_num.no_formal_ino, ip->i_num.no_addr);
+	print_dbg(gi, "    type = %u\n", IF2DT(ip->i_inode.i_mode));
+	print_dbg(gi, "    i_flags =");
 	for (x = 0; x < 32; x++)
 	for (x = 0; x < 32; x++)
 		if (test_bit(x, &ip->i_flags))
 		if (test_bit(x, &ip->i_flags))
-			printk(" %u", x);
-	printk(" \n");
-
-	error = 0;
-
-	return error;
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+	return 0;
 }
 }
 
 
 /**
 /**
@@ -1912,74 +1841,86 @@ static int dump_inode(struct gfs2_inode *ip)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
  */
 
 
-static int dump_glock(struct gfs2_glock *gl)
+static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
 {
 {
 	struct gfs2_holder *gh;
 	struct gfs2_holder *gh;
 	unsigned int x;
 	unsigned int x;
 	int error = -ENOBUFS;
 	int error = -ENOBUFS;
+	struct task_struct *gl_owner;
 
 
 	spin_lock(&gl->gl_spin);
 	spin_lock(&gl->gl_spin);
 
 
-	printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
-	       (unsigned long long)gl->gl_name.ln_number);
-	printk(KERN_INFO "  gl_flags =");
+	print_dbg(gi, "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
+		   (unsigned long long)gl->gl_name.ln_number);
+	print_dbg(gi, "  gl_flags =");
 	for (x = 0; x < 32; x++) {
 	for (x = 0; x < 32; x++) {
 		if (test_bit(x, &gl->gl_flags))
 		if (test_bit(x, &gl->gl_flags))
-			printk(" %u", x);
+			print_dbg(gi, " %u", x);
 	}
 	}
-	printk(" \n");
-	printk(KERN_INFO "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
-	printk(KERN_INFO "  gl_state = %u\n", gl->gl_state);
-	printk(KERN_INFO "  gl_owner = %s\n", gl->gl_owner->comm);
-	print_symbol(KERN_INFO "  gl_ip = %s\n", gl->gl_ip);
-	printk(KERN_INFO "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
-	printk(KERN_INFO "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
-	printk(KERN_INFO "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
-	printk(KERN_INFO "  object = %s\n", (gl->gl_object) ? "yes" : "no");
-	printk(KERN_INFO "  le = %s\n",
+	if (!test_bit(GLF_LOCK, &gl->gl_flags))
+		print_dbg(gi, " (unlocked)");
+	print_dbg(gi, " \n");
+	print_dbg(gi, "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
+	print_dbg(gi, "  gl_state = %u\n", gl->gl_state);
+	if (gl->gl_owner_pid) {
+		gl_owner = find_task_by_pid(gl->gl_owner_pid);
+		if (gl_owner)
+			print_dbg(gi, "  gl_owner = pid %d (%s)\n",
+				  gl->gl_owner_pid, gl_owner->comm);
+		else
+			print_dbg(gi, "  gl_owner = %d (ended)\n",
+				  gl->gl_owner_pid);
+	} else
+		print_dbg(gi, "  gl_owner = -1\n");
+	print_dbg(gi, "  gl_ip = %lu\n", gl->gl_ip);
+	print_dbg(gi, "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
+	print_dbg(gi, "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
+	print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
+	print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
+	print_dbg(gi, "  le = %s\n",
 		   (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
 		   (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
-	printk(KERN_INFO "  reclaim = %s\n",
-		    (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
+	print_dbg(gi, "  reclaim = %s\n",
+		   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
 	if (gl->gl_aspace)
 	if (gl->gl_aspace)
-		printk(KERN_INFO "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
-		       gl->gl_aspace->i_mapping->nrpages);
+		print_dbg(gi, "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
+			   gl->gl_aspace->i_mapping->nrpages);
 	else
 	else
-		printk(KERN_INFO "  aspace = no\n");
-	printk(KERN_INFO "  ail = %d\n", atomic_read(&gl->gl_ail_count));
+		print_dbg(gi, "  aspace = no\n");
+	print_dbg(gi, "  ail = %d\n", atomic_read(&gl->gl_ail_count));
 	if (gl->gl_req_gh) {
 	if (gl->gl_req_gh) {
-		error = dump_holder("Request", gl->gl_req_gh);
+		error = dump_holder(gi, "Request", gl->gl_req_gh);
 		if (error)
 		if (error)
 			goto out;
 			goto out;
 	}
 	}
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-		error = dump_holder("Holder", gh);
+		error = dump_holder(gi, "Holder", gh);
 		if (error)
 		if (error)
 			goto out;
 			goto out;
 	}
 	}
 	list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
 	list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
-		error = dump_holder("Waiter1", gh);
-		if (error)
-			goto out;
-	}
-	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
-		error = dump_holder("Waiter2", gh);
+		error = dump_holder(gi, "Waiter1", gh);
 		if (error)
 		if (error)
 			goto out;
 			goto out;
 	}
 	}
 	list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
 	list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
-		error = dump_holder("Waiter3", gh);
+		error = dump_holder(gi, "Waiter3", gh);
 		if (error)
 		if (error)
 			goto out;
 			goto out;
 	}
 	}
+	if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+		print_dbg(gi, "  Demotion req to state %u (%llu uS ago)\n",
+			  gl->gl_demote_state,
+			  (u64)(jiffies - gl->gl_demote_time)*(1000000/HZ));
+	}
 	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
 	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
 		if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
 		if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
-		    list_empty(&gl->gl_holders)) {
-			error = dump_inode(gl->gl_object);
+			list_empty(&gl->gl_holders)) {
+			error = dump_inode(gi, gl->gl_object);
 			if (error)
 			if (error)
 				goto out;
 				goto out;
 		} else {
 		} else {
 			error = -ENOBUFS;
 			error = -ENOBUFS;
-			printk(KERN_INFO "  Inode: busy\n");
+			print_dbg(gi, "  Inode: busy\n");
 		}
 		}
 	}
 	}
 
 
@@ -2014,7 +1955,7 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
 			if (gl->gl_sbd != sdp)
 			if (gl->gl_sbd != sdp)
 				continue;
 				continue;
 
 
-			error = dump_glock(gl);
+			error = dump_glock(NULL, gl);
 			if (error)
 			if (error)
 				break;
 				break;
 		}
 		}
@@ -2043,3 +1984,189 @@ int __init gfs2_glock_init(void)
 	return 0;
 	return 0;
 }
 }
 
 
+static int gfs2_glock_iter_next(struct glock_iter *gi)
+{
+	read_lock(gl_lock_addr(gi->hash));
+	while (1) {
+		if (!gi->hb_list) {  /* If we don't have a hash bucket yet */
+			gi->hb_list = &gl_hash_table[gi->hash].hb_list;
+			if (hlist_empty(gi->hb_list)) {
+				read_unlock(gl_lock_addr(gi->hash));
+				gi->hash++;
+				read_lock(gl_lock_addr(gi->hash));
+				gi->hb_list = NULL;
+				if (gi->hash >= GFS2_GL_HASH_SIZE) {
+					read_unlock(gl_lock_addr(gi->hash));
+					return 1;
+				}
+				else
+					continue;
+			}
+			if (!hlist_empty(gi->hb_list)) {
+				gi->gl = list_entry(gi->hb_list->first,
+						    struct gfs2_glock,
+						    gl_list);
+			}
+		} else {
+			if (gi->gl->gl_list.next == NULL) {
+				read_unlock(gl_lock_addr(gi->hash));
+				gi->hash++;
+				read_lock(gl_lock_addr(gi->hash));
+				gi->hb_list = NULL;
+				continue;
+			}
+			gi->gl = list_entry(gi->gl->gl_list.next,
+					    struct gfs2_glock, gl_list);
+		}
+		if (gi->gl)
+			break;
+	}
+	read_unlock(gl_lock_addr(gi->hash));
+	return 0;
+}
+
+static void gfs2_glock_iter_free(struct glock_iter *gi)
+{
+	kfree(gi);
+}
+
+static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
+{
+	struct glock_iter *gi;
+
+	gi = kmalloc(sizeof (*gi), GFP_KERNEL);
+	if (!gi)
+		return NULL;
+
+	gi->sdp = sdp;
+	gi->hash = 0;
+	gi->gl = NULL;
+	gi->hb_list = NULL;
+	gi->seq = NULL;
+	memset(gi->string, 0, sizeof(gi->string));
+
+	if (gfs2_glock_iter_next(gi)) {
+		gfs2_glock_iter_free(gi);
+		return NULL;
+	}
+
+	return gi;
+}
+
+static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct glock_iter *gi;
+	loff_t n = *pos;
+
+	gi = gfs2_glock_iter_init(file->private);
+	if (!gi)
+		return NULL;
+
+	while (n--) {
+		if (gfs2_glock_iter_next(gi)) {
+			gfs2_glock_iter_free(gi);
+			return NULL;
+		}
+	}
+
+	return gi;
+}
+
+static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+				 loff_t *pos)
+{
+	struct glock_iter *gi = iter_ptr;
+
+	(*pos)++;
+
+	if (gfs2_glock_iter_next(gi)) {
+		gfs2_glock_iter_free(gi);
+		return NULL;
+	}
+
+	return gi;
+}
+
+static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct glock_iter *gi = iter_ptr;
+
+	gi->seq = file;
+	dump_glock(gi, gi->gl);
+
+	return 0;
+}
+
+static struct seq_operations gfs2_glock_seq_ops = {
+	.start = gfs2_glock_seq_start,
+	.next  = gfs2_glock_seq_next,
+	.stop  = gfs2_glock_seq_stop,
+	.show  = gfs2_glock_seq_show,
+};
+
+static int gfs2_debugfs_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &gfs2_glock_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static const struct file_operations gfs2_debug_fops = {
+	.owner   = THIS_MODULE,
+	.open    = gfs2_debugfs_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
+{
+	sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
+	if (!sdp->debugfs_dir)
+		return -ENOMEM;
+	sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
+							 S_IFREG | S_IRUGO,
+							 sdp->debugfs_dir, sdp,
+							 &gfs2_debug_fops);
+	if (!sdp->debugfs_dentry_glocks)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
+{
+	if (sdp && sdp->debugfs_dir) {
+		if (sdp->debugfs_dentry_glocks) {
+			debugfs_remove(sdp->debugfs_dentry_glocks);
+			sdp->debugfs_dentry_glocks = NULL;
+		}
+		debugfs_remove(sdp->debugfs_dir);
+		sdp->debugfs_dir = NULL;
+	}
+}
+
+int gfs2_register_debugfs(void)
+{
+	gfs2_root = debugfs_create_dir("gfs2", NULL);
+	return gfs2_root ? 0 : -ENOMEM;
+}
+
+void gfs2_unregister_debugfs(void)
+{
+	debugfs_remove(gfs2_root);
+	gfs2_root = NULL;
+}

+ 6 - 2
fs/gfs2/glock.h

@@ -38,7 +38,7 @@ static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 	/* Look in glock's list of holders for one with current task as owner */
 	/* Look in glock's list of holders for one with current task as owner */
 	spin_lock(&gl->gl_spin);
 	spin_lock(&gl->gl_spin);
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-		if (gh->gh_owner == current) {
+		if (gh->gh_owner_pid == current->pid) {
 			locked = 1;
 			locked = 1;
 			break;
 			break;
 		}
 		}
@@ -67,7 +67,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 {
 {
 	int ret;
 	int ret;
 	spin_lock(&gl->gl_spin);
 	spin_lock(&gl->gl_spin);
-	ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
+	ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
 	spin_unlock(&gl->gl_spin);
 	spin_unlock(&gl->gl_spin);
 	return ret;
 	return ret;
 }
 }
@@ -135,5 +135,9 @@ void gfs2_scand_internal(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
 
 
 int __init gfs2_glock_init(void);
 int __init gfs2_glock_init(void);
+int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+int gfs2_register_debugfs(void);
+void gfs2_unregister_debugfs(void);
 
 
 #endif /* __GLOCK_DOT_H__ */
 #endif /* __GLOCK_DOT_H__ */

+ 7 - 7
fs/gfs2/incore.h

@@ -115,11 +115,8 @@ enum {
 	/* Actions */
 	/* Actions */
 	HIF_MUTEX		= 0,
 	HIF_MUTEX		= 0,
 	HIF_PROMOTE		= 1,
 	HIF_PROMOTE		= 1,
-	HIF_DEMOTE		= 2,
 
 
 	/* States */
 	/* States */
-	HIF_ALLOCED		= 4,
-	HIF_DEALLOC		= 5,
 	HIF_HOLDER		= 6,
 	HIF_HOLDER		= 6,
 	HIF_FIRST		= 7,
 	HIF_FIRST		= 7,
 	HIF_ABORTED		= 9,
 	HIF_ABORTED		= 9,
@@ -130,7 +127,7 @@ struct gfs2_holder {
 	struct list_head gh_list;
 	struct list_head gh_list;
 
 
 	struct gfs2_glock *gh_gl;
 	struct gfs2_glock *gh_gl;
-	struct task_struct *gh_owner;
+	pid_t gh_owner_pid;
 	unsigned int gh_state;
 	unsigned int gh_state;
 	unsigned gh_flags;
 	unsigned gh_flags;
 
 
@@ -142,8 +139,8 @@ struct gfs2_holder {
 enum {
 enum {
 	GLF_LOCK		= 1,
 	GLF_LOCK		= 1,
 	GLF_STICKY		= 2,
 	GLF_STICKY		= 2,
+	GLF_DEMOTE		= 3,
 	GLF_DIRTY		= 5,
 	GLF_DIRTY		= 5,
-	GLF_SKIP_WAITERS2	= 6,
 };
 };
 
 
 struct gfs2_glock {
 struct gfs2_glock {
@@ -156,11 +153,12 @@ struct gfs2_glock {
 
 
 	unsigned int gl_state;
 	unsigned int gl_state;
 	unsigned int gl_hash;
 	unsigned int gl_hash;
-	struct task_struct *gl_owner;
+	unsigned int gl_demote_state; /* state requested by remote node */
+	unsigned long gl_demote_time; /* time of first demote request */
+	pid_t gl_owner_pid;
 	unsigned long gl_ip;
 	unsigned long gl_ip;
 	struct list_head gl_holders;
 	struct list_head gl_holders;
 	struct list_head gl_waiters1;	/* HIF_MUTEX */
 	struct list_head gl_waiters1;	/* HIF_MUTEX */
-	struct list_head gl_waiters2;	/* HIF_DEMOTE */
 	struct list_head gl_waiters3;	/* HIF_PROMOTE */
 	struct list_head gl_waiters3;	/* HIF_PROMOTE */
 
 
 	const struct gfs2_glock_operations *gl_ops;
 	const struct gfs2_glock_operations *gl_ops;
@@ -611,6 +609,8 @@ struct gfs2_sbd {
 
 
 	unsigned long sd_last_warning;
 	unsigned long sd_last_warning;
 	struct vfsmount *sd_gfs2mnt;
 	struct vfsmount *sd_gfs2mnt;
+	struct dentry *debugfs_dir;    /* debugfs directory */
+	struct dentry *debugfs_dentry_glocks; /* for debugfs */
 };
 };
 
 
 #endif /* __INCORE_DOT_H__ */
 #endif /* __INCORE_DOT_H__ */

+ 6 - 8
fs/gfs2/locking/dlm/lock.c

@@ -151,7 +151,7 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
 
 
 /* make_strname - convert GFS lock numbers to a string */
 /* make_strname - convert GFS lock numbers to a string */
 
 
-static inline void make_strname(struct lm_lockname *lockname,
+static inline void make_strname(const struct lm_lockname *lockname,
 				struct gdlm_strname *str)
 				struct gdlm_strname *str)
 {
 {
 	sprintf(str->name, "%8x%16llx", lockname->ln_type,
 	sprintf(str->name, "%8x%16llx", lockname->ln_type,
@@ -169,6 +169,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
 		return -ENOMEM;
 		return -ENOMEM;
 
 
 	lp->lockname = *name;
 	lp->lockname = *name;
+	make_strname(name, &lp->strname);
 	lp->ls = ls;
 	lp->ls = ls;
 	lp->cur = DLM_LOCK_IV;
 	lp->cur = DLM_LOCK_IV;
 	lp->lvb = NULL;
 	lp->lvb = NULL;
@@ -227,7 +228,6 @@ void gdlm_put_lock(void *lock)
 unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 {
 {
 	struct gdlm_ls *ls = lp->ls;
 	struct gdlm_ls *ls = lp->ls;
-	struct gdlm_strname str;
 	int error, bast = 1;
 	int error, bast = 1;
 
 
 	/*
 	/*
@@ -249,8 +249,6 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 	if (test_bit(LFL_NOBAST, &lp->flags))
 	if (test_bit(LFL_NOBAST, &lp->flags))
 		bast = 0;
 		bast = 0;
 
 
-	make_strname(&lp->lockname, &str);
-
 	set_bit(LFL_ACTIVE, &lp->flags);
 	set_bit(LFL_ACTIVE, &lp->flags);
 
 
 	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
 	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
@@ -258,8 +256,8 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 		  lp->cur, lp->req, lp->lkf);
 		  lp->cur, lp->req, lp->lkf);
 
 
 	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
 	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
-			 str.name, str.namelen, 0, gdlm_ast, lp,
-			 bast ? gdlm_bast : NULL);
+			 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
+			 lp, bast ? gdlm_bast : NULL);
 
 
 	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
 	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
 		lp->lksb.sb_status = -EAGAIN;
 		lp->lksb.sb_status = -EAGAIN;
@@ -268,7 +266,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 	}
 	}
 
 
 	if (error) {
 	if (error) {
-		log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
+		log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
 			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
 			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
 			  (unsigned long long)lp->lockname.ln_number, error,
 			  (unsigned long long)lp->lockname.ln_number, error,
 			  lp->cur, lp->req, lp->lkf, lp->flags);
 			  lp->cur, lp->req, lp->lkf, lp->flags);
@@ -296,7 +294,7 @@ static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
 	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
 	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
 
 
 	if (error) {
 	if (error) {
-		log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
+		log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
 			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
 			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
 			  (unsigned long long)lp->lockname.ln_number, error,
 			  (unsigned long long)lp->lockname.ln_number, error,
 			  lp->cur, lp->req, lp->lkf, lp->flags);
 			  lp->cur, lp->req, lp->lkf, lp->flags);

+ 2 - 1
fs/gfs2/locking/dlm/lock_dlm.h

@@ -36,7 +36,7 @@
 
 
 #define GDLM_STRNAME_BYTES	24
 #define GDLM_STRNAME_BYTES	24
 #define GDLM_LVB_SIZE		32
 #define GDLM_LVB_SIZE		32
-#define GDLM_DROP_COUNT		200000
+#define GDLM_DROP_COUNT		0
 #define GDLM_DROP_PERIOD	60
 #define GDLM_DROP_PERIOD	60
 #define GDLM_NAME_LEN		128
 #define GDLM_NAME_LEN		128
 
 
@@ -106,6 +106,7 @@ enum {
 struct gdlm_lock {
 struct gdlm_lock {
 	struct gdlm_ls		*ls;
 	struct gdlm_ls		*ls;
 	struct lm_lockname	lockname;
 	struct lm_lockname	lockname;
+	struct gdlm_strname	strname;
 	char			*lvb;
 	char			*lvb;
 	struct dlm_lksb		lksb;
 	struct dlm_lksb		lksb;
 
 

+ 11 - 9
fs/gfs2/lops.c

@@ -33,16 +33,17 @@ static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 
 
 	tr->tr_touched = 1;
 	tr->tr_touched = 1;
 
 
-	if (!list_empty(&le->le_list))
-		return;
-
 	gl = container_of(le, struct gfs2_glock, gl_le);
 	gl = container_of(le, struct gfs2_glock, gl_le);
 	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
 	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
 		return;
 		return;
-	gfs2_glock_hold(gl);
-	set_bit(GLF_DIRTY, &gl->gl_flags);
 
 
 	gfs2_log_lock(sdp);
 	gfs2_log_lock(sdp);
+	if (!list_empty(&le->le_list)){
+		gfs2_log_unlock(sdp);
+		return;
+	}
+	gfs2_glock_hold(gl);
+	set_bit(GLF_DIRTY, &gl->gl_flags);
 	sdp->sd_log_num_gl++;
 	sdp->sd_log_num_gl++;
 	list_add(&le->le_list, &sdp->sd_log_le_gl);
 	list_add(&le->le_list, &sdp->sd_log_le_gl);
 	gfs2_log_unlock(sdp);
 	gfs2_log_unlock(sdp);
@@ -415,13 +416,14 @@ static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 
 
 	tr->tr_touched = 1;
 	tr->tr_touched = 1;
 
 
-	if (!list_empty(&le->le_list))
-		return;
-
 	rgd = container_of(le, struct gfs2_rgrpd, rd_le);
 	rgd = container_of(le, struct gfs2_rgrpd, rd_le);
-	gfs2_rgrp_bh_hold(rgd);
 
 
 	gfs2_log_lock(sdp);
 	gfs2_log_lock(sdp);
+	if (!list_empty(&le->le_list)){
+		gfs2_log_unlock(sdp);
+		return;
+	}
+	gfs2_rgrp_bh_hold(rgd);
 	sdp->sd_log_num_rg++;
 	sdp->sd_log_num_rg++;
 	list_add(&le->le_list, &sdp->sd_log_le_rg);
 	list_add(&le->le_list, &sdp->sd_log_le_rg);
 	gfs2_log_unlock(sdp);
 	gfs2_log_unlock(sdp);

+ 3 - 1
fs/gfs2/main.c

@@ -43,7 +43,6 @@ static void gfs2_init_glock_once(void *foo, struct kmem_cache *cachep, unsigned
 		spin_lock_init(&gl->gl_spin);
 		spin_lock_init(&gl->gl_spin);
 		INIT_LIST_HEAD(&gl->gl_holders);
 		INIT_LIST_HEAD(&gl->gl_holders);
 		INIT_LIST_HEAD(&gl->gl_waiters1);
 		INIT_LIST_HEAD(&gl->gl_waiters1);
-		INIT_LIST_HEAD(&gl->gl_waiters2);
 		INIT_LIST_HEAD(&gl->gl_waiters3);
 		INIT_LIST_HEAD(&gl->gl_waiters3);
 		gl->gl_lvb = NULL;
 		gl->gl_lvb = NULL;
 		atomic_set(&gl->gl_lvb_count, 0);
 		atomic_set(&gl->gl_lvb_count, 0);
@@ -101,6 +100,8 @@ static int __init init_gfs2_fs(void)
 	if (error)
 	if (error)
 		goto fail_unregister;
 		goto fail_unregister;
 
 
+	gfs2_register_debugfs();
+
 	printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
 	printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
 
 
 	return 0;
 	return 0;
@@ -128,6 +129,7 @@ fail:
 
 
 static void __exit exit_gfs2_fs(void)
 static void __exit exit_gfs2_fs(void)
 {
 {
+	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 
 

+ 143 - 96
fs/gfs2/mount.c

@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
 #include <linux/lm_interface.h>
+#include <linux/parser.h>
 
 
 #include "gfs2.h"
 #include "gfs2.h"
 #include "incore.h"
 #include "incore.h"
@@ -20,6 +21,52 @@
 #include "sys.h"
 #include "sys.h"
 #include "util.h"
 #include "util.h"
 
 
+enum {
+	Opt_lockproto,
+	Opt_locktable,
+	Opt_hostdata,
+	Opt_spectator,
+	Opt_ignore_local_fs,
+	Opt_localflocks,
+	Opt_localcaching,
+	Opt_debug,
+	Opt_nodebug,
+	Opt_upgrade,
+	Opt_num_glockd,
+	Opt_acl,
+	Opt_noacl,
+	Opt_quota_off,
+	Opt_quota_account,
+	Opt_quota_on,
+	Opt_suiddir,
+	Opt_nosuiddir,
+	Opt_data_writeback,
+	Opt_data_ordered,
+};
+
+static match_table_t tokens = {
+	{Opt_lockproto, "lockproto=%s"},
+	{Opt_locktable, "locktable=%s"},
+	{Opt_hostdata, "hostdata=%s"},
+	{Opt_spectator, "spectator"},
+	{Opt_ignore_local_fs, "ignore_local_fs"},
+	{Opt_localflocks, "localflocks"},
+	{Opt_localcaching, "localcaching"},
+	{Opt_debug, "debug"},
+	{Opt_nodebug, "nodebug"},
+	{Opt_upgrade, "upgrade"},
+	{Opt_num_glockd, "num_glockd=%d"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_quota_off, "quota=off"},
+	{Opt_quota_account, "quota=account"},
+	{Opt_quota_on, "quota=on"},
+	{Opt_suiddir, "suiddir"},
+	{Opt_nosuiddir, "nosuiddir"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_ordered, "data=ordered"}
+};
+
 /**
 /**
  * gfs2_mount_args - Parse mount options
  * gfs2_mount_args - Parse mount options
  * @sdp:
  * @sdp:
@@ -54,146 +101,150 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 	   process them */
 	   process them */
 
 
 	for (options = data; (o = strsep(&options, ",")); ) {
 	for (options = data; (o = strsep(&options, ",")); ) {
+		int token, option;
+		substring_t tmp[MAX_OPT_ARGS];
+
 		if (!*o)
 		if (!*o)
 			continue;
 			continue;
 
 
-		v = strchr(o, '=');
-		if (v)
-			*v++ = 0;
+		token = match_token(o, tokens, tmp);
+		switch (token) {
+		case Opt_lockproto:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for lockproto\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
 
-		if (!strcmp(o, "lockproto")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_lockproto))
+			if (remount && strcmp(v, args->ar_lockproto)) {
+				kfree(v);
 				goto cant_remount;
 				goto cant_remount;
+			}
+			
 			strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
 			strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
 			args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
 			args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
+			kfree(v);
+			break;
+		case Opt_locktable:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for locktable\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
 
-		else if (!strcmp(o, "locktable")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_locktable))
+			if (remount && strcmp(v, args->ar_locktable)) {
+				kfree(v);
 				goto cant_remount;
 				goto cant_remount;
+			}
+
 			strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
 			strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
-			args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
+			args->ar_locktable[GFS2_LOCKNAME_LEN - 1]  = 0;
+			kfree(v);
+			break;
+		case Opt_hostdata:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for hostdata\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
 
-		else if (!strcmp(o, "hostdata")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_hostdata))
+			if (remount && strcmp(v, args->ar_hostdata)) {
+				kfree(v);
 				goto cant_remount;
 				goto cant_remount;
+			}
+
 			strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
 			strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
 			args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
 			args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
-
-		else if (!strcmp(o, "spectator")) {
+			kfree(v);
+			break;
+		case Opt_spectator:
 			if (remount && !args->ar_spectator)
 			if (remount && !args->ar_spectator)
 				goto cant_remount;
 				goto cant_remount;
 			args->ar_spectator = 1;
 			args->ar_spectator = 1;
 			sdp->sd_vfs->s_flags |= MS_RDONLY;
 			sdp->sd_vfs->s_flags |= MS_RDONLY;
-		}
-
-		else if (!strcmp(o, "ignore_local_fs")) {
+			break;
+		case Opt_ignore_local_fs:
 			if (remount && !args->ar_ignore_local_fs)
 			if (remount && !args->ar_ignore_local_fs)
 				goto cant_remount;
 				goto cant_remount;
 			args->ar_ignore_local_fs = 1;
 			args->ar_ignore_local_fs = 1;
-		}
-
-		else if (!strcmp(o, "localflocks")) {
+			break;
+		case Opt_localflocks:
 			if (remount && !args->ar_localflocks)
 			if (remount && !args->ar_localflocks)
 				goto cant_remount;
 				goto cant_remount;
 			args->ar_localflocks = 1;
 			args->ar_localflocks = 1;
-		}
-
-		else if (!strcmp(o, "localcaching")) {
+			break;
+		case Opt_localcaching:
 			if (remount && !args->ar_localcaching)
 			if (remount && !args->ar_localcaching)
 				goto cant_remount;
 				goto cant_remount;
 			args->ar_localcaching = 1;
 			args->ar_localcaching = 1;
-		}
-
-		else if (!strcmp(o, "debug"))
+			break;
+		case Opt_debug:
 			args->ar_debug = 1;
 			args->ar_debug = 1;
-
-		else if (!strcmp(o, "nodebug"))
+			break;
+		case Opt_nodebug:
 			args->ar_debug = 0;
 			args->ar_debug = 0;
-
-		else if (!strcmp(o, "upgrade")) {
+			break;
+		case Opt_upgrade:
 			if (remount && !args->ar_upgrade)
 			if (remount && !args->ar_upgrade)
 				goto cant_remount;
 				goto cant_remount;
 			args->ar_upgrade = 1;
 			args->ar_upgrade = 1;
-		}
+			break;
+		case Opt_num_glockd:
+			if ((error = match_int(&tmp[0], &option))) {
+				fs_info(sdp, "problem getting num_glockd\n");
+				goto out_error;
+			}
 
 
-		else if (!strcmp(o, "num_glockd")) {
-			unsigned int x;
-			if (!v)
-				goto need_value;
-			sscanf(v, "%u", &x);
-			if (remount && x != args->ar_num_glockd)
+			if (remount && option != args->ar_num_glockd)
 				goto cant_remount;
 				goto cant_remount;
-			if (!x || x > GFS2_GLOCKD_MAX) {
+			if (!option || option > GFS2_GLOCKD_MAX) {
 				fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
 				fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
-				        GFS2_GLOCKD_MAX, x);
+				        GFS2_GLOCKD_MAX, option);
 				error = -EINVAL;
 				error = -EINVAL;
-				break;
+				goto out_error;
 			}
 			}
-			args->ar_num_glockd = x;
-		}
-
-		else if (!strcmp(o, "acl")) {
+			args->ar_num_glockd = option;
+			break;
+		case Opt_acl:
 			args->ar_posix_acl = 1;
 			args->ar_posix_acl = 1;
 			sdp->sd_vfs->s_flags |= MS_POSIXACL;
 			sdp->sd_vfs->s_flags |= MS_POSIXACL;
-		}
-
-		else if (!strcmp(o, "noacl")) {
+			break;
+		case Opt_noacl:
 			args->ar_posix_acl = 0;
 			args->ar_posix_acl = 0;
 			sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
 			sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
-		}
-
-		else if (!strcmp(o, "quota")) {
-			if (!v)
-				goto need_value;
-			if (!strcmp(v, "off"))
-				args->ar_quota = GFS2_QUOTA_OFF;
-			else if (!strcmp(v, "account"))
-				args->ar_quota = GFS2_QUOTA_ACCOUNT;
-			else if (!strcmp(v, "on"))
-				args->ar_quota = GFS2_QUOTA_ON;
-			else {
-				fs_info(sdp, "invalid value for quota\n");
-				error = -EINVAL;
-				break;
-			}
-		}
-
-		else if (!strcmp(o, "suiddir"))
+			break;
+		case Opt_quota_off:
+			args->ar_quota = GFS2_QUOTA_OFF;
+			break;
+		case Opt_quota_account:
+			args->ar_quota = GFS2_QUOTA_ACCOUNT;
+			break;
+		case Opt_quota_on:
+			args->ar_quota = GFS2_QUOTA_ON;
+			break;
+		case Opt_suiddir:
 			args->ar_suiddir = 1;
 			args->ar_suiddir = 1;
-
-		else if (!strcmp(o, "nosuiddir"))
+			break;
+		case Opt_nosuiddir:
 			args->ar_suiddir = 0;
 			args->ar_suiddir = 0;
-
-		else if (!strcmp(o, "data")) {
-			if (!v)
-				goto need_value;
-			if (!strcmp(v, "writeback"))
-				args->ar_data = GFS2_DATA_WRITEBACK;
-			else if (!strcmp(v, "ordered"))
-				args->ar_data = GFS2_DATA_ORDERED;
-			else {
-				fs_info(sdp, "invalid value for data\n");
-				error = -EINVAL;
-				break;
-			}
-		}
-
-		else {
+			break;
+		case Opt_data_writeback:
+			args->ar_data = GFS2_DATA_WRITEBACK;
+			break;
+		case Opt_data_ordered:
+			args->ar_data = GFS2_DATA_ORDERED;
+			break;
+		default:
 			fs_info(sdp, "unknown option: %s\n", o);
 			fs_info(sdp, "unknown option: %s\n", o);
 			error = -EINVAL;
 			error = -EINVAL;
-			break;
+			goto out_error;
 		}
 		}
 	}
 	}
 
 
+out_error:
 	if (error)
 	if (error)
 		fs_info(sdp, "invalid mount option(s)\n");
 		fs_info(sdp, "invalid mount option(s)\n");
 
 
@@ -202,10 +253,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 
 
 	return error;
 	return error;
 
 
-need_value:
-	fs_info(sdp, "need value for option %s\n", o);
-	return -EINVAL;
-
 cant_remount:
 cant_remount:
 	fs_info(sdp, "can't remount with option %s\n", o);
 	fs_info(sdp, "can't remount with option %s\n", o);
 	return -EINVAL;
 	return -EINVAL;

+ 18 - 3
fs/gfs2/ops_address.c

@@ -197,7 +197,19 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	void *kaddr;
 	void *kaddr;
 	int error;
 	int error;
 
 
-	BUG_ON(page->index);
+	/*
+	 * Due to the order of unstuffing files and ->nopage(), we can be
+	 * asked for a zero page in the case of a stuffed file being extended,
+	 * so we need to supply one here. It doesn't happen often.
+	 */
+	if (unlikely(page->index)) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr, 0, PAGE_CACHE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		return 0;
+	}
 
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
 	if (error)
@@ -208,9 +220,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	       ip->i_di.di_size);
 	       ip->i_di.di_size);
 	memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
 	memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
 	kunmap_atomic(kaddr, KM_USER0);
 	kunmap_atomic(kaddr, KM_USER0);
-
+	flush_dcache_page(page);
 	brelse(dibh);
 	brelse(dibh);
-
 	SetPageUptodate(page);
 	SetPageUptodate(page);
 
 
 	return 0;
 	return 0;
@@ -507,7 +518,9 @@ static int gfs2_commit_write(struct file *file, struct page *page,
 		gfs2_quota_unlock(ip);
 		gfs2_quota_unlock(ip);
 		gfs2_alloc_put(ip);
 		gfs2_alloc_put(ip);
 	}
 	}
+	unlock_page(page);
 	gfs2_glock_dq_m(1, &ip->i_gh);
 	gfs2_glock_dq_m(1, &ip->i_gh);
+	lock_page(page);
 	gfs2_holder_uninit(&ip->i_gh);
 	gfs2_holder_uninit(&ip->i_gh);
 	return 0;
 	return 0;
 
 
@@ -520,7 +533,9 @@ fail_endtrans:
 		gfs2_quota_unlock(ip);
 		gfs2_quota_unlock(ip);
 		gfs2_alloc_put(ip);
 		gfs2_alloc_put(ip);
 	}
 	}
+	unlock_page(page);
 	gfs2_glock_dq_m(1, &ip->i_gh);
 	gfs2_glock_dq_m(1, &ip->i_gh);
+	lock_page(page);
 	gfs2_holder_uninit(&ip->i_gh);
 	gfs2_holder_uninit(&ip->i_gh);
 fail_nounlock:
 fail_nounlock:
 	ClearPageUptodate(page);
 	ClearPageUptodate(page);

+ 4 - 0
fs/gfs2/ops_fstype.c

@@ -690,6 +690,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	if (error)
 	if (error)
 		goto fail;
 		goto fail;
 
 
+	gfs2_create_debugfs_file(sdp);
+
 	error = gfs2_sys_fs_add(sdp);
 	error = gfs2_sys_fs_add(sdp);
 	if (error)
 	if (error)
 		goto fail;
 		goto fail;
@@ -754,6 +756,7 @@ fail_lm:
 fail_sys:
 fail_sys:
 	gfs2_sys_fs_del(sdp);
 	gfs2_sys_fs_del(sdp);
 fail:
 fail:
+	gfs2_delete_debugfs_file(sdp);
 	kfree(sdp);
 	kfree(sdp);
 	sb->s_fs_info = NULL;
 	sb->s_fs_info = NULL;
 	return error;
 	return error;
@@ -896,6 +899,7 @@ error:
 
 
 static void gfs2_kill_sb(struct super_block *sb)
 static void gfs2_kill_sb(struct super_block *sb)
 {
 {
+	gfs2_delete_debugfs_file(sb->s_fs_info);
 	kill_block_super(sb);
 	kill_block_super(sb);
 }
 }
 
 

+ 27 - 1
fs/gfs2/ops_super.c

@@ -283,6 +283,31 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	return error;
 	return error;
 }
 }
 
 
+/**
+ * gfs2_drop_inode - Drop an inode (test for remote unlink)
+ * @inode: The inode to drop
+ *
+ * If we've received a callback on an iopen lock then its because a
+ * remote node tried to deallocate the inode but failed due to this node
+ * still having the inode open. Here we mark the link count zero
+ * since we know that it must have reached zero if the GLF_DEMOTE flag
+ * is set on the iopen glock. If we didn't do a disk read since the
+ * remote node removed the final link then we might otherwise miss
+ * this event. This check ensures that this node will deallocate the
+ * inode's blocks, or alternatively pass the baton on to another
+ * node for later deallocation.
+ */
+static void gfs2_drop_inode(struct inode *inode)
+{
+	if (inode->i_private && inode->i_nlink) {
+		struct gfs2_inode *ip = GFS2_I(inode);
+		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
+			clear_nlink(inode);
+	}
+	generic_drop_inode(inode);
+}
+
 /**
 /**
  * gfs2_clear_inode - Deallocate an inode when VFS is done with it
  * gfs2_clear_inode - Deallocate an inode when VFS is done with it
  * @inode: The VFS inode
  * @inode: The VFS inode
@@ -441,7 +466,7 @@ out_unlock:
 out_uninit:
 out_uninit:
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
 	gfs2_glock_dq_uninit(&gh);
-	if (error)
+	if (error && error != GLR_TRYFAILED)
 		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
 		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
 out:
 out:
 	truncate_inode_pages(&inode->i_data, 0);
 	truncate_inode_pages(&inode->i_data, 0);
@@ -481,6 +506,7 @@ const struct super_operations gfs2_super_ops = {
 	.statfs			= gfs2_statfs,
 	.statfs			= gfs2_statfs,
 	.remount_fs		= gfs2_remount_fs,
 	.remount_fs		= gfs2_remount_fs,
 	.clear_inode		= gfs2_clear_inode,
 	.clear_inode		= gfs2_clear_inode,
+	.drop_inode		= gfs2_drop_inode,
 	.show_options		= gfs2_show_options,
 	.show_options		= gfs2_show_options,
 };
 };
 
 

+ 9 - 3
fs/gfs2/rgrp.c

@@ -27,6 +27,7 @@
 #include "trans.h"
 #include "trans.h"
 #include "ops_file.h"
 #include "ops_file.h"
 #include "util.h"
 #include "util.h"
+#include "log.h"
 
 
 #define BFITNOENT ((u32)~0)
 #define BFITNOENT ((u32)~0)
 
 
@@ -697,8 +698,6 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
  * @al: the struct gfs2_alloc structure describing the reservation
  * @al: the struct gfs2_alloc structure describing the reservation
  *
  *
  * If there's room for the requested blocks to be allocated from the RG:
  * If there's room for the requested blocks to be allocated from the RG:
- *   Sets the $al_reserved_data field in @al.
- *   Sets the $al_reserved_meta field in @al.
  *   Sets the $al_rgd field in @al.
  *   Sets the $al_rgd field in @al.
  *
  *
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
@@ -709,6 +708,9 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	int ret = 0;
 	int ret = 0;
 
 
+	if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC)
+		return 0;
+
 	spin_lock(&sdp->sd_rindex_spin);
 	spin_lock(&sdp->sd_rindex_spin);
 	if (rgd->rd_free_clone >= al->al_requested) {
 	if (rgd->rd_free_clone >= al->al_requested) {
 		al->al_rgd = rgd;
 		al->al_rgd = rgd;
@@ -941,9 +943,13 @@ static int get_local_rgrp(struct gfs2_inode *ip)
 			rgd = gfs2_rgrpd_get_first(sdp);
 			rgd = gfs2_rgrpd_get_first(sdp);
 
 
 		if (rgd == begin) {
 		if (rgd == begin) {
-			if (++loops >= 2 || !skipped)
+			if (++loops >= 3)
 				return -ENOSPC;
 				return -ENOSPC;
+			if (!skipped)
+				loops++;
 			flags = 0;
 			flags = 0;
+			if (loops == 2)
+				gfs2_log_flush(sdp, NULL);
 		}
 		}
 	}
 	}
 
 

+ 8 - 1
include/linux/dlm_device.h

@@ -19,7 +19,7 @@
 
 
 /* Version of the device interface */
 /* Version of the device interface */
 #define DLM_DEVICE_VERSION_MAJOR 5
 #define DLM_DEVICE_VERSION_MAJOR 5
-#define DLM_DEVICE_VERSION_MINOR 0
+#define DLM_DEVICE_VERSION_MINOR 1
 #define DLM_DEVICE_VERSION_PATCH 0
 #define DLM_DEVICE_VERSION_PATCH 0
 
 
 /* struct passed to the lock write */
 /* struct passed to the lock write */
@@ -44,6 +44,11 @@ struct dlm_lspace_params {
 	char name[0];
 	char name[0];
 };
 };
 
 
+struct dlm_purge_params {
+	__u32 nodeid;
+	__u32 pid;
+};
+
 struct dlm_write_request {
 struct dlm_write_request {
 	__u32 version[3];
 	__u32 version[3];
 	__u8 cmd;
 	__u8 cmd;
@@ -53,6 +58,7 @@ struct dlm_write_request {
 	union  {
 	union  {
 		struct dlm_lock_params   lock;
 		struct dlm_lock_params   lock;
 		struct dlm_lspace_params lspace;
 		struct dlm_lspace_params lspace;
+		struct dlm_purge_params  purge;
 	} i;
 	} i;
 };
 };
 
 
@@ -76,6 +82,7 @@ struct dlm_lock_result {
 #define DLM_USER_QUERY        3
 #define DLM_USER_QUERY        3
 #define DLM_USER_CREATE_LOCKSPACE  4
 #define DLM_USER_CREATE_LOCKSPACE  4
 #define DLM_USER_REMOVE_LOCKSPACE  5
 #define DLM_USER_REMOVE_LOCKSPACE  5
+#define DLM_USER_PURGE        6
 
 
 /* Arbitrary length restriction */
 /* Arbitrary length restriction */
 #define MAX_LS_NAME_LEN 64
 #define MAX_LS_NAME_LEN 64

Some files were not shown because too many files changed in this diff