Browse Source

staging: ramster: ramster-specific changes to zcache/tmem

RAMster implements peer-to-peer transcendent memory, allowing a "cluster"
of kernels to dynamically pool their RAM.

This patch incorporates changes transforming zcache to work with
a remote store.

In tmem.[ch], new "repatriate" (provoke async get) and "localify" (handle
incoming data resulting from an async get) routines combine with a handful
of changes to existing pamops interfaces allow the generic tmem code
to support asynchronous operations.  Also, a new tmem_xhandle struct
groups together key information that must be passed to remote tmem stores.

Zcache-main.c is augmented with a large amount of ramster-specific code
to handle remote operations and "foreign" pages on both ends of the
"remotify" protocol.  New "foreign" pools are auto-created on demand.
A "selfshrinker" thread periodically repatriates remote persistent pages
when local memory conditions allow.  For certain operations, a queue is
necessary to guarantee strict ordering as out-of-order puts/flushes can
cause strange race conditions.  Pampd pointers now either point to local
memory OR describe a remote page; to allow the same 64-bits to describe
either, the LSB is used to differentiate.  Some acrobatics must be performed
to ensure local memory is available to handle a remote persistent get,
or deal with the data directly anyway if the malloc failed.  Lots
of ramster-specific statistics are available via sysfs.

Note: Some debug ifdefs left in for now.
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Dan Magenheimer 13 years ago
parent
commit
c89126eabb

+ 9 - 9
drivers/staging/ramster/Kconfig

@@ -1,13 +1,13 @@
-config ZCACHE
-	tristate "Dynamic compression of swap pages and clean pagecache pages"
-	depends on CLEANCACHE || FRONTSWAP
-	select XVMALLOC
+config RAMSTER
+	bool "Cross-machine RAM capacity sharing, aka peer-to-peer tmem"
+	depends on (CLEANCACHE || FRONTSWAP) && CONFIGFS_FS && !ZCACHE && !XVMALLOC && !HIGHMEM
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
 	default n
 	help
-	  Zcache doubles RAM efficiency while providing a significant
-	  performance boosts on many workloads.  Zcache uses lzo1x
-	  compression and an in-kernel implementation of transcendent
-	  memory to store clean page cache pages and swap in RAM,
-	  providing a noticeable reduction in disk I/O.
+	  RAMster allows RAM on other machines in a cluster to be utilized
+	  dynamically and symmetrically instead of swapping to a local swap
+	  disk, thus improving performance on memory-constrained workloads
+	  while minimizing total RAM across the cluster.  RAMster, like
+	  zcache, compresses swap pages into local RAM, but then remotifies
+	  the compressed pages to another node in the RAMster cluster.

+ 1 - 3
drivers/staging/ramster/Makefile

@@ -1,3 +1 @@
-zcache-y	:=	zcache-main.o tmem.o
-
-obj-$(CONFIG_ZCACHE)	+=	zcache.o
+obj-$(CONFIG_RAMSTER)	+=	zcache-main.o tmem.o r2net.o xvmalloc.o cluster/

+ 98 - 17
drivers/staging/ramster/tmem.c

@@ -27,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
+#include <linux/delay.h>
 
 #include "tmem.h"
 
@@ -316,7 +317,7 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
 }
 
 static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
-					void *new_pampd)
+					void *new_pampd, bool no_free)
 {
 	struct tmem_objnode **slot;
 	void *ret = NULL;
@@ -325,7 +326,9 @@ static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
 	if ((slot != NULL) && (*slot != NULL)) {
 		void *old_pampd = *(void **)slot;
 		*(void **)slot = new_pampd;
-		(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
+		if (!no_free)
+			(*tmem_pamops.free)(old_pampd, obj->pool,
+						NULL, 0, false);
 		ret = new_pampd;
 	}
 	return ret;
@@ -481,7 +484,7 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj,
 			if (ht == 1) {
 				obj->pampd_count--;
 				(*tmem_pamops.free)(objnode->slots[i],
-						obj->pool, NULL, 0);
+						obj->pool, NULL, 0, true);
 				objnode->slots[i] = NULL;
 				continue;
 			}
@@ -498,7 +501,8 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
 		return;
 	if (obj->objnode_tree_height == 0) {
 		obj->pampd_count--;
-		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);
+		(*tmem_pamops.free)(obj->objnode_tree_root,
+					obj->pool, NULL, 0, true);
 	} else {
 		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
 					obj->objnode_tree_height);
@@ -529,7 +533,7 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
  * always flushes for simplicity.
  */
 int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-		char *data, size_t size, bool raw, bool ephemeral)
+		char *data, size_t size, bool raw, int ephemeral)
 {
 	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
 	void *pampd = NULL, *pampd_del = NULL;
@@ -545,7 +549,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
 			/* if found, is a dup put, flush the old one */
 			pampd_del = tmem_pampd_delete_from_obj(obj, index);
 			BUG_ON(pampd_del != pampd);
-			(*tmem_pamops.free)(pampd, pool, oidp, index);
+			(*tmem_pamops.free)(pampd, pool, oidp, index, true);
 			if (obj->pampd_count == 0) {
 				objnew = obj;
 				objfound = NULL;
@@ -576,7 +580,7 @@ delete_and_free:
 	(void)tmem_pampd_delete_from_obj(obj, index);
 free:
 	if (pampd)
-		(*tmem_pamops.free)(pampd, pool, NULL, 0);
+		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
 	if (objnew) {
 		tmem_obj_free(objnew, hb);
 		(*tmem_hostops.obj_free)(objnew, pool);
@@ -586,6 +590,65 @@ out:
 	return ret;
 }
 
+void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
+				uint32_t index, struct tmem_obj **ret_obj,
+				void **saved_hb)
+{
+	struct tmem_hashbucket *hb;
+	struct tmem_obj *obj = NULL;
+	void *pampd = NULL;
+
+	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+	spin_lock(&hb->lock);
+	obj = tmem_obj_find(hb, oidp);
+	if (likely(obj != NULL))
+		pampd = tmem_pampd_lookup_in_obj(obj, index);
+	*ret_obj = obj;
+	*saved_hb = (void *)hb;
+	/* note, hashbucket remains locked */
+	return pampd;
+}
+
+void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
+			  void *pampd, void *saved_hb, bool delete)
+{
+	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;
+
+	BUG_ON(!spin_is_locked(&hb->lock));
+	if (pampd != NULL) {
+		BUG_ON(obj == NULL);
+		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
+	} else if (delete) {
+		BUG_ON(obj == NULL);
+		(void)tmem_pampd_delete_from_obj(obj, index);
+	}
+	spin_unlock(&hb->lock);
+}
+
+static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
+				struct tmem_pool *pool, struct tmem_oid *oidp,
+				uint32_t index, bool free, char *data)
+{
+	void *old_pampd = *ppampd, *new_pampd = NULL;
+	bool intransit = false;
+	int ret = 0;
+
+
+	if (!is_ephemeral(pool))
+		new_pampd = (*tmem_pamops.repatriate_preload)(
+				old_pampd, pool, oidp, index, &intransit);
+	if (intransit)
+		ret = -EAGAIN;
+	else if (new_pampd != NULL)
+		*ppampd = new_pampd;
+	/* must release the hb->lock else repatriate can't sleep */
+	spin_unlock(&hb->lock);
+	if (!intransit)
+		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
+						oidp, index, free, data);
+	return ret;
+}
+
 /*
  * "Get" a page, e.g. if one can be found, copy the tmem page with the
  * matching handle from PAM space to the kernel.  By tmem definition,
@@ -607,14 +670,36 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
 	int ret = -1;
 	struct tmem_hashbucket *hb;
 	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
-	bool lock_held = false;
+	bool lock_held = 0;
+	void **ppampd;
 
+again:
 	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 	spin_lock(&hb->lock);
-	lock_held = true;
+	lock_held = 1;
 	obj = tmem_obj_find(hb, oidp);
 	if (obj == NULL)
 		goto out;
+	ppampd = __tmem_pampd_lookup_in_obj(obj, index);
+	if (ppampd == NULL)
+		goto out;
+	if (tmem_pamops.is_remote(*ppampd)) {
+		ret = tmem_repatriate(ppampd, hb, pool, oidp,
+					index, free, data);
+		lock_held = 0; /* note hb->lock has been unlocked */
+		if (ret == -EAGAIN) {
+			/* rare I think, but should cond_resched()??? */
+			usleep_range(10, 1000);
+			goto again;
+		} else if (ret != 0) {
+			if (ret != -ENOENT)
+				pr_err("UNTESTED case in tmem_get, ret=%d\n",
+						ret);
+			ret = -1;
+			goto out;
+		}
+		goto out;
+	}
 	if (free)
 		pampd = tmem_pampd_delete_from_obj(obj, index);
 	else
@@ -628,10 +713,6 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
 			obj = NULL;
 		}
 	}
-	if (tmem_pamops.is_remote(pampd)) {
-		lock_held = false;
-		spin_unlock(&hb->lock);
-	}
 	if (free)
 		ret = (*tmem_pamops.get_data_and_free)(
 				data, size, raw, pampd, pool, oidp, index);
@@ -668,7 +749,7 @@ int tmem_flush_page(struct tmem_pool *pool,
 	pampd = tmem_pampd_delete_from_obj(obj, index);
 	if (pampd == NULL)
 		goto out;
-	(*tmem_pamops.free)(pampd, pool, oidp, index);
+	(*tmem_pamops.free)(pampd, pool, oidp, index, true);
 	if (obj->pampd_count == 0) {
 		tmem_obj_free(obj, hb);
 		(*tmem_hostops.obj_free)(obj, pool);
@@ -682,8 +763,8 @@ out:
 
 /*
  * If a page in tmem matches the handle, replace the page so that any
- * subsequent "get" gets the new page.  Returns 0 if
- * there was a page to replace, else returns -1.
+ * subsequent "get" gets the new page.  Returns the new page if
+ * there was a page to replace, else returns NULL.
  */
 int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
 			uint32_t index, void *new_pampd)
@@ -697,7 +778,7 @@ int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
 	obj = tmem_obj_find(hb, oidp);
 	if (obj == NULL)
 		goto out;
-	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
+	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
 	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
 out:
 	spin_unlock(&hb->lock);

+ 42 - 4
drivers/staging/ramster/tmem.h

@@ -9,7 +9,6 @@
 #ifndef _TMEM_H_
 #define _TMEM_H_
 
-#include <linux/types.h>
 #include <linux/highmem.h>
 #include <linux/hash.h>
 #include <linux/atomic.h>
@@ -89,6 +88,31 @@ struct tmem_oid {
 	uint64_t oid[3];
 };
 
+struct tmem_xhandle {
+	uint8_t client_id;
+	uint8_t xh_data_cksum;
+	uint16_t xh_data_size;
+	uint16_t pool_id;
+	struct tmem_oid oid;
+	uint32_t index;
+	void *extra;
+};
+
+static inline struct tmem_xhandle tmem_xhandle_fill(uint16_t client_id,
+					struct tmem_pool *pool,
+					struct tmem_oid *oidp,
+					uint32_t index)
+{
+	struct tmem_xhandle xh;
+	xh.client_id = client_id;
+	xh.xh_data_cksum = (uint8_t)-1;
+	xh.xh_data_size = (uint16_t)-1;
+	xh.pool_id = pool->pool_id;
+	xh.oid = *oidp;
+	xh.index = index;
+	return xh;
+}
+
 static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
 {
 	oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
@@ -147,7 +171,11 @@ struct tmem_obj {
 	unsigned int objnode_tree_height;
 	unsigned long objnode_count;
 	long pampd_count;
-	void *extra; /* for private use by pampd implementation */
+	/* for current design of ramster, all pages belonging to
+	 * an object reside on the same remotenode and extra is
+	 * used to record the number of the remotenode so a
+	 * flush-object operation can specify it */
+	void *extra; /* for use by pampd implementation */
 	DECL_SENTINEL
 };
 
@@ -174,9 +202,14 @@ struct tmem_pamops {
 	int (*get_data_and_free)(char *, size_t *, bool, void *,
 				struct tmem_pool *, struct tmem_oid *,
 				uint32_t);
-	void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t);
+	void (*free)(void *, struct tmem_pool *,
+				struct tmem_oid *, uint32_t, bool);
 	void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
 	bool (*is_remote)(void *);
+	void *(*repatriate_preload)(void *, struct tmem_pool *,
+					struct tmem_oid *, uint32_t, bool *);
+	int (*repatriate)(void *, void *, struct tmem_pool *,
+				struct tmem_oid *, uint32_t, bool, void *);
 	void (*new_obj)(struct tmem_obj *);
 	int (*replace_in_obj)(void *, struct tmem_obj *);
 };
@@ -193,11 +226,16 @@ extern void tmem_register_hostops(struct tmem_hostops *m);
 
 /* core tmem accessor functions */
 extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			char *, size_t, bool, bool);
+			char *, size_t, bool, int);
 extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
 			char *, size_t *, bool, int);
 extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
 			void *);
+extern void *tmem_localify_get_pampd(struct tmem_pool *, struct tmem_oid *,
+				   uint32_t index, struct tmem_obj **,
+				   void **);
+extern void tmem_localify_finish(struct tmem_obj *, uint32_t index,
+				 void *, void *, bool);
 extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
 			uint32_t index);
 extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);

File diff suppressed because it is too large
+ 613 - 112
drivers/staging/ramster/zcache-main.c


Some files were not shown because too many files changed in this diff