16 жил өмнө · 7992fde72c
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -121,6 +121,7 @@ enum {
 
				 	SWP_USED	= (1 << 0),	/* is slot in swap_info[] used? */
			
 
				 	SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap?	*/
			
 
				 	SWP_DISCARDABLE = (1 << 2),	/* blkdev supports discard */
			
 
				+	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
			
 
				 					/* add others here before... */
			
 
				 	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
			
 
				 };
			
@@ -144,6 +145,8 @@ struct swap_info_struct {
 
				 	unsigned short *swap_map;
			
 
				 	unsigned int lowest_bit;
			
 
				 	unsigned int highest_bit;
			
 
				+	unsigned int lowest_alloc;	/* while preparing discard cluster */
			
 
				+	unsigned int highest_alloc;	/* while preparing discard cluster */
			
 
				 	unsigned int cluster_next;
			
 
				 	unsigned int cluster_nr;
			
 
				 	unsigned int pages;
			
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si)
 
				 	return err;		/* That will often be -EOPNOTSUPP */
			
 
				 }
			
 
				 
			
 
				+/*
				+ * discard_swap_cluster - at swap allocation time, tell the device that a
				+ * cluster of swap can now be discarded, to allow the swap device to
				+ * optimize its wear-levelling.
				+ *
				+ * @si:         swap area; its extent list maps swap page offsets to blocks
				+ * @start_page: first swap page offset of the range to discard
				+ * @nr_pages:   number of pages in the range
				+ *
				+ * Walks si's extent list circularly, starting at si->curr_swap_extent.
				+ * Whenever the current start_page falls inside an extent, the part of the
				+ * range covered by that extent is converted to a block range and handed to
				+ * blkdev_issue_discard() with GFP_NOIO; start_page and nr_pages are then
				+ * advanced past it.  The first extent that matches is stored back in
				+ * si->curr_swap_extent.
				+ *
				+ * Nothing is returned: a non-zero result from blkdev_issue_discard() just
				+ * stops the walk, leaving the rest of the range undiscarded.
				+ *
				+ * NOTE(review): the loop exits only when nr_pages reaches zero or a discard
				+ * fails, so it assumes every page of the range is covered by some extent
				+ * on si->extent_list - confirm against callers.
				+ */
				+static void discard_swap_cluster(struct swap_info_struct *si,
				+				 pgoff_t start_page, pgoff_t nr_pages)
				+{
				+	struct swap_extent *se = si->curr_swap_extent;
				+	int found_extent = 0;	/* number of extents matched so far */
				+
				+	while (nr_pages) {
				+		struct list_head *lh;
				+
				+		if (se->start_page <= start_page &&
				+		    start_page < se->start_page + se->nr_pages) {
				+			pgoff_t offset = start_page - se->start_page;
				+			sector_t start_block = se->start_block + offset;
				+			pgoff_t nr_blocks = se->nr_pages - offset;	/* pages left in this extent */
				+
				+			if (nr_blocks > nr_pages)
				+				nr_blocks = nr_pages;	/* clamp to the requested range */
				+			start_page += nr_blocks;
				+			nr_pages -= nr_blocks;
				+
				+			if (!found_extent++)
				+				si->curr_swap_extent = se;	/* remember only the first match */
				+
				+			start_block <<= PAGE_SHIFT - 9;	/* page units -> 512-byte units */
				+			nr_blocks <<= PAGE_SHIFT - 9;	/* page units -> 512-byte units */
				+			if (blkdev_issue_discard(si->bdev, start_block,
				+							nr_blocks, GFP_NOIO))
				+				break;	/* stop at the first failed discard */
				+		}
				+
				+		lh = se->list.next;
				+		if (lh == &si->extent_list)
				+			lh = lh->next;	/* step over the list head to wrap around */
				+		se = list_entry(lh, struct swap_extent, list);
				+	}
				+}
			
 
				+
			
 
				+static int wait_for_discard(void *word)	/* passed to wait_on_bit() for SWP_DISCARDING; word is unused */
				+{
				+	schedule();	/* give up the CPU until this task is run again */
				+	return 0;	/* always 0; NOTE(review): presumably tells wait_on_bit() to keep waiting - confirm */
				+}
			
 
				+
			
 
				 #define SWAPFILE_CLUSTER	256
			
 
				 #define LATENCY_LIMIT		256
			
 
				 
			
 
				 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
			
 
				 {
			
 
				 	unsigned long offset;
			
 
				-	unsigned long last_in_cluster;
			
 
				+	unsigned long last_in_cluster = 0;
			
 
				 	int latency_ration = LATENCY_LIMIT;
			
 
				+	int found_free_cluster = 0;
			
 
				 
			
 
				 	/*
			
 
				 	 * We try to cluster swap pages by allocating them sequentially
			
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 
				 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			
 
				 			goto checks;
			
 
				 		}
			
 
				+		if (si->flags & SWP_DISCARDABLE) {
			
 
				+			/*
			
 
				+			 * Start range check on racing allocations, in case
			
 
				+			 * they overlap the cluster we eventually decide on
			
 
				+			 * (we scan without swap_lock to allow preemption).
			
 
				+			 * It's hardly conceivable that cluster_nr could be
			
 
				+			 * wrapped during our scan, but don't depend on it.
			
 
				+			 */
			
 
				+			if (si->lowest_alloc)
			
 
				+				goto checks;
			
 
				+			si->lowest_alloc = si->max;
			
 
				+			si->highest_alloc = 0;
			
 
				+		}
			
 
				 		spin_unlock(&swap_lock);
			
 
				 
			
 
				 		offset = si->lowest_bit;
			
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 
				 				offset -= SWAPFILE_CLUSTER - 1;
			
 
				 				si->cluster_next = offset;
			
 
				 				si->cluster_nr = SWAPFILE_CLUSTER - 1;
			
 
				+				found_free_cluster = 1;
			
 
				 				goto checks;
			
 
				 			}
			
 
				 			if (unlikely(--latency_ration < 0)) {
			
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 
				 		offset = si->lowest_bit;
			
 
				 		spin_lock(&swap_lock);
			
 
				 		si->cluster_nr = SWAPFILE_CLUSTER - 1;
			
 
				+		si->lowest_alloc = 0;
			
 
				 	}
			
 
				 
			
 
				 checks:
			
@@ -191,6 +254,60 @@ checks:
 
				 	si->swap_map[offset] = 1;
			
 
				 	si->cluster_next = offset + 1;
			
 
				 	si->flags -= SWP_SCANNING;
			
 
				+
			
 
				+	if (si->lowest_alloc) {
			
 
				+		/*
			
 
				+		 * Only set when SWP_DISCARDABLE, and there's a scan
			
 
				+		 * for a free cluster in progress or just completed.
			
 
				+		 */
			
 
				+		if (found_free_cluster) {
			
 
				+			/*
			
 
				+			 * To optimize wear-levelling, discard the
			
 
				+			 * old data of the cluster, taking care not to
			
 
				+			 * discard any of its pages that have already
			
 
				+			 * been allocated by racing tasks (offset has
			
 
				+			 * already stepped over any at the beginning).
			
 
				+			 */
			
 
				+			if (offset < si->highest_alloc &&
			
 
				+			    si->lowest_alloc <= last_in_cluster)
			
 
				+				last_in_cluster = si->lowest_alloc - 1;
			
 
				+			si->flags |= SWP_DISCARDING;
			
 
				+			spin_unlock(&swap_lock);
			
 
				+
			
 
				+			if (offset < last_in_cluster)
			
 
				+				discard_swap_cluster(si, offset,
			
 
				+					last_in_cluster - offset + 1);
			
 
				+
			
 
				+			spin_lock(&swap_lock);
			
 
				+			si->lowest_alloc = 0;
			
 
				+			si->flags &= ~SWP_DISCARDING;
			
 
				+
			
 
				+			smp_mb();	/* wake_up_bit advises this */
			
 
				+			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
			
 
				+
			
 
				+		} else if (si->flags & SWP_DISCARDING) {
			
 
				+			/*
			
 
				+			 * Delay using pages allocated by racing tasks
			
 
				+			 * until the whole discard has been issued. We
			
 
				+			 * could defer that delay until swap_writepage,
			
 
				+			 * but it's easier to keep this self-contained.
			
 
				+			 */
			
 
				+			spin_unlock(&swap_lock);
			
 
				+			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
			
 
				+				wait_for_discard, TASK_UNINTERRUPTIBLE);
			
 
				+			spin_lock(&swap_lock);
			
 
				+		} else {
			
 
				+			/*
			
 
				+			 * Note pages allocated by racing tasks while
			
 
				+			 * scan for a free cluster is in progress, so
			
 
				+			 * that its final discard can exclude them.
			
 
				+			 */
			
 
				+			if (offset < si->lowest_alloc)
			
 
				+				si->lowest_alloc = offset;
			
 
				+			if (offset > si->highest_alloc)
			
 
				+				si->highest_alloc = offset;
			
 
				+		}
			
 
				+	}
			
 
				 	return offset;
			
 
				 
			
 
				 scan: