|
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si)
|
|
|
return err; /* That will often be -EOPNOTSUPP */
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * Swap allocation tells the device that a cluster of swap can now be discarded,
|
|
|
+ * to allow the swap device to optimize its wear-levelling.
|
|
|
+ */
|
|
|
+static void discard_swap_cluster(struct swap_info_struct *si,
|
|
|
+ pgoff_t start_page, pgoff_t nr_pages)
|
|
|
+{
|
|
|
+ struct swap_extent *se = si->curr_swap_extent; /* begin the search at the cached extent */
|
|
|
+ int found_extent = 0; /* counts the extents matched so far */
|
|
|
+
|
|
|
+ while (nr_pages) { /* NOTE(review): ends only when nr_pages hits 0 or a discard fails; presumably every page lies in some extent - confirm */
|
|
|
+ struct list_head *lh;
|
|
|
+
|
|
|
+ if (se->start_page <= start_page &&
|
|
|
+ start_page < se->start_page + se->nr_pages) { /* start_page falls inside this extent */
|
|
|
+ pgoff_t offset = start_page - se->start_page;
|
|
|
+ sector_t start_block = se->start_block + offset;
|
|
|
+ pgoff_t nr_blocks = se->nr_pages - offset; /* pages from start_page to the end of this extent */
|
|
|
+
|
|
|
+ if (nr_blocks > nr_pages)
|
|
|
+ nr_blocks = nr_pages; /* clamp to what is still requested */
|
|
|
+ start_page += nr_blocks; /* advance past the part handled here */
|
|
|
+ nr_pages -= nr_blocks;
|
|
|
+
|
|
|
+ if (!found_extent++) /* true only for the first matching extent */
|
|
|
+ si->curr_swap_extent = se; /* cache it as the next starting point */
|
|
|
+
|
|
|
+ start_block <<= PAGE_SHIFT - 9; /* scale from page units to 512-byte units */
|
|
|
+ nr_blocks <<= PAGE_SHIFT - 9;
|
|
|
+ if (blkdev_issue_discard(si->bdev, start_block,
|
|
|
+ nr_blocks, GFP_NOIO))
|
|
|
+ break; /* give up on the first non-zero return; the result is not reported to the caller */
|
|
|
+ }
|
|
|
+
|
|
|
+ lh = se->list.next;
|
|
|
+ if (lh == &si->extent_list) /* reached the list head, which is not an extent */
|
|
|
+ lh = lh->next; /* step over it so the walk wraps around */
|
|
|
+ se = list_entry(lh, struct swap_extent, list);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static int wait_for_discard(void *word) /* action passed to wait_on_bit() in scan_swap_map(); word is unused */
|
|
|
+{
|
|
|
+ schedule(); /* give up the CPU while SWP_DISCARDING is being waited on */
|
|
|
+ return 0; /* always 0; NOTE(review): presumably tells wait_on_bit() to keep waiting - confirm */
|
|
|
+}
|
|
|
+
|
|
|
#define SWAPFILE_CLUSTER 256
|
|
|
#define LATENCY_LIMIT 256
|
|
|
|
|
|
static inline unsigned long scan_swap_map(struct swap_info_struct *si)
|
|
|
{
|
|
|
unsigned long offset;
|
|
|
- unsigned long last_in_cluster;
|
|
|
+ unsigned long last_in_cluster = 0;
|
|
|
int latency_ration = LATENCY_LIMIT;
|
|
|
+ int found_free_cluster = 0;
|
|
|
|
|
|
/*
|
|
|
* We try to cluster swap pages by allocating them sequentially
|
|
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
|
|
|
si->cluster_nr = SWAPFILE_CLUSTER - 1;
|
|
|
goto checks;
|
|
|
}
|
|
|
+ if (si->flags & SWP_DISCARDABLE) {
|
|
|
+ /*
|
|
|
+ * Start range check on racing allocations, in case
|
|
|
+ * they overlap the cluster we eventually decide on
|
|
|
+ * (we scan without swap_lock to allow preemption).
|
|
|
+ * It's hardly conceivable that cluster_nr could be
|
|
|
+ * wrapped during our scan, but don't depend on it.
|
|
|
+ */
|
|
|
+ if (si->lowest_alloc)
|
|
|
+ goto checks;
|
|
|
+ si->lowest_alloc = si->max;
|
|
|
+ si->highest_alloc = 0;
|
|
|
+ }
|
|
|
spin_unlock(&swap_lock);
|
|
|
|
|
|
offset = si->lowest_bit;
|
|
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
|
|
|
offset -= SWAPFILE_CLUSTER - 1;
|
|
|
si->cluster_next = offset;
|
|
|
si->cluster_nr = SWAPFILE_CLUSTER - 1;
|
|
|
+ found_free_cluster = 1;
|
|
|
goto checks;
|
|
|
}
|
|
|
if (unlikely(--latency_ration < 0)) {
|
|
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
|
|
|
offset = si->lowest_bit;
|
|
|
spin_lock(&swap_lock);
|
|
|
si->cluster_nr = SWAPFILE_CLUSTER - 1;
|
|
|
+ si->lowest_alloc = 0;
|
|
|
}
|
|
|
|
|
|
checks:
|
|
@@ -191,6 +254,60 @@ checks:
|
|
|
si->swap_map[offset] = 1;
|
|
|
si->cluster_next = offset + 1;
|
|
|
si->flags -= SWP_SCANNING;
|
|
|
+
|
|
|
+ if (si->lowest_alloc) {
|
|
|
+ /*
|
|
|
+ * Only set when SWP_DISCARDABLE, and there's a scan
|
|
|
+ * for a free cluster in progress or just completed.
|
|
|
+ */
|
|
|
+ if (found_free_cluster) {
|
|
|
+ /*
|
|
|
+ * To optimize wear-levelling, discard the
|
|
|
+ * old data of the cluster, taking care not to
|
|
|
+ * discard any of its pages that have already
|
|
|
+ * been allocated by racing tasks (offset has
|
|
|
+ * already stepped over any at the beginning).
|
|
|
+ */
|
|
|
+ if (offset < si->highest_alloc &&
|
|
|
+ si->lowest_alloc <= last_in_cluster)
|
|
|
+ last_in_cluster = si->lowest_alloc - 1;
|
|
|
+ si->flags |= SWP_DISCARDING;
|
|
|
+ spin_unlock(&swap_lock);
|
|
|
+
|
|
|
+ if (offset < last_in_cluster)
|
|
|
+ discard_swap_cluster(si, offset,
|
|
|
+ last_in_cluster - offset + 1);
|
|
|
+
|
|
|
+ spin_lock(&swap_lock);
|
|
|
+ si->lowest_alloc = 0;
|
|
|
+ si->flags &= ~SWP_DISCARDING;
|
|
|
+
|
|
|
+ smp_mb(); /* wake_up_bit advises this */
|
|
|
+ wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
|
|
|
+
|
|
|
+ } else if (si->flags & SWP_DISCARDING) {
|
|
|
+ /*
|
|
|
+ * Delay using pages allocated by racing tasks
|
|
|
+ * until the whole discard has been issued. We
|
|
|
+ * could defer that delay until swap_writepage,
|
|
|
+ * but it's easier to keep this self-contained.
|
|
|
+ */
|
|
|
+ spin_unlock(&swap_lock);
|
|
|
+ wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
|
|
|
+ wait_for_discard, TASK_UNINTERRUPTIBLE);
|
|
|
+ spin_lock(&swap_lock);
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * Note pages allocated by racing tasks while
|
|
|
+ * scan for a free cluster is in progress, so
|
|
|
+ * that its final discard can exclude them.
|
|
|
+ */
|
|
|
+ if (offset < si->lowest_alloc)
|
|
|
+ si->lowest_alloc = offset;
|
|
|
+ if (offset > si->highest_alloc)
|
|
|
+ si->highest_alloc = offset;
|
|
|
+ }
|
|
|
+ }
|
|
|
return offset;
|
|
|
|
|
|
scan:
|