@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void)
 }
 core_initcall(init_zero_pfn);
 
+#if defined(SPLIT_RSS_COUNTING)
+
+void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+{
+        int i;
+
+        for (i = 0; i < NR_MM_COUNTERS; i++) {
+                if (task->rss_stat.count[i]) {
+                        add_mm_counter(mm, i, task->rss_stat.count[i]);
+                        task->rss_stat.count[i] = 0;
+                }
+        }
+        task->rss_stat.events = 0;
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+        struct task_struct *task = current;
+
+        if (likely(task->mm == mm))
+                task->rss_stat.count[member] += val;
+        else
+                add_mm_counter(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/* sync counter once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH  (64)
+static void check_sync_rss_stat(struct task_struct *task)
+{
+        if (unlikely(task != current))
+                return;
+        if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+                __sync_task_rss_stat(task, task->mm);
+}
+
+unsigned long get_mm_counter(struct mm_struct *mm, int member)
+{
+        long val = 0;
+
+        /*
+         * Don't go through task->mm here; that would require get_task_mm().
+         * The caller must guarantee that mm is still valid.
+         */
+        val = atomic_long_read(&mm->rss_stat.count[member]);
+        /*
+         * The counter is updated asynchronously and may transiently go
+         * negative.  A negative value is never what users expect, so
+         * report it as zero.
+         */
+        if (val < 0)
+                return 0;
+        return (unsigned long)val;
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+        __sync_task_rss_stat(task, mm);
+}
+#else
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+{
+}
+#endif
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none. Usually (but
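
The helpers above manipulate task->rss_stat.count[], task->rss_stat.events and mm->rss_stat.count[] without defining them; those declarations belong to the header side of this change. As a minimal sketch of the shape the new code assumes (illustrative only; the exact definitions, e.g. in include/linux/mm_types.h, may differ):

    enum {
            MM_FILEPAGES,           /* resident file-backed pages */
            MM_ANONPAGES,           /* resident anonymous pages */
            NR_MM_COUNTERS
    };

    /* shared per-mm state: atomics, since every thread of the mm updates it */
    struct mm_rss_stat {
            atomic_long_t count[NR_MM_COUNTERS];
    };

    /* per-thread cache: plain ints, touched only by the owning thread */
    struct task_rss_stat {
            int events;             /* page-fault events since the last sync */
            int count[NR_MM_COUNTERS];
    };
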
@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 {
         int i;
 
+        if (current->mm == mm)
+                sync_mm_rss(current, mm);
         for (i = 0; i < NR_MM_COUNTERS; i++)
                 if (rss[i])
                         add_mm_counter(mm, i, rss[i]);
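
For context, add_mm_rss_vec() lets the PTE-walking paths batch their counter deltas on the stack; the check added here flushes the current thread's cached deltas first, presumably so that cache and batch land in a consistent order. A rough sketch of the calling pattern it serves (the function name below is hypothetical; the real users are the copy/zap paths in this file):

    static void rss_batch_example(struct mm_struct *mm, struct page **pages, int n)
    {
            int rss[NR_MM_COUNTERS] = { 0 };
            int i;

            for (i = 0; i < n; i++)
                    rss[PageAnon(pages[i]) ? MM_ANONPAGES : MM_FILEPAGES]++;
            add_mm_rss_vec(mm, rss);        /* syncs current's cache when mm == current->mm */
    }
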
@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 
         /* Ok, finally just insert the thing.. */
         get_page(page);
-        inc_mm_counter(mm, MM_FILEPAGES);
+        inc_mm_counter_fast(mm, MM_FILEPAGES);
         page_add_file_rmap(page);
         set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
@@ -2175,11 +2250,11 @@ gotten:
         if (likely(pte_same(*page_table, orig_pte))) {
                 if (old_page) {
                         if (!PageAnon(old_page)) {
-                                dec_mm_counter(mm, MM_FILEPAGES);
-                                inc_mm_counter(mm, MM_ANONPAGES);
+                                dec_mm_counter_fast(mm, MM_FILEPAGES);
+                                inc_mm_counter_fast(mm, MM_ANONPAGES);
                         }
                 } else
-                        inc_mm_counter(mm, MM_ANONPAGES);
+                        inc_mm_counter_fast(mm, MM_ANONPAGES);
                 flush_cache_page(vma, address, pte_pfn(orig_pte));
                 entry = mk_pte(new_page, vma->vm_page_prot);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * discarded at swap_free().
          */
 
-        inc_mm_counter(mm, MM_ANONPAGES);
+        inc_mm_counter_fast(mm, MM_ANONPAGES);
         pte = mk_pte(page, vma->vm_page_prot);
         if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
                 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         if (!pte_none(*page_table))
                 goto release;
 
-        inc_mm_counter(mm, MM_ANONPAGES);
+        inc_mm_counter_fast(mm, MM_ANONPAGES);
         page_add_new_anon_rmap(page, vma, address);
 setpte:
         set_pte_at(mm, address, page_table, entry);
@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 if (flags & FAULT_FLAG_WRITE)
                         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                 if (anon) {
-                        inc_mm_counter(mm, MM_ANONPAGES);
+                        inc_mm_counter_fast(mm, MM_ANONPAGES);
                         page_add_new_anon_rmap(page, vma, address);
                 } else {
-                        inc_mm_counter(mm, MM_FILEPAGES);
+                        inc_mm_counter_fast(mm, MM_FILEPAGES);
                         page_add_file_rmap(page);
                         if (flags & FAULT_FLAG_WRITE) {
                                 dirty_page = page;
@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
         count_vm_event(PGFAULT);
 
+        /* do the per-thread counter updates before entering the really critical section. */
+        check_sync_rss_stat(current);
+
         if (unlikely(is_vm_hugetlb_page(vma)))
                 return hugetlb_fault(mm, vma, address, flags);
 
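
Note that sync_mm_rss() is given external linkage above because the per-thread cache must also be folded back when a thread stops faulting through its mm outside the fault path, most obviously at task exit. A sketch of the kind of call site this implies (placement and variable names are assumptions of this note, not part of the hunks above; the series puts such a call on the exit path, e.g. in exit_mm()):

    /*
     * On the exit path, before the mm is released: fold this thread's
     * cached rss deltas back into mm->rss_stat so they are not lost.
     */
    if (mm)
            sync_mm_rss(tsk, mm);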