kernel: Backport MGLRU patch from 6.4
This fixes a bug where the reclaim path could occasionally have long tail latency.

Signed-off-by: Kazuki Hashimoto <kazukih0205@gmail.com>
This commit is contained in:
		 Kazuki Hashimoto
					Kazuki Hashimoto
				
			
				
					committed by
					
						 Christian Marangi
						Christian Marangi
					
				
			
			
				
	
			
			
			 Christian Marangi
						Christian Marangi
					
				
			
						parent
						
							b28b8ed1f4
						
					
				
				
					commit
					ae8db3941c
				
			| @@ -0,0 +1,266 @@ | |||||||
|  | From 087ed25eaf5a78a678508e893f80addab9b1c103 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Kalesh Singh <kaleshsingh@google.com> | ||||||
|  | Date: Thu, 13 Apr 2023 14:43:26 -0700 | ||||||
|  | Subject: [PATCH] mm: Multi-gen LRU: remove wait_event_killable() | ||||||
|  |  | ||||||
|  | Android 14 and later default to MGLRU [1] and field telemetry showed | ||||||
|  | occasional long tail latency (>100ms) in the reclaim path. | ||||||
|  |  | ||||||
|  | Tracing revealed priority inversion in the reclaim path.  In | ||||||
|  | try_to_inc_max_seq(), when high priority tasks were blocked on | ||||||
|  | wait_event_killable(), the preemption of the low priority task to call | ||||||
|  | wake_up_all() caused those high priority tasks to wait longer than | ||||||
|  | necessary.  In general, this problem is not different from others of its | ||||||
|  | kind, e.g., one caused by mutex_lock().  However, it is specific to MGLRU | ||||||
|  | because it introduced the new wait queue lruvec->mm_state.wait. | ||||||
|  |  | ||||||
|  | The purpose of this new wait queue is to avoid the thundering herd | ||||||
|  | problem.  If many direct reclaimers rush into try_to_inc_max_seq(), only | ||||||
|  | one can succeed, i.e., the one to wake up the rest, and the rest who | ||||||
|  | failed might cause premature OOM kills if they do not wait.  So far there | ||||||
|  | is no evidence supporting this scenario, based on how often the wait has | ||||||
|  | been hit.  And this begs the question how useful the wait queue is in | ||||||
|  | practice. | ||||||
|  |  | ||||||
|  | Based on Minchan's recommendation, which is in line with his commit | ||||||
|  | 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the | ||||||
|  | rest of the MGLRU code which also uses trylock when possible, remove the | ||||||
|  | wait queue. | ||||||
|  |  | ||||||
|  | [1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf | ||||||
|  |  | ||||||
|  | Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com | ||||||
|  | Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") | ||||||
|  | Change-Id: I911f3968fd1adb25171279cc5b6f48ccb7efc8de | ||||||
|  | Signed-off-by: Kalesh Singh <kaleshsingh@google.com> | ||||||
|  | Suggested-by: Minchan Kim <minchan@kernel.org> | ||||||
|  | Reported-by: Wei Wang <wvw@google.com> | ||||||
|  | Acked-by: Yu Zhao <yuzhao@google.com> | ||||||
|  | Cc: Minchan Kim <minchan@kernel.org> | ||||||
|  | Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org> | ||||||
|  | Cc: Oleksandr Natalenko <oleksandr@natalenko.name> | ||||||
|  | Cc: Suleiman Souhlal <suleiman@google.com> | ||||||
|  | Cc: Suren Baghdasaryan <surenb@google.com> | ||||||
|  | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | ||||||
|  | --- | ||||||
|  |  include/linux/mmzone.h |   8 +-- | ||||||
|  |  mm/vmscan.c            | 111 +++++++++++++++-------------------------- | ||||||
|  |  2 files changed, 42 insertions(+), 77 deletions(-) | ||||||
|  |  | ||||||
|  | --- a/include/linux/mmzone.h | ||||||
|  | +++ b/include/linux/mmzone.h | ||||||
|  | @@ -452,18 +452,14 @@ enum { | ||||||
|  |  struct lru_gen_mm_state { | ||||||
|  |  	/* set to max_seq after each iteration */ | ||||||
|  |  	unsigned long seq; | ||||||
|  | -	/* where the current iteration continues (inclusive) */ | ||||||
|  | +	/* where the current iteration continues after */ | ||||||
|  |  	struct list_head *head; | ||||||
|  | -	/* where the last iteration ended (exclusive) */ | ||||||
|  | +	/* where the last iteration ended before */ | ||||||
|  |  	struct list_head *tail; | ||||||
|  | -	/* to wait for the last page table walker to finish */ | ||||||
|  | -	struct wait_queue_head wait; | ||||||
|  |  	/* Bloom filters flip after each iteration */ | ||||||
|  |  	unsigned long *filters[NR_BLOOM_FILTERS]; | ||||||
|  |  	/* the mm stats for debugging */ | ||||||
|  |  	unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; | ||||||
|  | -	/* the number of concurrent page table walkers */ | ||||||
|  | -	int nr_walkers; | ||||||
|  |  }; | ||||||
|  |   | ||||||
|  |  struct lru_gen_mm_walk { | ||||||
|  | --- a/mm/vmscan.c | ||||||
|  | +++ b/mm/vmscan.c | ||||||
|  | @@ -2999,18 +2999,13 @@ void lru_gen_del_mm(struct mm_struct *mm | ||||||
|  |  		if (!lruvec) | ||||||
|  |  			continue; | ||||||
|  |   | ||||||
|  | -		/* where the last iteration ended (exclusive) */ | ||||||
|  | +		/* where the current iteration continues after */ | ||||||
|  | +		if (lruvec->mm_state.head == &mm->lru_gen.list) | ||||||
|  | +			lruvec->mm_state.head = lruvec->mm_state.head->prev; | ||||||
|  | + | ||||||
|  | +		/* where the last iteration ended before */ | ||||||
|  |  		if (lruvec->mm_state.tail == &mm->lru_gen.list) | ||||||
|  |  			lruvec->mm_state.tail = lruvec->mm_state.tail->next; | ||||||
|  | - | ||||||
|  | -		/* where the current iteration continues (inclusive) */ | ||||||
|  | -		if (lruvec->mm_state.head != &mm->lru_gen.list) | ||||||
|  | -			continue; | ||||||
|  | - | ||||||
|  | -		lruvec->mm_state.head = lruvec->mm_state.head->next; | ||||||
|  | -		/* the deletion ends the current iteration */ | ||||||
|  | -		if (lruvec->mm_state.head == &mm_list->fifo) | ||||||
|  | -			WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); | ||||||
|  |  	} | ||||||
|  |   | ||||||
|  |  	list_del_init(&mm->lru_gen.list); | ||||||
|  | @@ -3194,68 +3189,54 @@ static bool iterate_mm_list(struct lruve | ||||||
|  |  			    struct mm_struct **iter) | ||||||
|  |  { | ||||||
|  |  	bool first = false; | ||||||
|  | -	bool last = true; | ||||||
|  | +	bool last = false; | ||||||
|  |  	struct mm_struct *mm = NULL; | ||||||
|  |  	struct mem_cgroup *memcg = lruvec_memcg(lruvec); | ||||||
|  |  	struct lru_gen_mm_list *mm_list = get_mm_list(memcg); | ||||||
|  |  	struct lru_gen_mm_state *mm_state = &lruvec->mm_state; | ||||||
|  |   | ||||||
|  |  	/* | ||||||
|  | -	 * There are four interesting cases for this page table walker: | ||||||
|  | -	 * 1. It tries to start a new iteration of mm_list with a stale max_seq; | ||||||
|  | -	 *    there is nothing left to do. | ||||||
|  | -	 * 2. It's the first of the current generation, and it needs to reset | ||||||
|  | -	 *    the Bloom filter for the next generation. | ||||||
|  | -	 * 3. It reaches the end of mm_list, and it needs to increment | ||||||
|  | -	 *    mm_state->seq; the iteration is done. | ||||||
|  | -	 * 4. It's the last of the current generation, and it needs to reset the | ||||||
|  | -	 *    mm stats counters for the next generation. | ||||||
|  | +	 * mm_state->seq is incremented after each iteration of mm_list. There | ||||||
|  | +	 * are three interesting cases for this page table walker: | ||||||
|  | +	 * 1. It tries to start a new iteration with a stale max_seq: there is | ||||||
|  | +	 *    nothing left to do. | ||||||
|  | +	 * 2. It started the next iteration: it needs to reset the Bloom filter | ||||||
|  | +	 *    so that a fresh set of PTE tables can be recorded. | ||||||
|  | +	 * 3. It ended the current iteration: it needs to reset the mm stats | ||||||
|  | +	 *    counters and tell its caller to increment max_seq. | ||||||
|  |  	 */ | ||||||
|  |  	spin_lock(&mm_list->lock); | ||||||
|  |   | ||||||
|  |  	VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); | ||||||
|  | -	VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); | ||||||
|  | -	VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); | ||||||
|  |   | ||||||
|  | -	if (walk->max_seq <= mm_state->seq) { | ||||||
|  | -		if (!*iter) | ||||||
|  | -			last = false; | ||||||
|  | +	if (walk->max_seq <= mm_state->seq) | ||||||
|  |  		goto done; | ||||||
|  | -	} | ||||||
|  |   | ||||||
|  | -	if (!mm_state->nr_walkers) { | ||||||
|  | -		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); | ||||||
|  | +	if (!mm_state->head) | ||||||
|  | +		mm_state->head = &mm_list->fifo; | ||||||
|  |   | ||||||
|  | -		mm_state->head = mm_list->fifo.next; | ||||||
|  | +	if (mm_state->head == &mm_list->fifo) | ||||||
|  |  		first = true; | ||||||
|  | -	} | ||||||
|  | - | ||||||
|  | -	while (!mm && mm_state->head != &mm_list->fifo) { | ||||||
|  | -		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); | ||||||
|  |   | ||||||
|  | +	do { | ||||||
|  |  		mm_state->head = mm_state->head->next; | ||||||
|  | +		if (mm_state->head == &mm_list->fifo) { | ||||||
|  | +			WRITE_ONCE(mm_state->seq, mm_state->seq + 1); | ||||||
|  | +			last = true; | ||||||
|  | +			break; | ||||||
|  | +		} | ||||||
|  |   | ||||||
|  |  		/* force scan for those added after the last iteration */ | ||||||
|  | -		if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { | ||||||
|  | -			mm_state->tail = mm_state->head; | ||||||
|  | +		if (!mm_state->tail || mm_state->tail == mm_state->head) { | ||||||
|  | +			mm_state->tail = mm_state->head->next; | ||||||
|  |  			walk->force_scan = true; | ||||||
|  |  		} | ||||||
|  |   | ||||||
|  | +		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); | ||||||
|  |  		if (should_skip_mm(mm, walk)) | ||||||
|  |  			mm = NULL; | ||||||
|  | -	} | ||||||
|  | - | ||||||
|  | -	if (mm_state->head == &mm_list->fifo) | ||||||
|  | -		WRITE_ONCE(mm_state->seq, mm_state->seq + 1); | ||||||
|  | +	} while (!mm); | ||||||
|  |  done: | ||||||
|  | -	if (*iter && !mm) | ||||||
|  | -		mm_state->nr_walkers--; | ||||||
|  | -	if (!*iter && mm) | ||||||
|  | -		mm_state->nr_walkers++; | ||||||
|  | - | ||||||
|  | -	if (mm_state->nr_walkers) | ||||||
|  | -		last = false; | ||||||
|  | - | ||||||
|  |  	if (*iter || last) | ||||||
|  |  		reset_mm_stats(lruvec, walk, last); | ||||||
|  |   | ||||||
|  | @@ -3283,9 +3264,9 @@ static bool iterate_mm_list_nowalk(struc | ||||||
|  |   | ||||||
|  |  	VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); | ||||||
|  |   | ||||||
|  | -	if (max_seq > mm_state->seq && !mm_state->nr_walkers) { | ||||||
|  | -		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); | ||||||
|  | - | ||||||
|  | +	if (max_seq > mm_state->seq) { | ||||||
|  | +		mm_state->head = NULL; | ||||||
|  | +		mm_state->tail = NULL; | ||||||
|  |  		WRITE_ONCE(mm_state->seq, mm_state->seq + 1); | ||||||
|  |  		reset_mm_stats(lruvec, NULL, true); | ||||||
|  |  		success = true; | ||||||
|  | @@ -3894,10 +3875,6 @@ restart: | ||||||
|  |   | ||||||
|  |  		walk_pmd_range(&val, addr, next, args); | ||||||
|  |   | ||||||
|  | -		/* a racy check to curtail the waiting time */ | ||||||
|  | -		if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) | ||||||
|  | -			return 1; | ||||||
|  | - | ||||||
|  |  		if (need_resched() || walk->batched >= MAX_LRU_BATCH) { | ||||||
|  |  			end = (addr | ~PUD_MASK) + 1; | ||||||
|  |  			goto done; | ||||||
|  | @@ -3930,8 +3907,14 @@ static void walk_mm(struct lruvec *lruve | ||||||
|  |  	walk->next_addr = FIRST_USER_ADDRESS; | ||||||
|  |   | ||||||
|  |  	do { | ||||||
|  | +		DEFINE_MAX_SEQ(lruvec); | ||||||
|  | + | ||||||
|  |  		err = -EBUSY; | ||||||
|  |   | ||||||
|  | +		/* another thread might have called inc_max_seq() */ | ||||||
|  | +		if (walk->max_seq != max_seq) | ||||||
|  | +			break; | ||||||
|  | + | ||||||
|  |  		/* page_update_gen() requires stable page_memcg() */ | ||||||
|  |  		if (!mem_cgroup_trylock_pages(memcg)) | ||||||
|  |  			break; | ||||||
|  | @@ -4164,25 +4147,12 @@ static bool try_to_inc_max_seq(struct lr | ||||||
|  |  		success = iterate_mm_list(lruvec, walk, &mm); | ||||||
|  |  		if (mm) | ||||||
|  |  			walk_mm(lruvec, mm, walk); | ||||||
|  | - | ||||||
|  | -		cond_resched(); | ||||||
|  |  	} while (mm); | ||||||
|  |  done: | ||||||
|  | -	if (!success) { | ||||||
|  | -		if (sc->priority <= DEF_PRIORITY - 2) | ||||||
|  | -			wait_event_killable(lruvec->mm_state.wait, | ||||||
|  | -					    max_seq < READ_ONCE(lrugen->max_seq)); | ||||||
|  | -		return false; | ||||||
|  | -	} | ||||||
|  | +	if (success) | ||||||
|  | +		inc_max_seq(lruvec, can_swap, force_scan); | ||||||
|  |   | ||||||
|  | -	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); | ||||||
|  | - | ||||||
|  | -	inc_max_seq(lruvec, can_swap, force_scan); | ||||||
|  | -	/* either this sees any waiters or they will see updated max_seq */ | ||||||
|  | -	if (wq_has_sleeper(&lruvec->mm_state.wait)) | ||||||
|  | -		wake_up_all(&lruvec->mm_state.wait); | ||||||
|  | - | ||||||
|  | -	return true; | ||||||
|  | +	return success; | ||||||
|  |  } | ||||||
|  |   | ||||||
|  |  static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) | ||||||
|  | @@ -5746,7 +5716,6 @@ void lru_gen_init_lruvec(struct lruvec * | ||||||
|  |  		INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]); | ||||||
|  |   | ||||||
|  |  	lruvec->mm_state.seq = MIN_NR_GENS; | ||||||
|  | -	init_waitqueue_head(&lruvec->mm_state.wait); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  |  #ifdef CONFIG_MEMCG | ||||||
| @@ -0,0 +1,280 @@ | |||||||
|  | From 418038c22452df38cde519cc8c662bb15139764a Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Kalesh Singh <kaleshsingh@google.com> | ||||||
|  | Date: Thu, 13 Apr 2023 14:43:26 -0700 | ||||||
|  | Subject: [PATCH 19/19] mm: Multi-gen LRU: remove wait_event_killable() | ||||||
|  |  | ||||||
|  | Android 14 and later default to MGLRU [1] and field telemetry showed | ||||||
|  | occasional long tail latency (>100ms) in the reclaim path. | ||||||
|  |  | ||||||
|  | Tracing revealed priority inversion in the reclaim path.  In | ||||||
|  | try_to_inc_max_seq(), when high priority tasks were blocked on | ||||||
|  | wait_event_killable(), the preemption of the low priority task to call | ||||||
|  | wake_up_all() caused those high priority tasks to wait longer than | ||||||
|  | necessary.  In general, this problem is not different from others of its | ||||||
|  | kind, e.g., one caused by mutex_lock().  However, it is specific to MGLRU | ||||||
|  | because it introduced the new wait queue lruvec->mm_state.wait. | ||||||
|  |  | ||||||
|  | The purpose of this new wait queue is to avoid the thundering herd | ||||||
|  | problem.  If many direct reclaimers rush into try_to_inc_max_seq(), only | ||||||
|  | one can succeed, i.e., the one to wake up the rest, and the rest who | ||||||
|  | failed might cause premature OOM kills if they do not wait.  So far there | ||||||
|  | is no evidence supporting this scenario, based on how often the wait has | ||||||
|  | been hit.  And this begs the question how useful the wait queue is in | ||||||
|  | practice. | ||||||
|  |  | ||||||
|  | Based on Minchan's recommendation, which is in line with his commit | ||||||
|  | 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the | ||||||
|  | rest of the MGLRU code which also uses trylock when possible, remove the | ||||||
|  | wait queue. | ||||||
|  |  | ||||||
|  | [1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf | ||||||
|  |  | ||||||
|  | Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com | ||||||
|  | Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") | ||||||
|  | Signed-off-by: Kalesh Singh <kaleshsingh@google.com> | ||||||
|  | Suggested-by: Minchan Kim <minchan@kernel.org> | ||||||
|  | Reported-by: Wei Wang <wvw@google.com> | ||||||
|  | Acked-by: Yu Zhao <yuzhao@google.com> | ||||||
|  | Cc: Minchan Kim <minchan@kernel.org> | ||||||
|  | Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org> | ||||||
|  | Cc: Oleksandr Natalenko <oleksandr@natalenko.name> | ||||||
|  | Cc: Suleiman Souhlal <suleiman@google.com> | ||||||
|  | Cc: Suren Baghdasaryan <surenb@google.com> | ||||||
|  | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | ||||||
|  | --- | ||||||
|  |  include/linux/mmzone.h |   8 +-- | ||||||
|  |  mm/vmscan.c            | 112 +++++++++++++++-------------------------- | ||||||
|  |  2 files changed, 42 insertions(+), 78 deletions(-) | ||||||
|  |  | ||||||
|  | diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h | ||||||
|  | index 403c7461e7a70..d62a5accf1be4 100644 | ||||||
|  | --- a/include/linux/mmzone.h | ||||||
|  | +++ b/include/linux/mmzone.h | ||||||
|  | @@ -453,18 +453,14 @@ enum { | ||||||
|  |  struct lru_gen_mm_state { | ||||||
|  |  	/* set to max_seq after each iteration */ | ||||||
|  |  	unsigned long seq; | ||||||
|  | -	/* where the current iteration continues (inclusive) */ | ||||||
|  | +	/* where the current iteration continues after */ | ||||||
|  |  	struct list_head *head; | ||||||
|  | -	/* where the last iteration ended (exclusive) */ | ||||||
|  | +	/* where the last iteration ended before */ | ||||||
|  |  	struct list_head *tail; | ||||||
|  | -	/* to wait for the last page table walker to finish */ | ||||||
|  | -	struct wait_queue_head wait; | ||||||
|  |  	/* Bloom filters flip after each iteration */ | ||||||
|  |  	unsigned long *filters[NR_BLOOM_FILTERS]; | ||||||
|  |  	/* the mm stats for debugging */ | ||||||
|  |  	unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; | ||||||
|  | -	/* the number of concurrent page table walkers */ | ||||||
|  | -	int nr_walkers; | ||||||
|  |  }; | ||||||
|  |   | ||||||
|  |  struct lru_gen_mm_walk { | ||||||
|  | diff --git a/mm/vmscan.c b/mm/vmscan.c | ||||||
|  | index f6ce7a1fd78a3..851758303dbf4 100644 | ||||||
|  | --- a/mm/vmscan.c | ||||||
|  | +++ b/mm/vmscan.c | ||||||
|  | @@ -3371,18 +3371,13 @@ void lru_gen_del_mm(struct mm_struct *mm) | ||||||
|  |  		if (!lruvec) | ||||||
|  |  			continue; | ||||||
|  |   | ||||||
|  | -		/* where the last iteration ended (exclusive) */ | ||||||
|  | +		/* where the current iteration continues after */ | ||||||
|  | +		if (lruvec->mm_state.head == &mm->lru_gen.list) | ||||||
|  | +			lruvec->mm_state.head = lruvec->mm_state.head->prev; | ||||||
|  | + | ||||||
|  | +		/* where the last iteration ended before */ | ||||||
|  |  		if (lruvec->mm_state.tail == &mm->lru_gen.list) | ||||||
|  |  			lruvec->mm_state.tail = lruvec->mm_state.tail->next; | ||||||
|  | - | ||||||
|  | -		/* where the current iteration continues (inclusive) */ | ||||||
|  | -		if (lruvec->mm_state.head != &mm->lru_gen.list) | ||||||
|  | -			continue; | ||||||
|  | - | ||||||
|  | -		lruvec->mm_state.head = lruvec->mm_state.head->next; | ||||||
|  | -		/* the deletion ends the current iteration */ | ||||||
|  | -		if (lruvec->mm_state.head == &mm_list->fifo) | ||||||
|  | -			WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); | ||||||
|  |  	} | ||||||
|  |   | ||||||
|  |  	list_del_init(&mm->lru_gen.list); | ||||||
|  | @@ -3478,68 +3473,54 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, | ||||||
|  |  			    struct mm_struct **iter) | ||||||
|  |  { | ||||||
|  |  	bool first = false; | ||||||
|  | -	bool last = true; | ||||||
|  | +	bool last = false; | ||||||
|  |  	struct mm_struct *mm = NULL; | ||||||
|  |  	struct mem_cgroup *memcg = lruvec_memcg(lruvec); | ||||||
|  |  	struct lru_gen_mm_list *mm_list = get_mm_list(memcg); | ||||||
|  |  	struct lru_gen_mm_state *mm_state = &lruvec->mm_state; | ||||||
|  |   | ||||||
|  |  	/* | ||||||
|  | -	 * There are four interesting cases for this page table walker: | ||||||
|  | -	 * 1. It tries to start a new iteration of mm_list with a stale max_seq; | ||||||
|  | -	 *    there is nothing left to do. | ||||||
|  | -	 * 2. It's the first of the current generation, and it needs to reset | ||||||
|  | -	 *    the Bloom filter for the next generation. | ||||||
|  | -	 * 3. It reaches the end of mm_list, and it needs to increment | ||||||
|  | -	 *    mm_state->seq; the iteration is done. | ||||||
|  | -	 * 4. It's the last of the current generation, and it needs to reset the | ||||||
|  | -	 *    mm stats counters for the next generation. | ||||||
|  | +	 * mm_state->seq is incremented after each iteration of mm_list. There | ||||||
|  | +	 * are three interesting cases for this page table walker: | ||||||
|  | +	 * 1. It tries to start a new iteration with a stale max_seq: there is | ||||||
|  | +	 *    nothing left to do. | ||||||
|  | +	 * 2. It started the next iteration: it needs to reset the Bloom filter | ||||||
|  | +	 *    so that a fresh set of PTE tables can be recorded. | ||||||
|  | +	 * 3. It ended the current iteration: it needs to reset the mm stats | ||||||
|  | +	 *    counters and tell its caller to increment max_seq. | ||||||
|  |  	 */ | ||||||
|  |  	spin_lock(&mm_list->lock); | ||||||
|  |   | ||||||
|  |  	VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); | ||||||
|  | -	VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); | ||||||
|  | -	VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); | ||||||
|  |   | ||||||
|  | -	if (walk->max_seq <= mm_state->seq) { | ||||||
|  | -		if (!*iter) | ||||||
|  | -			last = false; | ||||||
|  | +	if (walk->max_seq <= mm_state->seq) | ||||||
|  |  		goto done; | ||||||
|  | -	} | ||||||
|  |   | ||||||
|  | -	if (!mm_state->nr_walkers) { | ||||||
|  | -		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); | ||||||
|  | +	if (!mm_state->head) | ||||||
|  | +		mm_state->head = &mm_list->fifo; | ||||||
|  |   | ||||||
|  | -		mm_state->head = mm_list->fifo.next; | ||||||
|  | +	if (mm_state->head == &mm_list->fifo) | ||||||
|  |  		first = true; | ||||||
|  | -	} | ||||||
|  | - | ||||||
|  | -	while (!mm && mm_state->head != &mm_list->fifo) { | ||||||
|  | -		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); | ||||||
|  |   | ||||||
|  | +	do { | ||||||
|  |  		mm_state->head = mm_state->head->next; | ||||||
|  | +		if (mm_state->head == &mm_list->fifo) { | ||||||
|  | +			WRITE_ONCE(mm_state->seq, mm_state->seq + 1); | ||||||
|  | +			last = true; | ||||||
|  | +			break; | ||||||
|  | +		} | ||||||
|  |   | ||||||
|  |  		/* force scan for those added after the last iteration */ | ||||||
|  | -		if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { | ||||||
|  | -			mm_state->tail = mm_state->head; | ||||||
|  | +		if (!mm_state->tail || mm_state->tail == mm_state->head) { | ||||||
|  | +			mm_state->tail = mm_state->head->next; | ||||||
|  |  			walk->force_scan = true; | ||||||
|  |  		} | ||||||
|  |   | ||||||
|  | +		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); | ||||||
|  |  		if (should_skip_mm(mm, walk)) | ||||||
|  |  			mm = NULL; | ||||||
|  | -	} | ||||||
|  | - | ||||||
|  | -	if (mm_state->head == &mm_list->fifo) | ||||||
|  | -		WRITE_ONCE(mm_state->seq, mm_state->seq + 1); | ||||||
|  | +	} while (!mm); | ||||||
|  |  done: | ||||||
|  | -	if (*iter && !mm) | ||||||
|  | -		mm_state->nr_walkers--; | ||||||
|  | -	if (!*iter && mm) | ||||||
|  | -		mm_state->nr_walkers++; | ||||||
|  | - | ||||||
|  | -	if (mm_state->nr_walkers) | ||||||
|  | -		last = false; | ||||||
|  | - | ||||||
|  |  	if (*iter || last) | ||||||
|  |  		reset_mm_stats(lruvec, walk, last); | ||||||
|  |   | ||||||
|  | @@ -3567,9 +3548,9 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) | ||||||
|  |   | ||||||
|  |  	VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); | ||||||
|  |   | ||||||
|  | -	if (max_seq > mm_state->seq && !mm_state->nr_walkers) { | ||||||
|  | -		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); | ||||||
|  | - | ||||||
|  | +	if (max_seq > mm_state->seq) { | ||||||
|  | +		mm_state->head = NULL; | ||||||
|  | +		mm_state->tail = NULL; | ||||||
|  |  		WRITE_ONCE(mm_state->seq, mm_state->seq + 1); | ||||||
|  |  		reset_mm_stats(lruvec, NULL, true); | ||||||
|  |  		success = true; | ||||||
|  | @@ -4172,10 +4153,6 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, | ||||||
|  |   | ||||||
|  |  		walk_pmd_range(&val, addr, next, args); | ||||||
|  |   | ||||||
|  | -		/* a racy check to curtail the waiting time */ | ||||||
|  | -		if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) | ||||||
|  | -			return 1; | ||||||
|  | - | ||||||
|  |  		if (need_resched() || walk->batched >= MAX_LRU_BATCH) { | ||||||
|  |  			end = (addr | ~PUD_MASK) + 1; | ||||||
|  |  			goto done; | ||||||
|  | @@ -4208,8 +4185,14 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ | ||||||
|  |  	walk->next_addr = FIRST_USER_ADDRESS; | ||||||
|  |   | ||||||
|  |  	do { | ||||||
|  | +		DEFINE_MAX_SEQ(lruvec); | ||||||
|  | + | ||||||
|  |  		err = -EBUSY; | ||||||
|  |   | ||||||
|  | +		/* another thread might have called inc_max_seq() */ | ||||||
|  | +		if (walk->max_seq != max_seq) | ||||||
|  | +			break; | ||||||
|  | + | ||||||
|  |  		/* folio_update_gen() requires stable folio_memcg() */ | ||||||
|  |  		if (!mem_cgroup_trylock_pages(memcg)) | ||||||
|  |  			break; | ||||||
|  | @@ -4442,25 +4425,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, | ||||||
|  |  		success = iterate_mm_list(lruvec, walk, &mm); | ||||||
|  |  		if (mm) | ||||||
|  |  			walk_mm(lruvec, mm, walk); | ||||||
|  | - | ||||||
|  | -		cond_resched(); | ||||||
|  |  	} while (mm); | ||||||
|  |  done: | ||||||
|  | -	if (!success) { | ||||||
|  | -		if (sc->priority <= DEF_PRIORITY - 2) | ||||||
|  | -			wait_event_killable(lruvec->mm_state.wait, | ||||||
|  | -					    max_seq < READ_ONCE(lrugen->max_seq)); | ||||||
|  | -		return false; | ||||||
|  | -	} | ||||||
|  | +	if (success) | ||||||
|  | +		inc_max_seq(lruvec, can_swap, force_scan); | ||||||
|  |   | ||||||
|  | -	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); | ||||||
|  | - | ||||||
|  | -	inc_max_seq(lruvec, can_swap, force_scan); | ||||||
|  | -	/* either this sees any waiters or they will see updated max_seq */ | ||||||
|  | -	if (wq_has_sleeper(&lruvec->mm_state.wait)) | ||||||
|  | -		wake_up_all(&lruvec->mm_state.wait); | ||||||
|  | - | ||||||
|  | -	return true; | ||||||
|  | +	return success; | ||||||
|  |  } | ||||||
|  |   | ||||||
|  |  /****************************************************************************** | ||||||
|  | @@ -6105,7 +6075,6 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) | ||||||
|  |  		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); | ||||||
|  |   | ||||||
|  |  	lruvec->mm_state.seq = MIN_NR_GENS; | ||||||
|  | -	init_waitqueue_head(&lruvec->mm_state.wait); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  |  #ifdef CONFIG_MEMCG | ||||||
|  | @@ -6138,7 +6107,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) | ||||||
|  |  	for_each_node(nid) { | ||||||
|  |  		struct lruvec *lruvec = get_lruvec(memcg, nid); | ||||||
|  |   | ||||||
|  | -		VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); | ||||||
|  |  		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, | ||||||
|  |  					   sizeof(lruvec->lrugen.nr_pages))); | ||||||
|  |   | ||||||
|  | --  | ||||||
|  | 2.40.1 | ||||||
|  |  | ||||||
		Reference in New Issue
	
	Block a user