Lines Matching +full:processor +full:- +full:intensive

1 // SPDX-License-Identifier: GPL-2.0
31 #include <linux/backing-dev.h>
46 #include <linux/memory-tiers.h>
178 if ((_folio)->lru.prev != _base) { \
181 prev = lru_to_folio(&(_folio->lru)); \
182 prefetchw(&prev->_field); \
198 WARN_ON_ONCE(rs && task->reclaim_state); in set_task_reclaim_state()
200 /* Check for the nulling of an already-nulled member */ in set_task_reclaim_state()
201 WARN_ON_ONCE(!rs && !task->reclaim_state); in set_task_reclaim_state()
203 task->reclaim_state = rs; in set_task_reclaim_state()
226 return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, in shrinker_info_protected()
240 pn = memcg->nodeinfo[nid]; in expand_one_shrinker_info()
248 return -ENOMEM; in expand_one_shrinker_info()
250 new->nr_deferred = (atomic_long_t *)(new + 1); in expand_one_shrinker_info()
251 new->map = (void *)new->nr_deferred + defer_size; in expand_one_shrinker_info()
254 memset(new->map, (int)0xff, old_map_size); in expand_one_shrinker_info()
255 memset((void *)new->map + old_map_size, 0, map_size - old_map_size); in expand_one_shrinker_info()
257 memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); in expand_one_shrinker_info()
258 memset((void *)new->nr_deferred + old_defer_size, 0, in expand_one_shrinker_info()
259 defer_size - old_defer_size); in expand_one_shrinker_info()
261 rcu_assign_pointer(pn->shrinker_info, new); in expand_one_shrinker_info()
275 pn = memcg->nodeinfo[nid]; in free_shrinker_info()
276 info = rcu_dereference_protected(pn->shrinker_info, true); in free_shrinker_info()
278 rcu_assign_pointer(pn->shrinker_info, NULL); in free_shrinker_info()
296 ret = -ENOMEM; in alloc_shrinker_info()
299 info->nr_deferred = (atomic_long_t *)(info + 1); in alloc_shrinker_info()
300 info->map = (void *)info->nr_deferred + defer_size; in alloc_shrinker_info()
301 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); in alloc_shrinker_info()
357 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); in set_shrinker_bit()
360 set_bit(shrinker_id, info->map); in set_shrinker_bit()
369 int id, ret = -ENOMEM; in prealloc_memcg_shrinker()
372 return -ENOSYS; in prealloc_memcg_shrinker()
386 shrinker->id = id; in prealloc_memcg_shrinker()
395 int id = shrinker->id; in unregister_memcg_shrinker()
410 return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); in xchg_nr_deferred_memcg()
419 return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); in add_nr_deferred_memcg()
439 nr = atomic_long_read(&child_info->nr_deferred[i]); in reparent_shrinker_deferred()
440 atomic_long_add(nr, &parent_info->nr_deferred[i]); in reparent_shrinker_deferred()
448 return sc->target_mem_cgroup; in cgroup_reclaim()
452 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
477 return -ENOSYS; in prealloc_memcg_shrinker()
510 int nid = sc->nid; in xchg_nr_deferred()
512 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) in xchg_nr_deferred()
515 if (sc->memcg && in xchg_nr_deferred()
516 (shrinker->flags & SHRINKER_MEMCG_AWARE)) in xchg_nr_deferred()
518 sc->memcg); in xchg_nr_deferred()
520 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); in xchg_nr_deferred()
527 int nid = sc->nid; in add_nr_deferred()
529 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) in add_nr_deferred()
532 if (sc->memcg && in add_nr_deferred()
533 (shrinker->flags & SHRINKER_MEMCG_AWARE)) in add_nr_deferred()
535 sc->memcg); in add_nr_deferred()
537 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); in add_nr_deferred()
544 if (sc && sc->no_demotion) in can_demote()
558 * For non-memcg reclaim, is there in can_reclaim_anon_pages()
596 * lruvec_lru_size - Returns the number of pages on the given LRU list.
599 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
608 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; in lruvec_lru_size()
629 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { in __prealloc_shrinker()
631 if (err != -ENOSYS) in __prealloc_shrinker()
634 shrinker->flags &= ~SHRINKER_MEMCG_AWARE; in __prealloc_shrinker()
637 size = sizeof(*shrinker->nr_deferred); in __prealloc_shrinker()
638 if (shrinker->flags & SHRINKER_NUMA_AWARE) in __prealloc_shrinker()
641 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); in __prealloc_shrinker()
642 if (!shrinker->nr_deferred) in __prealloc_shrinker()
643 return -ENOMEM; in __prealloc_shrinker()
655 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); in prealloc_shrinker()
657 if (!shrinker->name) in prealloc_shrinker()
658 return -ENOMEM; in prealloc_shrinker()
662 kfree_const(shrinker->name); in prealloc_shrinker()
663 shrinker->name = NULL; in prealloc_shrinker()
678 kfree_const(shrinker->name); in free_prealloced_shrinker()
679 shrinker->name = NULL; in free_prealloced_shrinker()
681 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { in free_prealloced_shrinker()
688 kfree(shrinker->nr_deferred); in free_prealloced_shrinker()
689 shrinker->nr_deferred = NULL; in free_prealloced_shrinker()
695 list_add_tail(&shrinker->list, &shrinker_list); in register_shrinker_prepared()
696 shrinker->flags |= SHRINKER_REGISTERED; in register_shrinker_prepared()
718 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); in register_shrinker()
720 if (!shrinker->name) in register_shrinker()
721 return -ENOMEM; in register_shrinker()
725 kfree_const(shrinker->name); in register_shrinker()
726 shrinker->name = NULL; in register_shrinker()
743 if (!(shrinker->flags & SHRINKER_REGISTERED)) in unregister_shrinker()
747 list_del(&shrinker->list); in unregister_shrinker()
748 shrinker->flags &= ~SHRINKER_REGISTERED; in unregister_shrinker()
749 if (shrinker->flags & SHRINKER_MEMCG_AWARE) in unregister_shrinker()
754 kfree(shrinker->nr_deferred); in unregister_shrinker()
755 shrinker->nr_deferred = NULL; in unregister_shrinker()
760 * synchronize_shrinkers - Wait for all running shrinkers to complete.
785 long batch_size = shrinker->batch ? shrinker->batch in do_shrink_slab()
789 freeable = shrinker->count_objects(shrinker, shrinkctl); in do_shrink_slab()
800 if (shrinker->seeks) { in do_shrink_slab()
803 do_div(delta, shrinker->seeks); in do_shrink_slab()
840 shrinkctl->nr_to_scan = nr_to_scan; in do_shrink_slab()
841 shrinkctl->nr_scanned = nr_to_scan; in do_shrink_slab()
842 ret = shrinker->scan_objects(shrinker, shrinkctl); in do_shrink_slab()
847 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); in do_shrink_slab()
848 total_scan -= shrinkctl->nr_scanned; in do_shrink_slab()
849 scanned += shrinkctl->nr_scanned; in do_shrink_slab()
860 next_deferred = max_t(long, (nr + delta - scanned), 0); in do_shrink_slab()
869 trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); in do_shrink_slab()
891 for_each_set_bit(i, info->map, shrinker_nr_max) { in shrink_slab_memcg()
900 if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { in shrink_slab_memcg()
902 clear_bit(i, info->map); in shrink_slab_memcg()
906 /* Call non-slab shrinkers even though kmem is disabled */ in shrink_slab_memcg()
908 !(shrinker->flags & SHRINKER_NONSLAB)) in shrink_slab_memcg()
913 clear_bit(i, info->map); in shrink_slab_memcg()
956 * shrink_slab - shrink slab caches
970 * @priority is sc->priority, we take the number of objects and >> by priority
1055 * private data at folio->private. in is_page_cache_freeable()
1057 return folio_ref_count(folio) - folio_test_private(folio) == in is_page_cache_freeable()
1063 * -ENOSPC. We need to propagate that into the address_space for a subsequent
1091 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) in skip_throttle_noprogress()
1100 struct zone *zone = pgdat->node_zones + i; in skip_throttle_noprogress()
1117 wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; in reclaim_throttle()
1127 current->flags & (PF_IO_WORKER|PF_KTHREAD)) { in reclaim_throttle()
1135 * parallel reclaimers which is a short-lived event so the timeout is in reclaim_throttle()
1137 * potentially long-lived events so use a longer timeout. This is shaky in reclaim_throttle()
1146 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { in reclaim_throttle()
1147 WRITE_ONCE(pgdat->nr_reclaim_start, in reclaim_throttle()
1177 atomic_dec(&pgdat->nr_writeback_throttled); in reclaim_throttle()
1179 trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), in reclaim_throttle()
1180 jiffies_to_usecs(timeout - ret), in reclaim_throttle()
1197 * This is an inaccurate read as the per-cpu deltas may not in __acct_reclaim_writeback()
1203 nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - in __acct_reclaim_writeback()
1204 READ_ONCE(pgdat->nr_reclaim_start); in __acct_reclaim_writeback()
1207 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); in __acct_reclaim_writeback()
1224 * Calls ->writepage().
1231 * will be non-blocking. To prevent this allocation from being in pageout()
1250 * folio->mapping == NULL while being dirty with clean buffers. in pageout()
1261 if (mapping->a_ops->writepage == NULL) in pageout()
1276 res = mapping->a_ops->writepage(&folio->page, &wbc); in pageout()
1310 spin_lock(&mapping->host->i_lock); in __remove_mapping()
1311 xa_lock_irq(&mapping->i_pages); in __remove_mapping()
1331 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags in __remove_mapping()
1332 * load is not satisfied before that of folio->_refcount. in __remove_mapping()
1354 xa_unlock_irq(&mapping->i_pages); in __remove_mapping()
1359 free_folio = mapping->a_ops->free_folio; in __remove_mapping()
1380 xa_unlock_irq(&mapping->i_pages); in __remove_mapping()
1382 inode_add_lru(mapping->host); in __remove_mapping()
1383 spin_unlock(&mapping->host->i_lock); in __remove_mapping()
1392 xa_unlock_irq(&mapping->i_pages); in __remove_mapping()
1394 spin_unlock(&mapping->host->i_lock); in __remove_mapping()
1399 * remove_mapping() - Attempt to remove a folio from its mapping.
1425 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
1452 referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, in folio_check_references()
1464 if (referenced_ptes == -1) in folio_check_references()
1488 * Activate file-backed executable folios after first usage. in folio_check_references()
1532 if (mapping && mapping->a_ops->is_dirty_writeback) in folio_check_dirty_writeback()
1533 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); in folio_check_dirty_writeback()
1544 allowed_mask = mtc->nmask; in alloc_demote_page()
1554 mtc->nmask = NULL; in alloc_demote_page()
1555 mtc->gfp_mask |= __GFP_THISNODE; in alloc_demote_page()
1560 mtc->gfp_mask &= ~__GFP_THISNODE; in alloc_demote_page()
1561 mtc->nmask = allowed_mask; in alloc_demote_page()
1573 int target_nid = next_demotion_node(pgdat->node_id); in demote_folio_list()
1617 * We can "enter_fs" for swap-cache with only __GFP_IO in may_enter_fs()
1619 * ->flags can be updated non-atomicially (scan_swap_map_slots), in may_enter_fs()
1643 do_demote_pass = can_demote(pgdat->node_id, sc); in shrink_folio_list()
1656 list_del(&folio->lru); in shrink_folio_list()
1666 sc->nr_scanned += nr_pages; in shrink_folio_list()
1671 if (!sc->may_unmap && folio_mapped(folio)) in shrink_folio_list()
1686 stat->nr_dirty += nr_pages; in shrink_folio_list()
1689 stat->nr_unqueued_dirty += nr_pages; in shrink_folio_list()
1698 stat->nr_congested += nr_pages; in shrink_folio_list()
1748 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { in shrink_folio_list()
1749 stat->nr_immediate += nr_pages; in shrink_folio_list()
1755 !may_enter_fs(folio, sc->gfp_mask)) { in shrink_folio_list()
1757 * This is slightly racy - in shrink_folio_list()
1761 * interpreted as the readahead flag - but in shrink_folio_list()
1771 stat->nr_writeback += nr_pages; in shrink_folio_list()
1779 list_add_tail(&folio->lru, folio_list); in shrink_folio_list()
1791 stat->nr_ref_keep += nr_pages; in shrink_folio_list()
1804 list_add(&folio->lru, &demote_folios); in shrink_folio_list()
1816 if (!(sc->gfp_mask & __GFP_IO)) in shrink_folio_list()
1861 sc->nr_scanned -= (nr_pages - 1); in shrink_folio_list()
1878 stat->nr_unmap_fail += nr_pages; in shrink_folio_list()
1881 stat->nr_lazyfree_fail += nr_pages; in shrink_folio_list()
1891 * injecting inefficient single-folio I/O into in shrink_folio_list()
1902 !test_bit(PGDAT_DIRTY, &pgdat->flags))) { in shrink_folio_list()
1918 if (!may_enter_fs(folio, sc->gfp_mask)) in shrink_folio_list()
1920 if (!sc->may_writepage) in shrink_folio_list()
1935 stat->nr_pageout += nr_pages; in shrink_folio_list()
1943 * A synchronous write - probably a ramdisk. Go in shrink_folio_list()
1971 * and mark the folio clean - it can be freed. in shrink_folio_list()
1973 * Rarely, folios can have buffers and no ->mapping. in shrink_folio_list()
1982 if (!filemap_release_folio(folio, sc->gfp_mask)) in shrink_folio_list()
2017 sc->target_mem_cgroup)) in shrink_folio_list()
2035 list_add(&folio->lru, &free_folios); in shrink_folio_list()
2044 sc->nr_scanned -= (nr_pages - 1); in shrink_folio_list()
2056 stat->nr_activate[type] += nr_pages; in shrink_folio_list()
2062 list_add(&folio->lru, &ret_folios); in shrink_folio_list()
2078 pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; in shrink_folio_list()
2110 list_move(&folio->lru, &clean_folios); in reclaim_clean_pages_from_list()
2121 nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, in reclaim_clean_pages_from_list()
2126 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, in reclaim_clean_pages_from_list()
2127 -(long)nr_reclaimed); in reclaim_clean_pages_from_list()
2134 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, in reclaim_clean_pages_from_list()
2136 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, in reclaim_clean_pages_from_list()
2137 -(long)stat.nr_lazyfree_fail); in reclaim_clean_pages_from_list()
2154 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); in update_lru_sizes()
2162 * lruvec->lru_lock is heavily contended. Some of the functions that
2166 * For pagecache intensive workloads, this function is the hottest
2185 struct list_head *src = &lruvec->lists[lru]; in isolate_lru_folios()
2205 if (folio_zonenum(folio) > sc->reclaim_idx) { in isolate_lru_folios()
2222 if (!sc->may_unmap && folio_mapped(folio)) in isolate_lru_folios()
2227 * sure the folio is not being freed elsewhere -- the in isolate_lru_folios()
2243 list_move(&folio->lru, move_to); in isolate_lru_folios()
2266 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, in isolate_lru_folios()
2268 sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); in isolate_lru_folios()
2274 * folio_isolate_lru() - Try to isolate a folio from its LRU list.
2294 * -EBUSY if the folio was not on an LRU list.
2298 int ret = -EBUSY; in folio_isolate_lru()
2344 * won't get blocked by normal direct-reclaimers, forming a circular in too_many_isolated()
2347 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) in too_many_isolated()
2375 list_del(&folio->lru); in move_folios_to_lru()
2377 spin_unlock_irq(&lruvec->lru_lock); in move_folios_to_lru()
2379 spin_lock_irq(&lruvec->lru_lock); in move_folios_to_lru()
2391 * list_add(&folio->lru,) in move_folios_to_lru()
2392 * list_add(&folio->lru,) in move_folios_to_lru()
2400 spin_unlock_irq(&lruvec->lru_lock); in move_folios_to_lru()
2402 spin_lock_irq(&lruvec->lru_lock); in move_folios_to_lru()
2404 list_add(&folio->lru, &folios_to_free); in move_folios_to_lru()
2430 * If a kernel thread (such as nfsd for loop-back mounts) services a backing
2436 return !(current->flags & PF_LOCAL_THROTTLE); in current_may_throttle()
2472 spin_lock_irq(&lruvec->lru_lock); in shrink_inactive_list()
2484 spin_unlock_irq(&lruvec->lru_lock); in shrink_inactive_list()
2491 spin_lock_irq(&lruvec->lru_lock); in shrink_inactive_list()
2494 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); in shrink_inactive_list()
2500 spin_unlock_irq(&lruvec->lru_lock); in shrink_inactive_list()
2532 sc->nr.dirty += stat.nr_dirty; in shrink_inactive_list()
2533 sc->nr.congested += stat.nr_congested; in shrink_inactive_list()
2534 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; in shrink_inactive_list()
2535 sc->nr.writeback += stat.nr_writeback; in shrink_inactive_list()
2536 sc->nr.immediate += stat.nr_immediate; in shrink_inactive_list()
2537 sc->nr.taken += nr_taken; in shrink_inactive_list()
2539 sc->nr.file_taken += nr_taken; in shrink_inactive_list()
2541 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, in shrink_inactive_list()
2542 nr_scanned, nr_reclaimed, &stat, sc->priority, file); in shrink_inactive_list()
2557 * It is safe to rely on the active flag against the non-LRU folios in here
2558 * because nobody will play with that bit on a non-LRU folio.
2560 * The downside is that we have to touch folio->_refcount against each folio.
2561 * But we had to alter folio->flags anyway.
2581 spin_lock_irq(&lruvec->lru_lock); in shrink_active_list()
2592 spin_unlock_irq(&lruvec->lru_lock); in shrink_active_list()
2599 list_del(&folio->lru); in shrink_active_list()
2615 if (folio_referenced(folio, 0, sc->target_mem_cgroup, in shrink_active_list()
2618 * Identify referenced, file-backed active folios and in shrink_active_list()
2622 * are not likely to be evicted by use-once streaming in shrink_active_list()
2628 list_add(&folio->lru, &l_active); in shrink_active_list()
2633 folio_clear_active(folio); /* we are de-activating */ in shrink_active_list()
2635 list_add(&folio->lru, &l_inactive); in shrink_active_list()
2641 spin_lock_irq(&lruvec->lru_lock); in shrink_active_list()
2651 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); in shrink_active_list()
2652 spin_unlock_irq(&lruvec->lru_lock); in shrink_active_list()
2656 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, in shrink_active_list()
2657 nr_deactivate, nr_rotated, sc->priority, file); in shrink_active_list()
2677 list_del(&folio->lru); in reclaim_folio_list()
2702 list_move(&folio->lru, &node_folio_list); in reclaim_pages()
2721 if (sc->may_deactivate & (1 << is_file_lru(lru))) in shrink_list()
2724 sc->skipped_deactivate = 1; in shrink_list()
2736 * to the established workingset on the scan-resistant active list,
2750 * -------------------------------------
2769 gb = (inactive + active) >> (30 - PAGE_SHIFT); in inactive_is_low()
2793 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); in prepare_scan_count()
2796 * Flush the memory cgroup stats, so that we read accurate per-memcg in prepare_scan_count()
2804 spin_lock_irq(&target_lruvec->lru_lock); in prepare_scan_count()
2805 sc->anon_cost = target_lruvec->anon_cost; in prepare_scan_count()
2806 sc->file_cost = target_lruvec->file_cost; in prepare_scan_count()
2807 spin_unlock_irq(&target_lruvec->lru_lock); in prepare_scan_count()
2813 if (!sc->force_deactivate) { in prepare_scan_count()
2823 if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || in prepare_scan_count()
2825 sc->may_deactivate |= DEACTIVATE_ANON; in prepare_scan_count()
2827 sc->may_deactivate &= ~DEACTIVATE_ANON; in prepare_scan_count()
2831 if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || in prepare_scan_count()
2833 sc->may_deactivate |= DEACTIVATE_FILE; in prepare_scan_count()
2835 sc->may_deactivate &= ~DEACTIVATE_FILE; in prepare_scan_count()
2837 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; in prepare_scan_count()
2845 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) in prepare_scan_count()
2846 sc->cache_trim_mode = 1; in prepare_scan_count()
2848 sc->cache_trim_mode = 0; in prepare_scan_count()
2864 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); in prepare_scan_count()
2869 struct zone *zone = &pgdat->node_zones[z]; in prepare_scan_count()
2884 sc->file_is_tiny = in prepare_scan_count()
2886 !(sc->may_deactivate & DEACTIVATE_ANON) && in prepare_scan_count()
2887 anon >> sc->priority; in prepare_scan_count()
2912 if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { in get_scan_count()
2934 if (!sc->priority && swappiness) { in get_scan_count()
2940 * If the system is almost out of file pages, force-scan anon. in get_scan_count()
2942 if (sc->file_is_tiny) { in get_scan_count()
2951 if (sc->cache_trim_mode) { in get_scan_count()
2972 total_cost = sc->anon_cost + sc->file_cost; in get_scan_count()
2973 anon_cost = total_cost + sc->anon_cost; in get_scan_count()
2974 file_cost = total_cost + sc->file_cost; in get_scan_count()
2980 fp = (200 - swappiness) * (total_cost + 1); in get_scan_count()
2993 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); in get_scan_count()
2994 mem_cgroup_protection(sc->target_mem_cgroup, memcg, in get_scan_count()
3004 * becomes extremely binary -- from nothing as we in get_scan_count()
3019 * the best-effort low protection. However, we still in get_scan_count()
3020 * ideally want to honor how well-behaved groups are in in get_scan_count()
3031 if (!sc->memcg_low_reclaim && low > min) { in get_scan_count()
3033 sc->memcg_low_skipped = 1; in get_scan_count()
3041 scan = lruvec_size - lruvec_size * protection / in get_scan_count()
3047 * sc->priority further than desirable. in get_scan_count()
3054 scan >>= sc->priority; in get_scan_count()
3073 * round-off error. in get_scan_count()
3107 return can_demote(pgdat->node_id, sc); in can_age_anon_pages()
3127 unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
3131 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
3132 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
3146 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; in get_lruvec()
3149 if (!lruvec->pgdat) in get_lruvec()
3150 lruvec->pgdat = pgdat; in get_lruvec()
3157 return pgdat ? &pgdat->__lruvec : NULL; in get_lruvec()
3165 if (!can_demote(pgdat->node_id, sc) && in get_swappiness()
3174 return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; in get_nr_gens()
3198 return &memcg->mm_list; in get_mm_list()
3211 VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); in lru_gen_add_mm()
3213 VM_WARN_ON_ONCE(mm->lru_gen.memcg); in lru_gen_add_mm()
3214 mm->lru_gen.memcg = memcg; in lru_gen_add_mm()
3216 spin_lock(&mm_list->lock); in lru_gen_add_mm()
3225 if (lruvec->mm_state.tail == &mm_list->fifo) in lru_gen_add_mm()
3226 lruvec->mm_state.tail = &mm->lru_gen.list; in lru_gen_add_mm()
3229 list_add_tail(&mm->lru_gen.list, &mm_list->fifo); in lru_gen_add_mm()
3231 spin_unlock(&mm_list->lock); in lru_gen_add_mm()
3240 if (list_empty(&mm->lru_gen.list)) in lru_gen_del_mm()
3244 memcg = mm->lru_gen.memcg; in lru_gen_del_mm()
3248 spin_lock(&mm_list->lock); in lru_gen_del_mm()
3257 if (lruvec->mm_state.tail == &mm->lru_gen.list) in lru_gen_del_mm()
3258 lruvec->mm_state.tail = lruvec->mm_state.tail->next; in lru_gen_del_mm()
3261 if (lruvec->mm_state.head != &mm->lru_gen.list) in lru_gen_del_mm()
3264 lruvec->mm_state.head = lruvec->mm_state.head->next; in lru_gen_del_mm()
3266 if (lruvec->mm_state.head == &mm_list->fifo) in lru_gen_del_mm()
3267 WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); in lru_gen_del_mm()
3270 list_del_init(&mm->lru_gen.list); in lru_gen_del_mm()
3272 spin_unlock(&mm_list->lock); in lru_gen_del_mm()
3275 mem_cgroup_put(mm->lru_gen.memcg); in lru_gen_del_mm()
3276 mm->lru_gen.memcg = NULL; in lru_gen_del_mm()
3284 struct task_struct *task = rcu_dereference_protected(mm->owner, true); in lru_gen_migrate_mm()
3286 VM_WARN_ON_ONCE(task->mm != mm); in lru_gen_migrate_mm()
3287 lockdep_assert_held(&task->alloc_lock); in lru_gen_migrate_mm()
3296 if (memcg == mm->lru_gen.memcg) in lru_gen_migrate_mm()
3299 VM_WARN_ON_ONCE(!mm->lru_gen.memcg); in lru_gen_migrate_mm()
3300 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); in lru_gen_migrate_mm()
3314 * To get rid of non-leaf entries that no longer have enough leaf entries, the
3315 * aging uses the double-buffering technique to flip to the other filter each
3316 * time it produces a new generation. For non-leaf entries that have enough
3342 key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); in get_item_key()
3351 filter = lruvec->mm_state.filters[gen]; in reset_bloom_filter()
3359 WRITE_ONCE(lruvec->mm_state.filters[gen], filter); in reset_bloom_filter()
3368 filter = READ_ONCE(lruvec->mm_state.filters[gen]); in update_bloom_filter()
3386 filter = READ_ONCE(lruvec->mm_state.filters[gen]); in test_bloom_filter()
3400 lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); in reset_mm_stats()
3403 hist = lru_hist_from_seq(walk->max_seq); in reset_mm_stats()
3406 WRITE_ONCE(lruvec->mm_state.stats[hist][i], in reset_mm_stats()
3407 lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); in reset_mm_stats()
3408 walk->mm_stats[i] = 0; in reset_mm_stats()
3413 hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); in reset_mm_stats()
3416 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); in reset_mm_stats()
3424 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); in should_skip_mm()
3425 int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); in should_skip_mm()
3427 if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) in should_skip_mm()
3430 clear_bit(key, &mm->lru_gen.bitmap); in should_skip_mm()
3432 for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { in should_skip_mm()
3452 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; in iterate_mm_list()
3461 * mm_state->seq; the iteration is done. in iterate_mm_list()
3465 spin_lock(&mm_list->lock); in iterate_mm_list()
3467 VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); in iterate_mm_list()
3468 VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); in iterate_mm_list()
3469 VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); in iterate_mm_list()
3471 if (walk->max_seq <= mm_state->seq) { in iterate_mm_list()
3477 if (!mm_state->nr_walkers) { in iterate_mm_list()
3478 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); in iterate_mm_list()
3480 mm_state->head = mm_list->fifo.next; in iterate_mm_list()
3484 while (!mm && mm_state->head != &mm_list->fifo) { in iterate_mm_list()
3485 mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); in iterate_mm_list()
3487 mm_state->head = mm_state->head->next; in iterate_mm_list()
3490 if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { in iterate_mm_list()
3491 mm_state->tail = mm_state->head; in iterate_mm_list()
3492 walk->force_scan = true; in iterate_mm_list()
3499 if (mm_state->head == &mm_list->fifo) in iterate_mm_list()
3500 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); in iterate_mm_list()
3503 mm_state->nr_walkers--; in iterate_mm_list()
3505 mm_state->nr_walkers++; in iterate_mm_list()
3507 if (mm_state->nr_walkers) in iterate_mm_list()
3513 spin_unlock(&mm_list->lock); in iterate_mm_list()
3516 reset_bloom_filter(lruvec, walk->max_seq + 1); in iterate_mm_list()
3531 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; in iterate_mm_list_nowalk()
3533 spin_lock(&mm_list->lock); in iterate_mm_list_nowalk()
3535 VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); in iterate_mm_list_nowalk()
3537 if (max_seq > mm_state->seq && !mm_state->nr_walkers) { in iterate_mm_list_nowalk()
3538 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); in iterate_mm_list_nowalk()
3540 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); in iterate_mm_list_nowalk()
3545 spin_unlock(&mm_list->lock); in iterate_mm_list_nowalk()
3555 * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
3570 * 1. The D term may discount the other two terms over time so that long-lived
3582 struct lru_gen_struct *lrugen = &lruvec->lrugen; in read_ctrl_pos()
3583 int hist = lru_hist_from_seq(lrugen->min_seq[type]); in read_ctrl_pos()
3585 pos->refaulted = lrugen->avg_refaulted[type][tier] + in read_ctrl_pos()
3586 atomic_long_read(&lrugen->refaulted[hist][type][tier]); in read_ctrl_pos()
3587 pos->total = lrugen->avg_total[type][tier] + in read_ctrl_pos()
3588 atomic_long_read(&lrugen->evicted[hist][type][tier]); in read_ctrl_pos()
3590 pos->total += lrugen->protected[hist][type][tier - 1]; in read_ctrl_pos()
3591 pos->gain = gain; in read_ctrl_pos()
3597 struct lru_gen_struct *lrugen = &lruvec->lrugen; in reset_ctrl_pos()
3599 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; in reset_ctrl_pos()
3601 lockdep_assert_held(&lruvec->lru_lock); in reset_ctrl_pos()
3612 sum = lrugen->avg_refaulted[type][tier] + in reset_ctrl_pos()
3613 atomic_long_read(&lrugen->refaulted[hist][type][tier]); in reset_ctrl_pos()
3614 WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); in reset_ctrl_pos()
3616 sum = lrugen->avg_total[type][tier] + in reset_ctrl_pos()
3617 atomic_long_read(&lrugen->evicted[hist][type][tier]); in reset_ctrl_pos()
3619 sum += lrugen->protected[hist][type][tier - 1]; in reset_ctrl_pos()
3620 WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); in reset_ctrl_pos()
3624 atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); in reset_ctrl_pos()
3625 atomic_long_set(&lrugen->evicted[hist][type][tier], 0); in reset_ctrl_pos()
3627 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); in reset_ctrl_pos()
3638 return pv->refaulted < MIN_LRU_BATCH || in positive_ctrl_err()
3639 pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= in positive_ctrl_err()
3640 (sp->refaulted + 1) * pv->total * pv->gain; in positive_ctrl_err()
3650 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); in folio_update_gen()
3665 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); in folio_update_gen()
3667 return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; in folio_update_gen()
3674 struct lru_gen_struct *lrugen = &lruvec->lrugen; in folio_inc_gen()
3675 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); in folio_inc_gen()
3676 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); in folio_inc_gen()
3681 new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; in folio_inc_gen()
3693 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); in folio_inc_gen()
3710 walk->batched++; in update_batch_size()
3712 walk->nr_pages[old_gen][type][zone] -= delta; in update_batch_size()
3713 walk->nr_pages[new_gen][type][zone] += delta; in update_batch_size()
3719 struct lru_gen_struct *lrugen = &lruvec->lrugen; in reset_batch_size()
3721 walk->batched = 0; in reset_batch_size()
3725 int delta = walk->nr_pages[gen][type][zone]; in reset_batch_size()
3730 walk->nr_pages[gen][type][zone] = 0; in reset_batch_size()
3731 WRITE_ONCE(lrugen->nr_pages[gen][type][zone], in reset_batch_size()
3732 lrugen->nr_pages[gen][type][zone] + delta); in reset_batch_size()
3743 struct vm_area_struct *vma = args->vma; in should_skip_vma()
3744 struct lru_gen_mm_walk *walk = args->private; in should_skip_vma()
3752 if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) in should_skip_vma()
3755 if (vma == get_gate_vma(vma->vm_mm)) in should_skip_vma()
3759 return !walk->can_swap; in should_skip_vma()
3761 if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) in should_skip_vma()
3764 mapping = vma->vm_file->f_mapping; in should_skip_vma()
3769 return !walk->can_swap; in should_skip_vma()
3772 return !mapping->a_ops->read_folio; in should_skip_vma()
3776 * Some userspace memory allocators map many single-page VMAs. Instead of
3785 VMA_ITERATOR(vmi, args->mm, start); in get_next_vma()
3790 for_each_vma(vmi, args->vma) { in get_next_vma()
3791 if (end && end <= args->vma->vm_start) in get_next_vma()
3794 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) in get_next_vma()
3797 *vm_start = max(start, args->vma->vm_start); in get_next_vma()
3798 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; in get_next_vma()
3810 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); in get_pte_pfn()
3813 return -1; in get_pte_pfn()
3816 return -1; in get_pte_pfn()
3819 return -1; in get_pte_pfn()
3829 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); in get_pmd_pfn()
3832 return -1; in get_pmd_pfn()
3835 return -1; in get_pmd_pfn()
3838 return -1; in get_pmd_pfn()
3850 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) in get_pfn_folio()
3854 if (folio_nid(folio) != pgdat->node_id) in get_pfn_folio()
3884 struct lru_gen_mm_walk *walk = args->private; in walk_pte_range()
3885 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); in walk_pte_range()
3886 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); in walk_pte_range()
3887 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); in walk_pte_range()
3891 ptl = pte_lockptr(args->mm, pmd); in walk_pte_range()
3904 walk->mm_stats[MM_LEAF_TOTAL]++; in walk_pte_range()
3906 pfn = get_pte_pfn(pte[i], args->vma, addr); in walk_pte_range()
3907 if (pfn == -1) in walk_pte_range()
3911 walk->mm_stats[MM_LEAF_OLD]++; in walk_pte_range()
3915 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); in walk_pte_range()
3919 if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) in walk_pte_range()
3923 walk->mm_stats[MM_LEAF_YOUNG]++; in walk_pte_range()
3953 struct lru_gen_mm_walk *walk = args->private; in walk_pmd_range_locked()
3954 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); in walk_pmd_range_locked()
3955 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); in walk_pmd_range_locked()
3956 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); in walk_pmd_range_locked()
3961 if (*start == -1) { in walk_pmd_range_locked()
3966 i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); in walk_pmd_range_locked()
3968 __set_bit(i - 1, bitmap); in walk_pmd_range_locked()
3974 ptl = pmd_lockptr(args->mm, pmd); in walk_pmd_range_locked()
3986 if (pfn == -1) in walk_pmd_range_locked()
3996 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); in walk_pmd_range_locked()
4003 walk->mm_stats[MM_LEAF_YOUNG]++; in walk_pmd_range_locked()
4020 *start = -1; in walk_pmd_range_locked()
4038 unsigned long pos = -1; in walk_pmd_range()
4039 struct lru_gen_mm_walk *walk = args->private; in walk_pmd_range()
4052 vma = args->vma; in walk_pmd_range()
4062 walk->mm_stats[MM_LEAF_TOTAL]++; in walk_pmd_range()
4069 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); in walk_pmd_range()
4071 walk->mm_stats[MM_LEAF_TOTAL]++; in walk_pmd_range()
4074 walk->mm_stats[MM_LEAF_OLD]++; in walk_pmd_range()
4079 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) in walk_pmd_range()
4086 walk->mm_stats[MM_NONLEAF_TOTAL]++; in walk_pmd_range()
4096 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) in walk_pmd_range()
4099 walk->mm_stats[MM_NONLEAF_FOUND]++; in walk_pmd_range()
4104 walk->mm_stats[MM_NONLEAF_ADDED]++; in walk_pmd_range()
4107 update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); in walk_pmd_range()
4110 walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); in walk_pmd_range()
4123 struct lru_gen_mm_walk *walk = args->private; in walk_pud_range()
4140 if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) in walk_pud_range()
4143 if (need_resched() || walk->batched >= MAX_LRU_BATCH) { in walk_pud_range()
4154 if (!end || !args->vma) in walk_pud_range()
4157 walk->next_addr = max(end, args->vma->vm_start); in walk_pud_range()
4159 return -EAGAIN; in walk_pud_range()
4172 walk->next_addr = FIRST_USER_ADDRESS; in walk_mm()
4175 err = -EBUSY; in walk_mm()
4183 err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); in walk_mm()
4190 if (walk->batched) { in walk_mm()
4191 spin_lock_irq(&lruvec->lru_lock); in walk_mm()
4193 spin_unlock_irq(&lruvec->lru_lock); in walk_mm()
4197 } while (err == -EAGAIN); in walk_mm()
4202 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; in set_mm_walk()
4207 walk = &pgdat->mm_walk; in set_mm_walk()
4214 current->reclaim_state->mm_walk = walk; in set_mm_walk()
4221 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; in clear_mm_walk()
4223 VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); in clear_mm_walk()
4224 VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); in clear_mm_walk()
4226 current->reclaim_state->mm_walk = NULL; in clear_mm_walk()
4236 struct lru_gen_struct *lrugen = &lruvec->lrugen; in inc_min_seq()
4237 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); in inc_min_seq()
4244 struct list_head *head = &lrugen->lists[old_gen][type][zone]; in inc_min_seq()
4255 list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); in inc_min_seq()
4257 if (!--remaining) in inc_min_seq()
4263 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); in inc_min_seq()
4272 struct lru_gen_struct *lrugen = &lruvec->lrugen; in try_to_inc_min_seq()
4279 while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { in try_to_inc_min_seq()
4283 if (!list_empty(&lrugen->lists[gen][type][zone])) in try_to_inc_min_seq()
4296 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); in try_to_inc_min_seq()
4300 if (min_seq[type] == lrugen->min_seq[type]) in try_to_inc_min_seq()
4304 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); in try_to_inc_min_seq()
4315 struct lru_gen_struct *lrugen = &lruvec->lrugen; in inc_max_seq()
4317 spin_lock_irq(&lruvec->lru_lock); in inc_max_seq()
4321 for (type = ANON_AND_FILE - 1; type >= 0; type--) { in inc_max_seq()
4328 spin_unlock_irq(&lruvec->lru_lock); in inc_max_seq()
4330 spin_lock_irq(&lruvec->lru_lock); in inc_max_seq()
4340 prev = lru_gen_from_seq(lrugen->max_seq - 1); in inc_max_seq()
4341 next = lru_gen_from_seq(lrugen->max_seq + 1); in inc_max_seq()
4346 long delta = lrugen->nr_pages[prev][type][zone] - in inc_max_seq()
4347 lrugen->nr_pages[next][type][zone]; in inc_max_seq()
4353 __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); in inc_max_seq()
4360 WRITE_ONCE(lrugen->timestamps[next], jiffies); in inc_max_seq()
4362 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); in inc_max_seq()
4364 spin_unlock_irq(&lruvec->lru_lock); in inc_max_seq()
4373 struct lru_gen_struct *lrugen = &lruvec->lrugen; in try_to_inc_max_seq()
4375 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); in try_to_inc_max_seq()
4378 if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { in try_to_inc_max_seq()
4400 walk->lruvec = lruvec; in try_to_inc_max_seq()
4401 walk->max_seq = max_seq; in try_to_inc_max_seq()
4402 walk->can_swap = can_swap; in try_to_inc_max_seq()
4403 walk->force_scan = force_scan; in try_to_inc_max_seq()
4414 if (sc->priority <= DEF_PRIORITY - 2) in try_to_inc_max_seq()
4415 wait_event_killable(lruvec->mm_state.wait, in try_to_inc_max_seq()
4416 max_seq < READ_ONCE(lrugen->max_seq)); in try_to_inc_max_seq()
4418 return max_seq < READ_ONCE(lrugen->max_seq); in try_to_inc_max_seq()
4421 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); in try_to_inc_max_seq()
4425 if (wq_has_sleeper(&lruvec->mm_state.wait)) in try_to_inc_max_seq()
4426 wake_up_all(&lruvec->mm_state.wait); in try_to_inc_max_seq()
4438 struct lru_gen_struct *lrugen = &lruvec->lrugen; in should_run_aging()
4450 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); in should_run_aging()
4461 *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; in should_run_aging()
4497 VM_WARN_ON_ONCE(sc->memcg_low_reclaim); in age_lruvec()
4508 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); in age_lruvec()
4514 if (!nr_to_scan && sc->priority != DEF_PRIORITY) in age_lruvec()
4535 sc->last_reclaimed = sc->nr_reclaimed; in lru_gen_age_node()
4543 if (!sc->memcgs_need_aging) { in lru_gen_age_node()
4544 sc->memcgs_need_aging = true; in lru_gen_age_node()
4562 /* check the order to exclude compaction-induced reclaim */ in lru_gen_age_node()
4563 if (success || !min_ttl || sc->order) in lru_gen_age_node()
4573 .gfp_mask = sc->gfp_mask, in lru_gen_age_node()
4599 struct folio *folio = pfn_folio(pvmw->pfn); in lru_gen_look_around()
4606 lockdep_assert_held(pvmw->ptl); in lru_gen_look_around()
4609 if (spin_is_contended(pvmw->ptl)) in lru_gen_look_around()
4613 walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; in lru_gen_look_around()
4615 start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); in lru_gen_look_around()
4616 end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; in lru_gen_look_around()
4618 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { in lru_gen_look_around()
4619 if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) in lru_gen_look_around()
4621 else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) in lru_gen_look_around()
4622 start = end - MIN_LRU_BATCH * PAGE_SIZE; in lru_gen_look_around()
4624 start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; in lru_gen_look_around()
4625 end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; in lru_gen_look_around()
4629 pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; in lru_gen_look_around()
4637 pfn = get_pte_pfn(pte[i], pvmw->vma, addr); in lru_gen_look_around()
4638 if (pfn == -1) in lru_gen_look_around()
4644 folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); in lru_gen_look_around()
4648 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) in lru_gen_look_around()
4670 update_bloom_filter(lruvec, max_seq, pvmw->pmd); in lru_gen_look_around()
4685 spin_lock_irq(&lruvec->lru_lock); in lru_gen_look_around()
4686 new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); in lru_gen_look_around()
4705 spin_unlock_irq(&lruvec->lru_lock); in lru_gen_look_around()
4723 struct lru_gen_struct *lrugen = &lruvec->lrugen; in sort_folio()
4747 if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { in sort_folio()
4748 list_move(&folio->lru, &lrugen->lists[gen][type][zone]); in sort_folio()
4754 int hist = lru_hist_from_seq(lrugen->min_seq[type]); in sort_folio()
4757 list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); in sort_folio()
4759 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], in sort_folio()
4760 lrugen->protected[hist][type][tier - 1] + delta); in sort_folio()
4769 list_move(&folio->lru, &lrugen->lists[gen][type][zone]); in sort_folio()
4781 if (!sc->may_unmap && folio_mapped(folio)) in isolate_folio()
4785 if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && in isolate_folio()
4802 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); in isolate_folio()
4823 struct lru_gen_struct *lrugen = &lruvec->lrugen; in scan_folios()
4831 gen = lru_gen_from_seq(lrugen->min_seq[type]); in scan_folios()
4833 for (zone = sc->reclaim_idx; zone >= 0; zone--) { in scan_folios()
4836 struct list_head *head = &lrugen->lists[gen][type][zone]; in scan_folios()
4852 list_add(&folio->lru, list); in scan_folios()
4855 list_move(&folio->lru, &moved); in scan_folios()
4859 if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) in scan_folios()
4906 return tier - 1; in get_tier_idx()
4913 int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; in get_type_to_scan()
4932 *tier_idx = tier - 1; in get_type_to_scan()
4943 int tier = -1; in isolate_folios()
4971 tier = -1; in isolate_folios()
4996 spin_lock_irq(&lruvec->lru_lock); in evict_folios()
5005 spin_unlock_irq(&lruvec->lru_lock); in evict_folios()
5011 sc->nr_reclaimed += reclaimed; in evict_folios()
5015 list_del(&folio->lru); in evict_folios()
5032 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, in evict_folios()
5038 list_move(&folio->lru, &clean); in evict_folios()
5039 sc->nr_scanned -= folio_nr_pages(folio); in evict_folios()
5042 spin_lock_irq(&lruvec->lru_lock); in evict_folios()
5046 walk = current->reclaim_state->mm_walk; in evict_folios()
5047 if (walk && walk->batched) in evict_folios()
5056 spin_unlock_irq(&lruvec->lru_lock); in evict_folios()
5089 (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) in get_nr_to_scan()
5097 if (sc->priority == DEF_PRIORITY) in get_nr_to_scan()
5118 if (max_seq - seq > 1) in should_abort_scan()
5121 /* over-swapping can increase allocation latency */ in should_abort_scan()
5122 if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) in should_abort_scan()
5127 sc->nr_reclaimed += MIN_LRU_BATCH; in should_abort_scan()
5133 } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) in should_abort_scan()
5137 if (sc->priority > DEF_PRIORITY - 2) in should_abort_scan()
5146 for (i = 0; i <= sc->reclaim_idx; i++) { in should_abort_scan()
5148 struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; in should_abort_scan()
5158 sc->nr_reclaimed += MIN_LRU_BATCH; in should_abort_scan()
5169 unsigned long reclaimed = sc->nr_reclaimed; in lru_gen_shrink_lruvec()
5183 if (sc->may_swap) in lru_gen_shrink_lruvec()
5209 if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) in lru_gen_shrink_lruvec()
5210 sc->memcgs_need_aging = false; in lru_gen_shrink_lruvec()
5223 struct lru_gen_struct *lrugen = &lruvec->lrugen; in state_is_valid()
5225 if (lrugen->enabled) { in state_is_valid()
5229 if (!list_empty(&lruvec->lists[lru])) in state_is_valid()
5236 if (!list_empty(&lrugen->lists[gen][type][zone])) in state_is_valid()
5252 struct list_head *head = &lruvec->lists[lru]; in fill_evictable()
5261 VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); in fill_evictable()
5267 if (!--remaining) in fill_evictable()
5281 struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; in drain_evictable()
5296 if (!--remaining) in drain_evictable()
5333 spin_lock_irq(&lruvec->lru_lock); in lru_gen_change_state()
5338 lruvec->lrugen.enabled = enabled; in lru_gen_change_state()
5341 spin_unlock_irq(&lruvec->lru_lock); in lru_gen_change_state()
5343 spin_lock_irq(&lruvec->lru_lock); in lru_gen_change_state()
5346 spin_unlock_irq(&lruvec->lru_lock); in lru_gen_change_state()
5367 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5374 return -EINVAL; in store_min_ttl()
5401 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5411 caps = -1; in store_enabled()
5413 return -EINVAL; in store_enabled()
5453 m->private = kvmalloc(PATH_MAX, GFP_KERNEL); in lru_gen_seq_start()
5454 if (!m->private) in lru_gen_seq_start()
5455 return ERR_PTR(-ENOMEM); in lru_gen_seq_start()
5462 if (!nr_to_skip--) in lru_gen_seq_start()
5475 kvfree(m->private); in lru_gen_seq_stop()
5476 m->private = NULL; in lru_gen_seq_stop()
5481 int nid = lruvec_pgdat(v)->node_id; in lru_gen_seq_next()
5505 struct lru_gen_struct *lrugen = &lruvec->lrugen; in lru_gen_seq_show_full()
5515 n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); in lru_gen_seq_show_full()
5516 n[1] = READ_ONCE(lrugen->avg_total[type][tier]); in lru_gen_seq_show_full()
5519 n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); in lru_gen_seq_show_full()
5520 n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); in lru_gen_seq_show_full()
5522 n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); in lru_gen_seq_show_full()
5538 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); in lru_gen_seq_show_full()
5541 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); in lru_gen_seq_show_full()
5549 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5553 bool full = !debugfs_real_fops(m->file)->write; in lru_gen_seq_show()
5555 struct lru_gen_struct *lrugen = &lruvec->lrugen; in lru_gen_seq_show()
5556 int nid = lruvec_pgdat(lruvec)->node_id; in lru_gen_seq_show()
5562 const char *path = memcg ? m->private : ""; in lru_gen_seq_show()
5566 cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); in lru_gen_seq_show()
5576 seq = max_seq - MAX_NR_GENS + 1; in lru_gen_seq_show()
5583 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); in lru_gen_seq_show()
5585 seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); in lru_gen_seq_show()
5592 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); in lru_gen_seq_show()
5623 return -EINVAL; in run_aging()
5625 if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) in run_aging()
5626 return -ERANGE; in run_aging()
5639 return -EINVAL; in run_eviction()
5641 sc->nr_reclaimed = 0; in run_eviction()
5649 if (sc->nr_reclaimed >= nr_to_reclaim) in run_eviction()
5658 return -EINTR; in run_eviction()
5665 int err = -EINVAL; in run_cmd()
5669 return -EINVAL; in run_cmd()
5675 if (memcg && !css_tryget(&memcg->css)) in run_cmd()
5681 return -EINVAL; in run_cmd()
5698 case '-': in run_cmd()
5708 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5716 int err = -EINVAL; in lru_gen_seq_write()
5721 .reclaim_idx = MAX_NR_ZONES - 1, in lru_gen_seq_write()
5727 return -ENOMEM; in lru_gen_seq_write()
5731 return -EFAULT; in lru_gen_seq_write()
5738 err = -ENOMEM; in lru_gen_seq_write()
5752 unsigned int swappiness = -1; in lru_gen_seq_write()
5753 unsigned long opt = -1; in lru_gen_seq_write()
5762 err = -EINVAL; in lru_gen_seq_write()
5809 struct lru_gen_struct *lrugen = &lruvec->lrugen; in lru_gen_init_lruvec()
5811 lrugen->max_seq = MIN_NR_GENS + 1; in lru_gen_init_lruvec()
5812 lrugen->enabled = lru_gen_enabled(); in lru_gen_init_lruvec()
5815 lrugen->timestamps[i] = jiffies; in lru_gen_init_lruvec()
5818 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); in lru_gen_init_lruvec()
5820 lruvec->mm_state.seq = MIN_NR_GENS; in lru_gen_init_lruvec()
5821 init_waitqueue_head(&lruvec->mm_state.wait); in lru_gen_init_lruvec()
5827 INIT_LIST_HEAD(&memcg->mm_list.fifo); in lru_gen_init_memcg()
5828 spin_lock_init(&memcg->mm_list.lock); in lru_gen_init_memcg()
5839 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, in lru_gen_exit_memcg()
5840 sizeof(lruvec->lrugen.nr_pages))); in lru_gen_exit_memcg()
5843 bitmap_free(lruvec->mm_state.filters[i]); in lru_gen_exit_memcg()
5844 lruvec->mm_state.filters[i] = NULL; in lru_gen_exit_memcg()
5884 unsigned long nr_to_reclaim = sc->nr_to_reclaim; in shrink_lruvec()
5910 sc->priority == DEF_PRIORITY); in shrink_lruvec()
5921 nr[lru] -= nr_to_scan; in shrink_lruvec()
5973 nr_scanned = targets[lru] - nr[lru]; in shrink_lruvec()
5974 nr[lru] = targets[lru] * (100 - percentage) / 100; in shrink_lruvec()
5975 nr[lru] -= min(nr[lru], nr_scanned); in shrink_lruvec()
5978 nr_scanned = targets[lru] - nr[lru]; in shrink_lruvec()
5979 nr[lru] = targets[lru] * (100 - percentage) / 100; in shrink_lruvec()
5980 nr[lru] -= min(nr[lru], nr_scanned); in shrink_lruvec()
5983 sc->nr_reclaimed += nr_reclaimed; in shrink_lruvec()
5998 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && in in_reclaim_compaction()
5999 (sc->order > PAGE_ALLOC_COSTLY_ORDER || in in_reclaim_compaction()
6000 sc->priority < DEF_PRIORITY - 2)) in in_reclaim_compaction()
6007 * Reclaim/compaction is used for high-order allocation requests. It reclaims
6008 * order-0 pages before compacting the zone. should_continue_reclaim() returns
6031 * first, by assuming that zero delta of sc->nr_scanned means full LRU in should_continue_reclaim()
6033 * where always a non-zero amount of pages were scanned. in should_continue_reclaim()
6039 for (z = 0; z <= sc->reclaim_idx; z++) { in should_continue_reclaim()
6040 struct zone *zone = &pgdat->node_zones[z]; in should_continue_reclaim()
6044 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { in should_continue_reclaim()
6058 pages_for_compaction = compact_gap(sc->order); in should_continue_reclaim()
6060 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) in should_continue_reclaim()
6068 struct mem_cgroup *target_memcg = sc->target_mem_cgroup; in shrink_node_memcgs()
6078 * This loop can become CPU-bound when target memcgs in shrink_node_memcgs()
6079 * aren't eligible for reclaim - either because they in shrink_node_memcgs()
6100 if (!sc->memcg_low_reclaim) { in shrink_node_memcgs()
6101 sc->memcg_low_skipped = 1; in shrink_node_memcgs()
6107 reclaimed = sc->nr_reclaimed; in shrink_node_memcgs()
6108 scanned = sc->nr_scanned; in shrink_node_memcgs()
6112 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, in shrink_node_memcgs()
6113 sc->priority); in shrink_node_memcgs()
6116 if (!sc->proactive) in shrink_node_memcgs()
6117 vmpressure(sc->gfp_mask, memcg, false, in shrink_node_memcgs()
6118 sc->nr_scanned - scanned, in shrink_node_memcgs()
6119 sc->nr_reclaimed - reclaimed); in shrink_node_memcgs()
6126 struct reclaim_state *reclaim_state = current->reclaim_state; in shrink_node()
6131 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); in shrink_node()
6134 memset(&sc->nr, 0, sizeof(sc->nr)); in shrink_node()
6136 nr_reclaimed = sc->nr_reclaimed; in shrink_node()
6137 nr_scanned = sc->nr_scanned; in shrink_node()
6144 sc->nr_reclaimed += reclaim_state->reclaimed_slab; in shrink_node()
6145 reclaim_state->reclaimed_slab = 0; in shrink_node()
6149 if (!sc->proactive) in shrink_node()
6150 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, in shrink_node()
6151 sc->nr_scanned - nr_scanned, in shrink_node()
6152 sc->nr_reclaimed - nr_reclaimed); in shrink_node()
6154 if (sc->nr_reclaimed - nr_reclaimed) in shrink_node()
6160 * it implies that the long-lived page allocation rate in shrink_node()
6175 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) in shrink_node()
6176 set_bit(PGDAT_WRITEBACK, &pgdat->flags); in shrink_node()
6179 if (sc->nr.unqueued_dirty == sc->nr.file_taken) in shrink_node()
6180 set_bit(PGDAT_DIRTY, &pgdat->flags); in shrink_node()
6189 if (sc->nr.immediate) in shrink_node()
6202 sc->nr.dirty && sc->nr.dirty == sc->nr.congested) in shrink_node()
6203 set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); in shrink_node()
6212 !sc->hibernation_mode && in shrink_node()
6213 test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) in shrink_node()
6216 if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, in shrink_node()
6227 pgdat->kswapd_failures = 0; in shrink_node()
6231 * Returns true if compaction should go ahead for a costly-order request, or
6240 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); in compaction_ready()
6257 watermark = high_wmark_pages(zone) + compact_gap(sc->order); in compaction_ready()
6259 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); in compaction_ready()
6268 if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { in consider_reclaim_throttle()
6271 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; in consider_reclaim_throttle()
6288 if (sc->priority == 1 && !sc->nr_reclaimed) in consider_reclaim_throttle()
6293 * This is the direct reclaim path, for page-allocating processes. We only
6315 orig_mask = sc->gfp_mask; in shrink_zones()
6317 sc->gfp_mask |= __GFP_HIGHMEM; in shrink_zones()
6318 sc->reclaim_idx = gfp_zone(sc->gfp_mask); in shrink_zones()
6322 sc->reclaim_idx, sc->nodemask) { in shrink_zones()
6336 * non-zero order, only frequent costly order in shrink_zones()
6342 sc->order > PAGE_ALLOC_COSTLY_ORDER && in shrink_zones()
6344 sc->compaction_ready = true; in shrink_zones()
6354 if (zone->zone_pgdat == last_pgdat) in shrink_zones()
6364 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, in shrink_zones()
6365 sc->order, sc->gfp_mask, in shrink_zones()
6367 sc->nr_reclaimed += nr_soft_reclaimed; in shrink_zones()
6368 sc->nr_scanned += nr_soft_scanned; in shrink_zones()
6373 first_pgdat = zone->zone_pgdat; in shrink_zones()
6376 if (zone->zone_pgdat == last_pgdat) in shrink_zones()
6378 last_pgdat = zone->zone_pgdat; in shrink_zones()
6379 shrink_node(zone->zone_pgdat, sc); in shrink_zones()
6389 sc->gfp_mask = orig_mask; in shrink_zones()
6402 target_lruvec->refaults[WORKINGSET_ANON] = refaults; in snapshot_refaults()
6404 target_lruvec->refaults[WORKINGSET_FILE] = refaults; in snapshot_refaults()
6414 * high - the zone may be full of dirty or under-writeback pages, which this
6426 int initial_priority = sc->priority; in do_try_to_free_pages()
6434 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); in do_try_to_free_pages()
6437 if (!sc->proactive) in do_try_to_free_pages()
6438 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, in do_try_to_free_pages()
6439 sc->priority); in do_try_to_free_pages()
6440 sc->nr_scanned = 0; in do_try_to_free_pages()
6443 if (sc->nr_reclaimed >= sc->nr_to_reclaim) in do_try_to_free_pages()
6446 if (sc->compaction_ready) in do_try_to_free_pages()
6453 if (sc->priority < DEF_PRIORITY - 2) in do_try_to_free_pages()
6454 sc->may_writepage = 1; in do_try_to_free_pages()
6455 } while (--sc->priority >= 0); in do_try_to_free_pages()
6458 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, in do_try_to_free_pages()
6459 sc->nodemask) { in do_try_to_free_pages()
6460 if (zone->zone_pgdat == last_pgdat) in do_try_to_free_pages()
6462 last_pgdat = zone->zone_pgdat; in do_try_to_free_pages()
6464 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); in do_try_to_free_pages()
6469 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, in do_try_to_free_pages()
6470 zone->zone_pgdat); in do_try_to_free_pages()
6471 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); in do_try_to_free_pages()
6477 if (sc->nr_reclaimed) in do_try_to_free_pages()
6478 return sc->nr_reclaimed; in do_try_to_free_pages()
6481 if (sc->compaction_ready) in do_try_to_free_pages()
6493 if (sc->skipped_deactivate) { in do_try_to_free_pages()
6494 sc->priority = initial_priority; in do_try_to_free_pages()
6495 sc->force_deactivate = 1; in do_try_to_free_pages()
6496 sc->skipped_deactivate = 0; in do_try_to_free_pages()
6501 if (sc->memcg_low_skipped) { in do_try_to_free_pages()
6502 sc->priority = initial_priority; in do_try_to_free_pages()
6503 sc->force_deactivate = 0; in do_try_to_free_pages()
6504 sc->memcg_low_reclaim = 1; in do_try_to_free_pages()
6505 sc->memcg_low_skipped = 0; in do_try_to_free_pages()
6520 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) in allow_direct_reclaim()
6524 zone = &pgdat->node_zones[i]; in allow_direct_reclaim()
6542 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { in allow_direct_reclaim()
6543 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) in allow_direct_reclaim()
6544 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); in allow_direct_reclaim()
6546 wake_up_interruptible(&pgdat->kswapd_wait); in allow_direct_reclaim()
6575 if (current->flags & PF_KTHREAD) in throttle_direct_reclaim()
6605 pgdat = zone->zone_pgdat; in throttle_direct_reclaim()
6627 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, in throttle_direct_reclaim()
6631 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, in throttle_direct_reclaim()
6698 .reclaim_idx = MAX_NR_ZONES - 1, in mem_cgroup_shrink_node()
6702 WARN_ON_ONCE(!current->reclaim_state); in mem_cgroup_shrink_node()
6737 .reclaim_idx = MAX_NR_ZONES - 1, in try_to_free_mem_cgroup_pages()
6798 * Check for watermark boosts top-down as the higher zones in pgdat_watermark_boosted()
6804 for (i = highest_zoneidx; i >= 0; i--) { in pgdat_watermark_boosted()
6805 zone = pgdat->node_zones + i; in pgdat_watermark_boosted()
6809 if (zone->watermark_boost) in pgdat_watermark_boosted()
6823 unsigned long mark = -1; in pgdat_balanced()
6827 * Check watermarks bottom-up as lower zones are more likely to in pgdat_balanced()
6831 zone = pgdat->node_zones + i; in pgdat_balanced()
6846 * need balancing by definition. This can happen if a zone-restricted in pgdat_balanced()
6849 if (mark == -1) in pgdat_balanced()
6860 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); in clear_pgdat_congested()
6861 clear_bit(PGDAT_DIRTY, &pgdat->flags); in clear_pgdat_congested()
6862 clear_bit(PGDAT_WRITEBACK, &pgdat->flags); in clear_pgdat_congested()
6887 if (waitqueue_active(&pgdat->pfmemalloc_wait)) in prepare_kswapd_sleep()
6888 wake_up_all(&pgdat->pfmemalloc_wait); in prepare_kswapd_sleep()
6891 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) in prepare_kswapd_sleep()
6917 sc->nr_to_reclaim = 0; in kswapd_shrink_node()
6918 for (z = 0; z <= sc->reclaim_idx; z++) { in kswapd_shrink_node()
6919 zone = pgdat->node_zones + z; in kswapd_shrink_node()
6923 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); in kswapd_shrink_node()
6934 * high-order allocations. If twice the allocation size has been in kswapd_shrink_node()
6935 * reclaimed then recheck watermarks only at order-0 to prevent in kswapd_shrink_node()
6936 * excessive reclaim. Assume that a process requested a high-order in kswapd_shrink_node()
6939 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) in kswapd_shrink_node()
6940 sc->order = 0; in kswapd_shrink_node()
6942 return sc->nr_scanned >= sc->nr_to_reclaim; in kswapd_shrink_node()
6953 zone = pgdat->node_zones + i; in update_reclaim_active()
6959 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); in update_reclaim_active()
6961 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); in update_reclaim_active()
6984 * kswapd scans the zones in the highmem->normal->dma direction. It skips
7019 zone = pgdat->node_zones + i; in balance_pgdat()
7023 nr_boost_reclaim += zone->watermark_boost; in balance_pgdat()
7024 zone_boosts[i] = zone->watermark_boost; in balance_pgdat()
7042 * purpose -- on 64-bit systems it is expected that in balance_pgdat()
7043 * buffer_heads are stripped during active rotation. On 32-bit in balance_pgdat()
7050 for (i = MAX_NR_ZONES - 1; i >= 0; i--) { in balance_pgdat()
7051 zone = pgdat->node_zones + i; in balance_pgdat()
7065 * re-evaluate if boosting is required when kswapd next wakes. in balance_pgdat()
7082 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) in balance_pgdat()
7087 * intent is to relieve pressure not issue sub-optimal IO in balance_pgdat()
7105 if (sc.priority < DEF_PRIORITY - 2) in balance_pgdat()
7128 if (waitqueue_active(&pgdat->pfmemalloc_wait) && in balance_pgdat()
7130 wake_up_all(&pgdat->pfmemalloc_wait); in balance_pgdat()
7143 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; in balance_pgdat()
7144 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); in balance_pgdat()
7155 sc.priority--; in balance_pgdat()
7159 pgdat->kswapd_failures++; in balance_pgdat()
7173 zone = pgdat->node_zones + i; in balance_pgdat()
7174 spin_lock_irqsave(&zone->lock, flags); in balance_pgdat()
7175 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); in balance_pgdat()
7176 spin_unlock_irqrestore(&zone->lock, flags); in balance_pgdat()
7201 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
7210 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); in kswapd_highest_zoneidx()
7224 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); in kswapd_try_to_sleep()
7256 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, in kswapd_try_to_sleep()
7260 if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) in kswapd_try_to_sleep()
7261 WRITE_ONCE(pgdat->kswapd_order, reclaim_order); in kswapd_try_to_sleep()
7264 finish_wait(&pgdat->kswapd_wait, &wait); in kswapd_try_to_sleep()
7265 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); in kswapd_try_to_sleep()
7274 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); in kswapd_try_to_sleep()
7281 * per-cpu vmstat threshold while kswapd is awake and restore in kswapd_try_to_sleep()
7296 finish_wait(&pgdat->kswapd_wait, &wait); in kswapd_try_to_sleep()
7309 * If there are applications that are active memory-allocators
7315 unsigned int highest_zoneidx = MAX_NR_ZONES - 1; in kswapd()
7318 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); in kswapd()
7335 tsk->flags |= PF_MEMALLOC | PF_KSWAPD; in kswapd()
7338 WRITE_ONCE(pgdat->kswapd_order, 0); in kswapd()
7339 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); in kswapd()
7340 atomic_set(&pgdat->nr_writeback_throttled, 0); in kswapd()
7344 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); in kswapd()
7353 alloc_order = READ_ONCE(pgdat->kswapd_order); in kswapd()
7356 WRITE_ONCE(pgdat->kswapd_order, 0); in kswapd()
7357 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); in kswapd()
7371 * Reclaim begins at the requested order but if a high-order in kswapd()
7373 * order-0. If that happens, kswapd will consider sleeping in kswapd()
7378 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, in kswapd()
7386 tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); in kswapd()
7392 * A zone is low on free memory or too fragmented for high-order memory. If
7410 pgdat = zone->zone_pgdat; in wakeup_kswapd()
7411 curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); in wakeup_kswapd()
7414 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); in wakeup_kswapd()
7416 if (READ_ONCE(pgdat->kswapd_order) < order) in wakeup_kswapd()
7417 WRITE_ONCE(pgdat->kswapd_order, order); in wakeup_kswapd()
7419 if (!waitqueue_active(&pgdat->kswapd_wait)) in wakeup_kswapd()
7423 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || in wakeup_kswapd()
7428 * fragmented for high-order allocations. Wake up kcompactd in wakeup_kswapd()
7438 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, in wakeup_kswapd()
7440 wake_up_interruptible(&pgdat->kswapd_wait); in wakeup_kswapd()
7445 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
7457 .reclaim_idx = MAX_NR_ZONES - 1, in shrink_all_memory()
7483 * This kswapd start function will be called by init and node-hot-add.
7490 if (!pgdat->kswapd) { in kswapd_run()
7491 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); in kswapd_run()
7492 if (IS_ERR(pgdat->kswapd)) { in kswapd_run()
7496 pgdat->kswapd = NULL; in kswapd_run()
7512 kswapd = pgdat->kswapd; in kswapd_stop()
7515 pgdat->kswapd = NULL; in kswapd_stop()
7536 * If non-zero call node_reclaim when the number of free pages falls below
7571 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; in node_unmapped_file_pages()
7599 return nr_pagecache_reclaimable - delta; in node_pagecache_reclaimable()
7623 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, in __node_reclaim()
7635 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || in __node_reclaim()
7636 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { in __node_reclaim()
7643 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); in __node_reclaim()
7670 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && in node_reclaim()
7672 pgdat->min_slab_pages) in node_reclaim()
7678 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) in node_reclaim()
7683 * have associated processors. This will favor the local processor in node_reclaim()
7687 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) in node_reclaim()
7690 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) in node_reclaim()
7694 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); in node_reclaim()
7709 for (i = 0; i < pvec->nr; i++) { in check_move_unevictable_pages()
7710 struct page *page = pvec->pages[i]; in check_move_unevictable_pages()
7721 * check_move_unevictable_folios - Move evictable folios to appropriate zone
7736 for (i = 0; i < fbatch->nr; i++) { in check_move_unevictable_folios()
7737 struct folio *folio = fbatch->folios[i]; in check_move_unevictable_folios()