- Nov 2024
-
elixir.bootlin.com elixir.bootlin.com
-
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; do { struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); return addr == end; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } #endif
seems like a check to see if pages can be grabbed. A quick skim maybe hints possible checks if huge pages can be grabbed?
-
#ifdef CONFIG_ARCH_HAS_HUGEPD static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { unsigned long __boundary = (addr + sz) & ~(sz-1); return (__boundary - 1 < end - 1) ? __boundary : end; } static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long pte_end; struct page *page; struct folio *folio; pte_t pte; int refs; pte_end = (addr + sz) & ~(sz-1); if (pte_end < end) end = pte_end; pte = huge_ptep_get(ptep); if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { pte_t *ptep; unsigned long sz = 1UL << hugepd_shift(hugepd); unsigned long next; ptep = hugepte_offset(hugepd, addr, pdshift); do { next = hugepte_addr_end(addr, end, sz); if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) return 0; } while (ptep++, addr = next, addr != end); return 1; } #else static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_HUGEPD */ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, pages, nr); } page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pud(orig, pudp, addr, end, flags, pages, nr); } page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *page; struct folio *folio; if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = pmdp_get_lockless(pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { /* See gup_pte_range() */ if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { /* * architecture have different format for hugetlbfs * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, PUD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); if (p4d_none(p4d)) return 0; BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } static void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; pgdp = pgd_offset(current->mm, addr); do { pgd_t pgd = READ_ONCE(*pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, flags, pages, nr)) return; } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else static inline void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { }
policy use functions for gup_huge pte policy code function above (not right above, gotta scroll probably to find it)
-
static int internal_get_user_pages_fast(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; int locked = 0; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | FOLL_FAST_ONLY | FOLL_NOFAULT | FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(¤t->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so * returning -errno is not an option */ if (nr_pinned) return nr_pinned; return ret; } return ret + nr_pinned; } /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * * If the architecture does not support this function, simply return with no * pages pinned. * * Careful, careful! COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. * * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); }
fast gup functions
-
/** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. */ if (WARN_ON(IS_ERR_VALUE(npages))) return; sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } }
gup unpin function, not actual logic
-
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } }
unpin logic but for dirty pages
-
if ((flags & FOLL_DUMP) && (vma_is_anonymous(vma) || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return NULL;
explained in comments
-
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * Fast-gup relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, fast-gup needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with fast-gup, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because fast-gup may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); if (!ptep) return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; struct folio *folio; /* * Always fallback to ordinary GUP on PROT_NONE-mapped pages: * pte_access_permitted() better should reject these pages * either way: otherwise, GUP-fast might succeed in * cases where ordinary GUP would fail due to VMA access * permissions. */ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { if (unlikely(flags & FOLL_LONGTERM)) goto pte_unmap; pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio(page, 1, flags); if (!folio) goto pte_unmap; if (unlikely(folio_is_secretmem(folio))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: if (pgmap) put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
non concurrent fast gup approach that checks for pinned page and unmaps pte or clears it
-
#ifdef CONFIG_HAVE_FAST_GUP /* * Used in the GUP-fast path to determine whether a pin is permitted for a * specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). We * therefore try to avoid the most egregious case of a long-term mapping doing * so. * * This function cannot be as thorough as that one as the VMA is not available * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. */ static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) { struct address_space *mapping; unsigned long mapping_flags; /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the one we disallow. */ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) return true; /* The folio is pinned, so we can safely access folio fields. */ if (WARN_ON_ONCE(folio_test_slab(folio))) return false; /* hugetlb mappings do not require dirty-tracking. */ if (folio_test_hugetlb(folio)) return true; /* * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods * cannot proceed, which means no actions performed under RCU can * proceed either. * * inodes and thus their mappings are freed under RCU, which means the * mapping cannot be freed beneath us and thus we can safely dereference * it. */ lockdep_assert_irqs_disabled(); /* * However, there may be operations which _alter_ the mapping, so ensure * we read it once and only once. */ mapping = READ_ONCE(folio->mapping); /* * The mapping may have been truncated, in any case we cannot determine * if this mapping is safe - fall back to slow path to determine how to * proceed. */ if (!mapping) return false; /* Anonymous folios pose no problem. */ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; if (mapping_flags) return mapping_flags & PAGE_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an * address_space object. The only remaining whitelisted file system is * shmem. */ return shmem_mapping(mapping); }
policy logic. avoids locks unlike get user pages unlocked/locked which seems risky so its not supposed to be used on concurrent gup logic
-
long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); }
policy logic.
-
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; ClearPageReferenced(page); if (flags & FOLL_PIN) unpin_user_page(page); else put_page(page); } }
policy use function that undoes mapping
-
#ifdef CONFIG_MIGRATION /* * Returns the number of collected pages. Return value is always >= 0. */ static unsigned long collect_longterm_unpinnable_pages( struct list_head *movable_page_list, unsigned long nr_pages, struct page **pages) { unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); if (folio == prev_folio) continue; prev_folio = folio; if (folio_is_longterm_pinnable(folio)) continue; collected++; if (folio_is_device_coherent(folio)) continue; if (folio_test_hugetlb(folio)) { isolate_hugetlb(folio, movable_page_list); continue; } if (!folio_test_lru(folio) && drain_allow) { lru_add_drain_all(); drain_allow = false; } if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } return collected; }
-
#ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { struct page *page; int locked = 0; int ret; ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */
part of policy use code likely
-
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; end = start + len; for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ if (!locked) { locked = 1; mmap_read_lock(mm); vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) vma = find_vma_intersection(mm, vma->vm_end, end); if (!vma) break; /* * Set [nstart; nend) to intersection of desired address * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; /* * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ } break; } nend = nstart + ret * PAGE_SIZE; ret = 0; } if (locked) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ }
policy use function that populates pages like the func before this.
-
long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int local_locked = 1; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * Rightly or wrongly, the VM_LOCKONFAULT case has never used * faultin_page() to break COW, so it has no work to do here. */ if (vma->vm_flags & VM_LOCKONFAULT) return nr_pages; gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; /* * We want mlock to succeed for regions that have any permissions * other than PROT_NONE. */ if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; if (locked) gup_flags |= FOLL_UNLOCKABLE; /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; }
policy use code.
-
long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); }
policy logic
-
static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int flags) { long ret, pages_done; bool must_unlock = false; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } else mmap_assert_locked(mm); if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has * carelessly failed to specify FOLL_GET), so keep doing that, but only * for FOLL_GET, not for the newer FOLL_PIN. * * FOLL_PIN always expects pages to be non-null, but no need to assert * that here, as any failures will be obvious enough. */ if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ if (!*locked) { BUG_ON(ret < 0); BUG_ON(ret >= nr_pages); } if (ret > 0) { nr_pages -= ret; pages_done += ret; if (!nr_pages) break; } if (*locked) { /* * VM_FAULT_RETRY didn't trigger or it was a * FOLL_NOWAIT. */ if (!pages_done) pages_done = ret; break; } /* * VM_FAULT_RETRY triggered, so seek to the faulting offset. * For the prefault case (!pages) we only update counts. */ if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; /* The lock was temporarily dropped, so we must unlock later */ must_unlock = true; retry: /* * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted * by fatal signals of even common signals, depending on * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) { BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; } *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); goto retry; } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) pages_done = ret; break; } nr_pages--; pages_done++; if (!nr_pages) break; if (likely(pages)) pages++; start += PAGE_SIZE; } if (must_unlock && *locked) { /* * We either temporarily dropped the lock, or the caller * requested that we both acquire and drop the lock. Either way, * we must now unlock, and notify the caller of that state. */ mmap_read_unlock(mm); *locked = 0; } return pages_done; }
same as gup but sets/unsets mmap_lock
-
if (!(flags & FOLL_INTERRUPTIBLE)) return false;
fatal fault signal handler
-
int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { struct vm_area_struct *vma; vm_fault_t ret; address = untagged_addr_remote(mm, address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = gup_vma_lookup(mm, address); if (!vma) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current)) return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; } return 0; }
resolves user page fault. policy logic
-
static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; start = untagged_addr_remote(mm, start); VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); do { struct page *page; unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { /* * MADV_POPULATE_(READ|WRITE) wants to handle VMA * lookups+error reporting differently. */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &page : NULL); if (ret) goto out; ctx.page_mask = 0; goto next_page; } if (!vma) { ret = -EFAULT; goto out; } ret = check_vma_flags(vma, gup_flags); if (ret) goto out; } retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, &foll_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: goto retry; case -EBUSY: case -EAGAIN: ret = 0; fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: goto out; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding * struct page. If the caller expects **pages to be * filled in, bail out now, because that can't be done * for this page. */ if (pages) { ret = PTR_ERR(page); goto out; } } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } next_page: page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; if (pages) { struct page *subpage; unsigned int j; /* * This must be a large folio (and doesn't need to * be the whole folio; it can be part of it), do * the refcount work for all the subpages too. * * NOTE: here the page may not be the head page * e.g. when start addr is not thp-size aligned. * try_grab_folio() should have taken care of tail * pages. */ if (page_increm > 1) { struct folio *folio; /* * Since we already hold refcount on the * large folio, this should never fail. */ folio = try_grab_folio(page, page_increm - 1, foll_flags); if (WARN_ON_ONCE(!folio)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ gup_put_folio(page_folio(page), 1, foll_flags); ret = -EFAULT; goto out; } } for (j = 0; j < page_increm; j++) { subpage = nth_page(page, j); pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); } } i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return i ? i : ret; }
Literally the actual policy logic of gup. Most important piece of code right here for gup
-
#ifdef CONFIG_STACK_GROWSUP return vma_lookup(mm, addr); #else static volatile unsigned long next_warn; struct vm_area_struct *vma; unsigned long now, next; vma = find_vma(mm, addr); if (!vma || (addr >= vma->vm_start)) return vma; /* Only warn for half-way relevant accesses */ if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; if (vma->vm_start - addr > 65536) return NULL; /* Let's not warn more than once an hour.. */ now = jiffies; next = next_warn; if (next && time_before(now, next)) return NULL; next_warn = now + 60*60*HZ; /* Let people know things may have changed. */ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n", current->comm, task_pid_nr(current), vma->vm_start, vma->vm_end, addr); dump_stack(); return NULL;
helper func to lookup vma(virtual mem area) that warns per hour about half way relevant acc and changes in stack
-
static bool writable_file_mapping_allowed(struct vm_area_struct *vma, unsigned long gup_flags) { /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the case we disallow. */ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true; /* * If the VMA does not require dirty tracking then no problematic write * can occur either. */ return !vma_needs_dirty_tracking(vma); }
Def policy code. checks if we can write to a map
-
if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; if (*flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ if (*flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist */ fault_flags |= FAULT_FLAG_TRIED; } if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * With FAULT_FLAG_RETRY_NOWAIT we'll never release the * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); *locked = 0; /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of * what has happened - we've just fully completed a page * fault, with the mmap lock released. Use -EAGAIN to show * that we want to take the mmap lock _again_. */ return -EAGAIN; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; }
Seems it's just setting flags for page faults based on flags param
-
/* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; if (address > TASK_SIZE) pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return -EFAULT; pud = pud_offset(p4d, address); if (pud_none(*pud)) return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; entry = ptep_get(pte); if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; *page = vm_normal_page(*vma, address, entry); if (!*page) { if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) goto unmap;
Most of these seem like sanity checks right up until line 897 i.e, 'if(!page)'* after which we seem to unmap the page.
-
static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; /* * Call hugetlb_follow_page_mask for hugetlb vmas as it will use * special hugetlb page table walking code. This eliminates the * need to check for hugetlb entries in the general walking code. */ if (is_vm_hugetlb_page(vma)) return hugetlb_follow_page_mask(vma, address, flags, &ctx->page_mask); pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); return follow_p4d_mask(vma, address, pgd, flags, ctx); }
places mask after following page into pte
-
struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { struct follow_page_context ctx = { NULL }; struct page *page; if (vma_is_secretmem(vma)) return NULL; if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) return NULL; /* * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect * to fail on PROT_NONE-mapped pages. */ page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return page; }
finds page
-
if (flags & FOLL_SPLIT_PMD) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); ctx->page_mask = HPAGE_PMD_NR - 1; return page;
we're finding the page again but storing page mask in ctx
-
if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); }
branch prediction to check if pmd is there and if it's big
-
if (pmd_none(pmdval)) return no_page_table(vma, flags); if (!pmd_present(pmdval)) return no_page_table(vma, flags); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; }
checks if pmd is there. im assuming it's page middle dir.
-
/* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); /* * We only care about anon pages in can_follow_write_pte() and don't * have to worry about pte_devmap() because they are never anon. */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { page = NULL; goto out; } if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap * reference. */ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); if (*pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ ret = try_grab_page(page, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); }
finds page in pte. Judging by the complexity of the logic this is most likely policy code because we're literally getting user page
-
if (pte_write(pte)) return true; /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ if (!page || !PageAnon(page) || !PageAnonExclusive(page)) return false; /* ... and a write-fault isn't required for other reasons. */ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) return false; return !userfaultfd_pte_wp(vma, pte);
flag checks to see if u can write to a pte
-
if (flags & FOLL_TOUCH) { pte_t orig_entry = ptep_get(pte); pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); }
uses pte to mark dirty pages and finds pfn in pte
-
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; if (!make_dirty) { unpin_user_pages(pages, npages); return; } sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key * cases: * * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called * page_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes * on to call TestClearPageDirty(), and write the page * back. * * 2) This code sees the page as clean, so it calls * set_page_dirty(). The page stays dirty, despite being * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ if (!folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } }
unpins and dirties page
-
static inline struct folio *gup_folio_next(struct page **list, unsigned long npages, unsigned long i, unsigned int *ntails) { struct folio *folio = page_folio(list[i]); unsigned int nr; for (nr = i + 1; nr < npages; nr++) { if (page_folio(list[nr]) != folio) break; } *ntails = nr - i; return folio; }
gets folio of next page along with reference to end of folio
-
static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { struct page *next = nth_page(start, i); struct folio *folio = page_folio(next); unsigned int nr = 1; if (folio_test_large(folio)) nr = min_t(unsigned int, npages - i, folio_nr_pages(folio) - folio_page_idx(folio, next)); *ntails = nr; return folio; }
gets the folio of the next page from start to 'i' range. also gets the tail folio/reference
-
folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
function for adding reference
-
void unpin_user_page(struct page *page) { sanity_check_pinned_pages(&page, 1); gup_put_folio(page_folio(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page);
actual policy use logic
-
struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_inc(folio);
checks for code that is involved in policy but is not the actual logic
-
else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return 0; /* * Similar to try_grab_folio(): be sure to *also* * increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); }
Logic that actually tries to grab the folio. Also policy use code and not actual policy
-
if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs);
Definitely a vital and straightforward policy use section of gup that simples places a reference on the folio
-
if (flags & FOLL_PIN) { if (is_zero_folio(folio)) return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; }
Checks if the folio is zero/large
-
if (folio_test_large(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1))
maintaining reference counts. Part of policy logic most likely
-
if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); return NULL;
checks for longterm folio pins.
-
if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) return NULL; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL;
Time saving predictions(unlikely) and single time warning func(WARN_ON_ONCE) for flags. Not actual policy logic so low confidence.
-
if (unlikely(page_folio(page) != folio)) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); goto retry;
Uses prediction to check if a folio still points to the page. This is part of the function that tries to retrieve the folio to confirm that it is associated with a page.
-
folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; if (unlikely(!folio_ref_try_add(folio, refs))) return NULL;
These increment the reference count for the folio since you're returning a reference of the folio. Important function so important internal logic subsequently
-
if (is_zero_page(page) || !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio))
Sanity checks for pinned pages wouldn't classify as policy logic but common sense pre-checks for the actual policy. But I think it's worth tagging this to gain a sense of what is not policy code
-
if (is_zero_page(page)) return page_folio(page); folio = try_get_folio(page, refs); if (!folio) return NULL;
Just trying to check for zero pages and trying to retrieve folios. Unlikely policy logic
-
if (flags & FOLL_GET) return try_get_folio(page, refs);
Policy logic that determines and tries to retrieve folios based on given flags.
-
if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(¤t->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT;
checking for overflow in page alloc likely
-
if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } }
part of policy code
-
if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags);
policy decision to get locked page!
-
vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
vm_flags
-
if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; }
flag
-
if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; }
a lot of VM fault flags that are checked when calling a function that handles page faults
-
if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags);
-
if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
flag pol to check if map is unlocked (for pos remap?)
-
if (page_increm > nr_pages) page_increm = nr_pages;
next page logic
-
*/ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; }
page populate flag for sure
-
if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ if (is_vm_hugetlb_page(vma)) return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could * set a breakpoint in a read-only mapping of an * executable, without corrupting the file (yet only * when that file had been opened for writing!). * Anon pages in shared mappings are surprising: now * just reject it. */ if (!is_cow_mapping(vm_flags)) return -EFAULT; } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * Is there actually any vma we can reach here which does not * have VM_MAYREAD set? */ if (!(vm_flags & VM_MAYREAD)) return -EFAULT; }
cow mapping, sdw stack seem like imp policies which dont seem like regular flags
-
- Oct 2024
-
elixir.bootlin.com elixir.bootlin.com
-
if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true;
most likely part of a policy code that includes flag based decision making
Tags
Annotators
URL
-
-
elixir.bootlin.com elixir.bootlin.com
-
matching the ownership tracking * granularities between memcg and writeback in either direction
Algorithmic policy: the memcg and writeback ownerships have to match. This mechanism implemented via the mem_cgroup_track_foreign_dirty_slowpath and mem_cgroup_flush_foreign functions.
-
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
Maximum loops when reclaiming memory for a soft threshold from a cgroup. These value configurations are set once via #define.
-
if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) return ep; if (parent_effective > siblings_protected && parent_usage > siblings_protected && usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; unclaimed *= usage - protected; unclaimed /= parent_usage - siblings_protected; ep += unclaimed; } return ep;
Heuristic algorithm for calculations of the effective protection of an individual cgroup. Used to check if the cgroup tree is using memory consumption in normal range by referencing the cgroup, its parent, its sibling, and rest of tree
-
atomic_add(nr_bytes, &old->nr_charged_bytes);
Configuration policy for the tradeoff for flushing per-memcg from old object stock. Currently choosing to fully limit enforcement accuracy while having no CPU contention by writing to a centralized value.
-
#define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024
These #define statements are for the target number of events before the system triggers action for threshold events and soft limit events, respectively. These are memory pressure events. THRESHOLDS_EVENTS_TARGET is set to 128, meaning threshold events are processed more frequently (finer grain) than soft limit events, which are only triggered every 1024 events.
-
if (total >= (excess >> 2) || (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) break;
For the soft limit reclaim of cgroup memory, we continuously loop through the cgroup and find a victim process to shrink. If we didn't find a victim, we will try again up to a certain threshold (#define MEM_CGROUP_MAX_RECLAIM_LOOPS) or quit if we found that we've reclaimed more than the exponentially decreasing original excess: algo left shifts the excess every failed loop.
-
#define FLUSH_TIME (2UL*HZ)
This is a configuration policy that sets the interval for periodic flushing of memory statistics. The flushing is performed every 2 seconds (2UL * HZ), allowing the system to balance between the cost of frequent stat updates and keeping the statistics reasonably fresh.
-
#define MEMCG_DELAY_PRECISION_SHIFT 20 #define MEMCG_DELAY_SCALING_SHIFT 14
These two #define statements are part of the configuration policy for controlling the increasing penalty applied to memory overage. They ensure the system doesn’t penalize too harshly in minor cases but still exponentially increases delay for excessive usage.
-
/* * Don't sleep if the amount of jiffies this memcg owes us is so low * that it's not even worth doing, in an attempt to be nice to those who * go only a small amount over their memory.high value and maybe haven't * been aggressively reclaimed enough yet. */ if (penalty_jiffies <= HZ / 100) goto out;
Both the predicate and action of an algorithmic policy are defined here. If the calculated penality_jiffies (delay for over allocated cgrouup) is less than 1/100th of a second, don't throttle the cgroup.
-
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
This line defines the maximum sleep time (delay) for a memory cgroup that has breached its memory.high limit. This is a configuration policy because it sets a fixed upper limit (2 seconds).
-
-
elixir.bootlin.com elixir.bootlin.com
-
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
This function decides whether there should be another round of reclamation. Stop reclaiming as long as space is sufficient for compaction or page allocation
-
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) { /* * If reclaim is making progress greater than 12% efficiency then * wake all the NOPROGRESS throttled tasks. */ if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; if (waitqueue_active(wqh)) wake_up(wqh); return; } /* * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages * under writeback and marked for immediate reclaim at the tail of the * LRU. */ if (current_is_kswapd() || cgroup_reclaim(sc)) return; /* Throttle if making no progress at high prioities. */ if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); }
Managing throttling: wakes up throttled tasks when reclaim is making progress at some efficiency. Throttle reclaim if no progress (in reclaim_throttle(), throttling under no progress could also be skipped under some conditions).
-
/* * We make inactive:active ratio decisions based on the node's * composition of memory, but a restrictive reclaim_idx or a * memory.low cgroup setting can exempt large amounts of * memory from reclaim. Neither of which are very common, so * instead of doing costly eligibility calculations of the * entire cgroup subtree up front, we assume the estimates are * good, and retry with forcible deactivation if that fails. */ if (sc->skipped_deactivate) { sc->priority = initial_priority; sc->force_deactivate = 1; sc->skipped_deactivate = 0; goto retry; } /* Untapped cgroup reserves? Don't OOM, retry. */ if (sc->memcg_low_skipped) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; }
Deactivation may be skipped if inactive ratio is not low or due to memory.Low limit. In this case, simply retry again with forcible deactivation/reclaim flags.
-
scan_balance = SCAN_FRACT; /* * Calculate the pressure balance between anon and file pages. * * The amount of pressure we put on each LRU is inversely * proportional to the cost of reclaiming each list, as * determined by the share of pages that are refaulting, times * the relative IO cost of bringing back a swapped out * anonymous page vs reloading a filesystem page (swappiness). * * Although we limit that influence to ensure no list gets * left behind completely: at least a third of the pressure is * applied, before swappiness. * * With swappiness at 100, anon and file have equal IO cost. */ total_cost = sc->anon_cost + sc->file_cost; anon_cost = total_cost + sc->anon_cost; file_cost = total_cost + sc->file_cost; total_cost = anon_cost + file_cost; ap = swappiness * (total_cost + 1); ap /= anon_cost + 1; fp = (200 - swappiness) * (total_cost + 1); fp /= file_cost + 1; fraction[0] = ap; fraction[1] = fp; denominator = ap + fp;
Calculates the proportion of anon and file pages to be scanned based on swappiness. The whole funtion determines number of folios to be scanned for each type.
-
if (young * MIN_NR_GENS > total) return true; if (old * (MIN_NR_GENS + 2) < total) return true;
Decides whether aging is needed. Aging is needed if the lruvec is short on cold folios. If returns true, the scanning of this lruvec is skipped.
-
if (!swappiness) type = LRU_GEN_FILE; else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) type = LRU_GEN_ANON; else if (swappiness == 1) type = LRU_GEN_FILE; else if (swappiness == 200) type = LRU_GEN_ANON; else type = get_type_to_scan(lruvec, swappiness, &tier);
Makes a decision on which type to scan based on swappiness, for the generational lruvecs
-
*/ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); if (current_is_kswapd()) set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); } /* * Stall direct reclaim for IO completions if the lruvec is * node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) goto again;
stall reclaim if lruvec is congested, which is determined by whether all dirty pages in the cgroup or node were marked for writeback
-
static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; unsigned long inactive_ratio; unsigned long gb; inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) inactive_ratio = int_sqrt(10 * gb); else inactive_ratio = 1; return inactive * inactive_ratio < active; }
Defines/calculates a target active-to-inactive ratio. There are various places where this function is called to decide whether the active list should be shrinked and whether inactive list could be expanded.
-
nr = xchg_nr_deferred(shrinker, shrinkctl); if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); } else { /* * These objects don't require any IO to create. Trim * them aggressively under memory pressure to keep * them from causing refetches in the IO caches. */ delta = freeable / 2; } total_scan = nr >> priority; total_scan += delta; total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one * pass to avoid too frequent shrinker calls, but if the slab has less * than batch_size objects in total and we are really tight on memory, * we will try to reclaim all available objects, otherwise we can end * up failing allocations although there are plenty of reclaimable * objects spread over several slabs with usage less than the * batch_size. * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || total_scan >= freeable) {
Determine the number of slab objects to be scanned (total_scan) based on 'freeable' and 'priority'. Scans are done in batch sizes unless memory is tight. Batch size is set by shrinker, or defaulted to SHRINK_BATCH at line 832
-
if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { nr_rotated += folio_nr_pages(folio); list_add(&folio->lru, &l_active); continue; }
The goal is to identify executable code regions and keep them in memory under moderate memory pressure. The heuristic is to check VM_EXEC flag and whether folio is file-backed, and add identified folio back to active list.
-
-
elixir.bootlin.com elixir.bootlin.com
-
#define ZSWAP_NR_ZPOOLS 32
More configuration policy, as per the comment, this is an empirical number.
-
static unsigned int zswap_max_pool_percent = 20;
maximum percentage of memory that the compresses pool can occupy
-
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
Since zswap will be part of decisions on the swap flow the this config is policy.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list) if (pos != chunk) { need_balance = true; break; } } else if (pcpu_should_reclaim_chunk(chunk)) { pcpu_isolate_chunk(chunk); need_balance = true; }
When freeing memory determine whether to balance or not.
-
static void pcpu_balance_workfn(struct work_struct *work)
Manage free chunks and populated pages. It does so by first reclaiming all fully free chunks regardless of the number of populated pages. Then rebalancing and finally cleaning.
-
if (freed_page_start < freed_page_end) {
Batching TLB flushes to amortize cost. The threashold is also a heuristic.
-
int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
Static allocation policy is used here for the first chunk per cpu.
-
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) { reintegrate = true; break; }
Heuristic to stop reclaiming if empty fall below threshold that would cause atomic alloc failures.
-
static void pcpu_balance_populated(void)
Maintain a certain amount of populated pages to satisfy atomic allocations.
-
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
Heuristic to determine when to rebalance when allocating pages. If the number of pages are below the PCPU_EMPTY_POP_PAGES_LOW threashold then rebalancer is called.
-
if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
As the comment said this is a heuristic to stop finding chunks when true.
-
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits)
Update metadate hints (for policy).
-
if (contig == block->contig_hint) { if (block->contig_hint_start && (!start || __ffs(start) > __ffs(block->contig_hint_start))) { /* start has a better alignment so use it */ block->contig_hint_start = start; if (start < block->scan_hint_start && block->contig_hint > block->scan_hint) block->scan_hint = 0; } else if (start > block->scan_hint_start || block->contig_hint > block->scan_hint) { /* * Knowing contig == contig_hint, update the scan_hint * if it is farther than or larger than the current * scan_hint. */ block->scan_hint_start = start; block->scan_hint = contig; }
Heuristic when contig hints are equal to choose the "best" starting offset.
-
-
elixir.bootlin.com elixir.bootlin.com
-
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. */ if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } #endif
This is a configuration policy. It will retry if zone has deferred pages.
-
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } #endif
This is a configuration policy. It will retry if zone has deferred pages.
-
batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; /* * Clamp the batch to a 2^n - 1 value. Having a power * of 2 value was found to be more likely to have * suboptimal cache aliasing properties in some cases. * * For example if 2 tasks are alternately allocating * batches of pages, one task can end up with a lot * of pages of one half of the possible page colors * and the other with pages of the other colors. */ batch = rounddown_pow_of_two(batch + batch/2) - 1;
Determine the number of pages for batch allocating based on a heuristic. Using a (2^n - 1) to minimize cache aliasing issues.
but I think it may also be categorized as a configuration policy because the code execution depends on CONFIG_MMU.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory");
We found the process that has the highest "badness" score and will be terminated after we called select_bad_process. This is the action side of the previously mentioned predicate (maximal "badness").
-
#define MAX_OOM_REAP_RETRIES 10 static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; struct mm_struct *mm = tsk->signal->oom_mm; /* Retry the mmap_read_trylock(mm) a few times */ while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || test_bit(MMF_OOM_SKIP, &mm->flags)) goto done;
The reaper will try 10 times (with a delay after each attempt) to reap the memory from the process. This value configuration is defined via a #define statement. This will fail if the process doesn't give up its mmap lock.
-
#define OOM_REAPER_DELAY (2*HZ) static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) return; get_task_struct(tsk); timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; add_timer(&tsk->oom_reaper_timer); }
The program here sends a signal to the selected process that will be killed to solve the out of memory issue. It will then wait for a configuration OOM_REAPER_DELAY before forcefully killing it by invoking the reaper thread on the process. It's currently set to 2*HZ, meaning that we wait for 2 seconds. This time period policy is defined as configuration via #define.
-
/* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; points += adj;
The kernel calls out of memory when the RAM passes full utilization limit. This code calculates a predicate for choosing the maximal "bad" process to reap to fix the out of memory issue. This code defines the predicate as a heuristic calculation; it calculates "badness" as the proportion of RAM that the sum of the task's RSS, pagetable, and swap space use.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (s->flags & __CMPXCHG_DOUBLE) { ret = __update_freelist_fast(slab, freelist_old, counters_old, freelist_new, counters_new); } else { ret = __update_freelist_slow(slab, freelist_old, counters_old, freelist_new, counters_new); }
This policy is very similar to annotated code below. The description is reproduced here:
This policy determines if the system has support for compare and exchange. If so, it will use the "__update_freelist_fast()" function, which uses a compare and exchange internally. Otherwise, it will use "__update_freelist_slow()", which uses a lock (specifically a bit-based spinlock) internally.
-
#ifdef CONFIG_SLAB_FREELIST_HARDENED encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr); #else encoded = (unsigned long)ptr; #endif
This policy uses the configuration setting "CONFIG_SLAB_FREELIST_HARDENED" to determine whether to obfuscate a SLUB free list pointer or not for increased security at the cost of some performance.
-
#ifdef CONFIG_SLAB_FREELIST_HARDENED decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr)); #else decoded = (void *)ptr.v; #endif
This policy is based on the configuration setting indicated by "CONFIG_SLAB_FREELIST_HARDENED", which hardens the slab free list. If the free list is hardened, then free list pointers will be obfuscated; this policy just undoes the obfuscation in that case. In the case where free list pointers are not obfuscated, this function just returns the unmodified pointer value.
-
#ifdef CONFIG_SLAB_FREELIST_RANDOM /* Pre-initialize the random sequence cache */ static int init_cache_random_seq(struct kmem_cache *s) { unsigned int count = oo_objects(s->oo); int err; /* Bailout if already initialised */ if (s->random_seq) return 0; err = cache_random_seq_create(s, count, GFP_KERNEL); if (err) { pr_err("SLUB: Unable to initialize free list for %s\n", s->name); return err; } /* Transform to an offset on the set of pages */ if (s->random_seq) { unsigned int i; for (i = 0; i < count; i++) s->random_seq[i] *= s->size; } return 0; } /* Initialize each random sequence freelist per cache */ static void __init init_freelist_randomization(void) { struct kmem_cache *s; mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) init_cache_random_seq(s); mutex_unlock(&slab_mutex); } /* Get the next entry on the pre-computed freelist randomized */ static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab, unsigned long *pos, void *start, unsigned long page_limit, unsigned long freelist_count) { unsigned int idx; /* * If the target page allocation failed, the number of objects on the * page might be smaller than the usual size defined by the cache. */ do { idx = s->random_seq[*pos]; *pos += 1; if (*pos >= freelist_count) *pos = 0; } while (unlikely(idx >= page_limit)); return (char *)start + idx; } /* Shuffle the single linked freelist based on a random pre-computed sequence */ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) { void *start; void *cur; void *next; unsigned long idx, pos, page_limit, freelist_count; if (slab->objects < 2 || !s->random_seq) return false; freelist_count = oo_objects(s->oo); pos = get_random_u32_below(freelist_count); page_limit = slab->objects * s->size; start = fixup_red_left(s, slab_address(slab)); /* First entry is used as the base of the freelist */ cur = next_freelist_entry(s, slab, &pos, start, page_limit, freelist_count); cur = setup_object(s, cur); slab->freelist = cur; for (idx = 1; idx < slab->objects; idx++) { next = next_freelist_entry(s, slab, &pos, start, page_limit, freelist_count); next = setup_object(s, next); set_freepointer(s, cur, next); cur = next; } set_freepointer(s, cur, NULL); return true; } #else static inline int init_cache_random_seq(struct kmem_cache *s) { return 0; } static inline void init_freelist_randomization(void) { } static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) { return false; } #endif /* CONFIG_SLAB_FREELIST_RANDOM */
This policy looks at the "CONFIG_SLAB_FREELIST_RANDOM" config setting. If it is set, the policy defines functions to randomize the free list order when creating new pages (for security purposes). If it is not set, the functions involved in randomizing the free list are empty, effectively turning off free list randomization.
-
if (s->flags & __CMPXCHG_DOUBLE) { ret = __update_freelist_fast(slab, freelist_old, counters_old, freelist_new, counters_new); } else { unsigned long flags; local_irq_save(flags); ret = __update_freelist_slow(slab, freelist_old, counters_old, freelist_new, counters_new); local_irq_restore(flags); }
This policy determines if the system has support for compare and exchange. If so, it will use the "__update_freelist_fast()" function, which uses a compare and exchange internally. Otherwise, it will use "__update_freelist_slow()", which uses a lock (specifically a bit-based spinlock) internally.
-
-
elixir.bootlin.com elixir.bootlin.com
-
#ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); static void sum_vm_events(unsigned long *ret) { int cpu; int i; memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); for_each_online_cpu(cpu) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) ret[i] += this->event[i]; } } /* * Accumulate the vm event counters across all CPUs. * The result is unavoidably approximate - it can change * during and after execution of this function. */ void all_vm_events(unsigned long *ret) { cpus_read_lock(); sum_vm_events(ret); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(all_vm_events); /* * Fold the foreign cpu events into our own. * * This is adding to the events on one processor * but keeps the global counts constant. */ void vm_events_fold_cpu(int cpu) { struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); int i; for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { count_vm_events(i, fold_state->event[i]); fold_state->event[i] = 0; } } #endif /* CONFIG_VM_EVENT_COUNTERS */
When CONFIG_VM_EVENT_COUNTERS is enabled, the kernel compiles the code that collects and manages virtual memory (vm) event counters. These counters track events like page faults, page allocations, and swap operations. The functions
sum_vm_events
,all_vm_events
, andvm_events_fold_cpu
are responsble for accumulating these statistics across all CPUs.If CONFIG_VM_EVENT_COUNTERS is not enabled, this code is excluded from the build. This means the kernel won't collect these detailed VM statistics, reducing memory usage and avoiding the overhead associated with tracking them.
-
#ifdef CONFIG_NUMA int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; /* zero numa counters within a zone */ static void zero_zone_numa_counters(struct zone *zone) { int item, cpu; for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) { atomic_long_set(&zone->vm_numa_event[item], 0); for_each_online_cpu(cpu) { per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item] = 0; } } } /* zero numa counters of all the populated zones */ static void zero_zones_numa_counters(void) { struct zone *zone; for_each_populated_zone(zone) zero_zone_numa_counters(zone); } /* zero global numa counters */ static void zero_global_numa_counters(void) { int item; for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) atomic_long_set(&vm_numa_event[item], 0); } static void invalid_numa_statistics(void) { zero_zones_numa_counters(); zero_global_numa_counters(); } static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; mutex_lock(&vm_numa_stat_lock); if (write) oldval = sysctl_vm_numa_stat; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (ret || !write) goto out; if (oldval == sysctl_vm_numa_stat) goto out; else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) { static_branch_enable(&vm_numa_stat_key); pr_info("enable numa statistics\n"); } else { static_branch_disable(&vm_numa_stat_key); invalid_numa_statistics(); pr_info("disable numa statistics, and clear numa counters\n"); } out: mutex_unlock(&vm_numa_stat_lock); return ret; } #endif
This conditionally includes the NUMA-specific code based on whether the
CONFIG_NUMA
option is enabled during kernel compilation. Within this conditional block, thesysctl_vm_numa_stat_handler
function provides a runtime configuration mechanism through thesysctl_vm_numa_stat
variable. This function allows system administrators to enable or disable the collection of NUMA statistics at runtime. When the value ofsysctl_vm_numa_stat
changes, the function changes thevm_numa_stat_key
to start or stop the statistics collection and clears the NUMA counters if disabled.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (start != vma->vm_start) { error = split_vma(&vmi, vma, start, 1); if (error) return error; } if (end != vma->vm_end) { error = split_vma(&vmi, vma, end, 0); if (error) return error; }
This is an algorithmic policy. The VMA is split at the start address if the specified start is different from the current beginning of the VMA, or if the specified end address is different from the current end of the VMA, the VMA is split.
-
if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); }
This is algorithmic policy. If this particular madvise behavior is MADV_SOFT_OFFLINE, the code decides to performs soft offlining of a page by marking it as unavailable for future use without reclaiming it.
-
if (folio_test_large(folio)) { int err; if (folio_estimated_sharers(folio) != 1) break; if (!folio_trylock(folio)) break; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); if (err) break; start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; arch_enter_lazy_mmu_mode(); pte--; addr -= PAGE_SIZE; continue; }
This is algorithmic policy. This code handles the splitting of large folio pages. It checks if the folio can be safely split, attempts to split it into smaller folios, and then appropriately updates the relevant PTEs and MMU state.
-
if (!pte_present(ptent)) { swp_entry_t entry; entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { nr_swap--; free_swap_and_cache(entry); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; }
This is an algorithmic policy. It checks if the entry is a valid swap entry and manages its resources accordingly. If the entry is poisoned, it clears the PTE but does not free associated resources, and if it is not a swap entry then all resources are freed.
-
static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; }
This is a configuration policy. Cold page ranges can not be advised to the kernel if the VM_LOCKED or VM_PFNMAP or VM_HUGETLB flags are set.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); }
This is a configuration policy that does a more accurate calculation on the number of reclaimable pages and dirty pages when the threshold for the dirty pages in the writeback context is lower than 2 times the maximal error of a stat counter.
-
static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause
This function is an algorithmic policy that determines the minimum throttle time for a process between consecutive writeback operations for dirty pages based on heuristics. It is used for balancing the load of the I/O subsystems so that there will not be excessive I/O operations that impact the performance of the system.
-
if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb);
This is a configuration policy that determines whether to start background writeout. The code here indicates that if laptop_mode, which will reduce disk activity for power saving, is not set, then when the number of dirty pages reaches the bg_thresh threshold, the system starts writing back pages.
-
if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1);
This implements a configuration policy that determines the interval for the kernel to wake up and check for dirty pages that need to be written back to disk.
-
limit -= (limit - thresh) >> 5;
This is a configuration policy that determines how much should the limit be updated. The limit controls the amount of dirty memory allowed in the system.
-
if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { unsigned long intv; unsigned long m_intv; free_running: intv = dirty_poll_interval(dirty, thresh); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) m_intv = dirty_poll_interval(m_dirty, m_thresh); current->nr_dirtied_pause = min(intv, m_intv); break; } /* Start writeback even when in laptop mode */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); mem_cgroup_flush_foreign(wb); /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ if (!strictlimit) { wb_dirty_limits(gdtc); if ((current->flags & PF_LOCAL_THROTTLE) && gdtc->wb_dirty < dirty_freerun_ceiling(gdtc->wb_thresh, gdtc->wb_bg_thresh)) /* * LOCAL_THROTTLE tasks must not be throttled * when below the per-wb freerun ceiling. */ goto free_running; } dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); wb_position_ratio(gdtc); sdtc = gdtc; if (mdtc) { /* * If memcg domain is in effect, calculate its * pos_ratio. @wb should satisfy constraints from * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ if (!strictlimit) { wb_dirty_limits(mdtc); if ((current->flags & PF_LOCAL_THROTTLE) && mdtc->wb_dirty < dirty_freerun_ceiling(mdtc->wb_thresh, mdtc->wb_bg_thresh)) /* * LOCAL_THROTTLE tasks must not be * throttled when below the per-wb * freerun ceiling. */ goto free_running; } dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); wb_position_ratio(mdtc); if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; }
This is an algorithmic policy that determines whether the process can run freely or a throttle is needed to control the rate of the writeback by checking if the number of dirty pages exceed the average of the global threshold and background threshold.
-
shift = dirty_ratelimit / (2 * step + 1); if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8); else step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step;
This is a configuration policy that determines how much we should increase/decrease the dirty_ratelimit, which controls the rate that processors write dirty pages back to storage.
-
ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16;
This is a configuration policy that dynamically determines the rate that kernel can write dirty pages back to storage in a single writeback cycle.
-
t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
This implements a configuration policy that determines the maximum time that the kernel should wait between writeback operations for dirty pages. This ensures that dirty pages are flushed to disk within a reasonable time frame and control the risk of data loss in case of a system crash.
-
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags)
line 2064 calling balance_dirty_pages() depending on ratelimit.
-
if (elapsed > WB_BANDWIDTH_IDLE_JIF && !atomic_read(&wb->writeback_inodes)) {
Idle interval config
-
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long dirtied, unsigned long elapsed)
The dynamic calculation of dirty_ratelimit, the use of step size adjustments, filtering of singular points to avoid spikes, and special handling for strict limits.
-
static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written)
Does bandwidth calculation (line 1221-1229) and smoothing (line 1234-1240)
-
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
Heuristic based algorithms for calculating thresh and bg_thresh. Line 419-422 adds additional threshold if the task is determined to be real time task.
-
void writeback_set_ratelimit(void)
Dynamically calculates the ratelimit_pages based on heuristics. Thresholds are set according to configuration policies.
-
bool wb_over_bg_thresh(struct bdi_writeback *wb)
Although thresholds are set using configuation policies, this function uses series of comparison to determine if the wb should start.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (lru_gen_enabled() && pte_young(ptep_get(pvmw.pte))) { lru_gen_look_around(&pvmw); referenced++; }
These lines show configuration policy. The lru_gen_enabled() function checks if the multi-gen LRU is enabled and this is set with the CONFIG_LRU_GEN_ENABLED kernel configuration. This is a determining factor into how the folio referenced bit is updated, so it is a configuration policy
-
pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
This is an algorithmic policy. Before it was called, a pte was cleared. A call to this function puts PTE_MARKER_UFFD_WP on a page, if needed. Here the predicate would be that the pte was cleared, and the action is the updating of the marker. (The function should not be called if the pte was not cleared).There are more policies within the function to determine if the marker is needed.
-
if (pending != flushed) { arch_flush_tlb_batched_pending(mm);
This is an algorithmic policy. This code decides whether to perform a TLB flush based on whether there are pending flushes that haven't been completed.
-
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) referenced++;
This is configuration policy. The CONFIG_TRANSPARENT_HUGEPAGE kernel configuration determines how and if the page reference bit is incremented.
-
if (!dst->anon_vma && src->anon_vma && anon_vma->num_children < 2 && anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma;
This is an algorithmic policy. This code checks if dst currently does not have an anonymous VMA and if src does. It checks that the anon VMA being considered has fewer than two children and no active VMAs. If all conditions are met, it reuses the existing anonymous VMA rather than allocating a new one.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (should_proactive_compact_node(pgdat)) { unsigned int prev_score, score; prev_score = fragmentation_score_node(pgdat); proactive_compact_node(pgdat); score = fragmentation_score_node(pgdat); /* * Defer proactive compaction if the fragmentation * score did not go down i.e. no progress made. */ if (unlikely(score >= prev_score)) timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT; }
This part of the code checks whether proactive compaction should be triggered using the
should_proactive_compact_node(pgdat)
predicate. If proactive compaction is necessary, it stores the node's fragmentation score (prev_score
), performs proactive compaction (proactive_compact_node(pgdat)
), and then checks the new fragmentation score (score). If the score does not decrease after compaction (indicating no progress), the system defers further proactive compaction by increasing the timeout (timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT
). This helps avoid repeated compaction attempts without progress. -
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) static ssize_t compact_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int nid = dev->id; if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { /* Flush pending updates to the LRU lists */ lru_add_drain_all(); compact_node(nid); } return count; } static DEVICE_ATTR_WO(compact); int compaction_register_node(struct node *node) { return device_create_file(&node->dev, &dev_attr_compact); } void compaction_unregister_node(struct node *node) { device_remove_file(&node->dev, &dev_attr_compact); } #endif /* CONFIG_SYSFS && CONFIG_NUMA */
When these options are enabled, the functions compact_store, compaction_register_node, and compaction_unregister_node are included in the build. The compact_store function allows users to trigger memory compaction on specific NUMA nodes by writing to the sysfs interface. Inside this function, it checks if the node ID is valid and online, drains pending updates to the LRU lists with lru_add_drain_all(), and then calls compact_node(nid) to perform compaction on that node.
Including these functions provides fine-grained control over memory management in NUMA system. If even one of the flag is not eneabled, these functions are excluded from the build, and the per-node compaction interface is unavailable.
-
if (prio > MIN_COMPACT_PRIORITY && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; } status = compact_zone_order(zone, order, gfp_mask, prio, alloc_flags, ac->highest_zoneidx, capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ if (status == COMPACT_SUCCESS) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller * will repeat this with true if allocation indeed * succeeds in this zone. */ compaction_defer_reset(zone, order, false); break; } if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE || status == COMPACT_PARTIAL_SKIPPED)) /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up * succeeding after all, it will be reset. */ defer_compaction(zone, order); /* * We might have stopped compacting due to need_resched() in * async compaction, or due to a fatal signal detected. In that * case do not try further zones */ if ((prio == COMPACT_PRIO_ASYNC && need_resched()) || fatal_signal_pending(current)) break;
The kernel attempts to compact memory across different zones, guided by several key predicates. First, if compaction for a zone has been deferred and the priority is above
MIN_COMPACT_PRIORITY
, the function skips the zone, marking the result asCOMPACT_DEFERRED
. For each zone, it calls compact_zone_order and updates the overall status. If compaction succeeds (COMPACT_SUCCESS
), the kernel resets the deferred state for the zone and stops further compaction attempts, expecting allocation to succeed. If compaction is either complete or partially skipped in non-async mode, the zone is marked for deferred compaction, preventing further unnecessary attempts. If compaction is interrupted due to rescheduling or a fatal signal, the function exits early to avoid system overload. -
if (cc->nr_freepages > 0) { unsigned long free_pfn = release_freepages(&cc->freepages); cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); /* The cached pfn is always the first in a pageblock */ free_pfn = pageblock_start_pfn(free_pfn); /* * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() */ if (free_pfn > cc->zone->compact_cached_free_pfn) cc->zone->compact_cached_free_pfn = free_pfn; }
This part ensures efficient release of free pages back into the system. If there are free pages to be released (
cc->nr_freepages > 0
), the functionrelease_freepages
is called, and the first page frame number (PFN) in the pageblock is recorded. The cached PFN is then updated to point to the start of the newly freed pageblock. A critical check (VM_BUG_ON
) ensures that the process does not operate on invalid or uninitialized page ranges.This ensures that the cached free PFN is always updated in a conservative manner i.e. only moving backward, not forward. This avoids skipping any potential free pages. This helps optimize subsequent compaction attempts by ensuring that future operations start from the correct position.
-
if (err) { putback_movable_pages(&cc->migratepages); /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it */ if (err == -ENOMEM && !compact_scanners_met(cc)) { ret = COMPACT_CONTENDED; goto out; } /* * If an ASYNC or SYNC_LIGHT fails to migrate a page * within the pageblock_order-aligned block and * fast_find_migrateblock may be used then scan the * remainder of the pageblock. This will mark the * pageblock "skip" to avoid rescanning in the near * future. This will isolate more pages than necessary * for the request but avoid loops due to * fast_find_migrateblock revisiting blocks that were * recently partially scanned. */ if (!pageblock_aligned(cc->migrate_pfn) && !cc->ignore_skip_hint && !cc->finish_pageblock && (cc->mode < MIGRATE_SYNC)) { cc->finish_pageblock = true; /* * Draining pcplists does not help THP if * any page failed to migrate. Even after * drain, the pageblock will not be free. */ if (cc->order == COMPACTION_HPAGE_ORDER) last_migrated_pfn = 0; goto rescan; } }
This block of code handles errors during page migration by first resetting the state to prevent inconsistencies. If a migration error occurs, the kernel puts back the movable pages. If the error is due to memory shortage (
-ENOMEM
), the process checks if the migration and free page scanners have met. If not, compaction is terminated early withCOMPACT_CONTENDED
. ForASYNC
orSYNC_LIGHT
modes, if the migration fails within an unaligned pageblock, the kernel marks the block for skipping to avoid unnecessary rescanning. Thus, improving efficiency by isolating more pages than required and preventing revisits to partially scanned blocks. For transparent huge pages (THP), it ensures the failed pageblocks are not reused. -
#ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && !free_area_empty(area, MIGRATE_CMA)) return COMPACT_SUCCESS; #endif
This block allows the kernel's memory compaction algorithm to consider
MIGRATE_CMA
pages as fallback options forMIGRATE_MOVABLE
allocations when the Contiguous Memory Allocator (CMA) is enabled (CONFIG_CMA
is defined).When
CONFIG_CMA
is enabled, the compaction process becomes more flexible by allowing the use of free CMA pages if standard movable pages are unavailable, potentially improving allocation success rates.If
CONFIG_CMA
is disabled, this fallback mechanism is omitted, ensuring that CMA regions remain dedicated to their primary purpose of providing contiguous memory blocks for devices that require them.https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L895
-
/* * compaction_suitable: Is this suitable to run compaction on this zone now? */ bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) { enum compact_result compact_result; bool suitable; suitable = __compaction_suitable(zone, order, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation * * index of -1000 would imply allocations might succeed depending on * watermarks, but we already failed the high-order watermark check * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * * Only compact if a failure would be due to fragmentation. Also * ignore fragindex for non-costly orders where the alternative to * a successful reclaim/compaction is OOM. Fragindex and the * vm.extfrag_threshold sysctl is meant as a heuristic to prevent * excessive compaction for costly orders, but it should not be at the * expense of system stability. */ if (suitable) { compact_result = COMPACT_CONTINUE; if (order > PAGE_ALLOC_COSTLY_ORDER) { int fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) { suitable = false; compact_result = COMPACT_NOT_SUITABLE_ZONE; } } } else { compact_result = COMPACT_SKIPPED; } trace_mm_compaction_suitable(zone, order, compact_result); return suitable; }
This function decides whether to initiate memory compaction on a particular zone based on system conditions like the fragmentation index and the external fragmentation threshold (
sysctl_extfrag_threshold
). It determines whether compaction would be effective or should be skipped. For high-order allocations, if the fragmentation index is below the threshold, the function opts to avoid unnecessary compaction, conserving system resources and preventing potential performance degradation.This enhances linux's memory management efficiency by preventing futile compaction attempts; such as in low-memory situations. Initiating compaction only when fragmentation is the primary issue, helps the system optimize CPU utilization. This should improve memory allocation success rates.
-
static unsigned long fast_find_migrateblock(struct compact_control *cc) { unsigned int limit = freelist_scan_limit(cc); unsigned int nr_scanned = 0; unsigned long distance; unsigned long pfn = cc->migrate_pfn; unsigned long high_pfn; int order; bool found_block = false; /* Skip hints are relied on to avoid repeats on the fast search */ if (cc->ignore_skip_hint) return pfn; /* * If the pageblock should be finished then do not select a different * pageblock. */ if (cc->finish_pageblock) return pfn; /* * If the migrate_pfn is not at the start of a zone or the start * of a pageblock then assume this is a continuation of a previous * scan restarted due to COMPACT_CLUSTER_MAX. */ if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) return pfn; /* * For smaller orders, just linearly scan as the number of pages * to migrate should be relatively small and does not necessarily * justify freeing up a large block for a small allocation. */ if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) return pfn; /* * Only allow kcompactd and direct requests for movable pages to * quickly clear out a MOVABLE pageblock for allocation. This * reduces the risk that a large movable pageblock is freed for * an unmovable/reclaimable small allocation. */ if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) return pfn; /* * When starting the migration scanner, pick any pageblock within the * first half of the search space. Otherwise try and pick a pageblock * within the first eighth to reduce the chances that a migration * target later becomes a source. */ distance = (cc->free_pfn - cc->migrate_pfn) >> 1; if (cc->migrate_pfn != cc->zone->zone_start_pfn) distance >>= 2; high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); for (order = cc->order - 1; order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit; order--) { struct free_area *area = &cc->zone->free_area[order]; struct list_head *freelist; unsigned long flags; struct page *freepage; if (!area->nr_free) continue; spin_lock_irqsave(&cc->zone->lock, flags); freelist = &area->free_list[MIGRATE_MOVABLE]; list_for_each_entry(freepage, freelist, buddy_list) { unsigned long free_pfn; if (nr_scanned++ >= limit) { move_freelist_tail(freelist, freepage); break; } free_pfn = page_to_pfn(freepage); if (free_pfn < high_pfn) { /* * Avoid if skipped recently. Ideally it would * move to the tail but even safe iteration of * the list assumes an entry is deleted, not * reordered. */ if (get_pageblock_skip(freepage)) continue; /* Reorder to so a future search skips recent pages */ move_freelist_tail(freelist, freepage); update_fast_start_pfn(cc, free_pfn); pfn = pageblock_start_pfn(free_pfn); if (pfn < cc->zone->zone_start_pfn) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; break; } } spin_unlock_irqrestore(&cc->zone->lock, flags); } cc->total_migrate_scanned += nr_scanned; /* * If fast scanning failed then use a cached entry for a page block * that had free pages as the basis for starting a linear scan. */ if (!found_block) { cc->fast_search_fail++; pfn = reinit_migrate_pfn(cc); } return pfn; }
This function helps determine when and how to perform a fast search for a suitable migration block during memory compaction. It evaluates predicates such as skipping the fast search if skip hints are ignored
(if (cc->ignore_skip_hint) return pfn;)
, if the current scan should be finished (if (cc->finish_pageblock) return pfn;
), if the migration pointer isn't at the start of a pageblock (if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) return pfn;
), or if the allocation order is smlal (if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) return pfn;
). It also restricts fast searches to movable pages during direct compaction for higher-order allocations (if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) return pfn;
). Based on these conditions, the function performs actions like initiating a fast search under specific circumstances to quickly locate suitable blocks, limiting the number of pages scanned to conserve CPU resources (if (nr_scanned++ >= limit) { move_freelist_tail(freelist, freepage); break;
}), and adjusting scanning strategies based on previous successes or failures to enhance adaptability (if (!found_block) { cc->fast_search_fail++; pfn = reinit_migrate_pfn(cc);
}). -
#ifdef CONFIG_SPARSEMEM
#ifdef CONFIG_SPARSEMEM
conditionally includes or excludes the implementation of theskip_offline_sections
andskip_offline_sections_reverse
functions based on whether theCONFIG_SPARSEMEM
option is enabled in the kernel configuration.CONFIG_SPARSEMEM
enables the the kernel to support sparse memory systems where memory sections can be dynamically online or offline. They provide logic to skip over offline memory sections during operations like memory compaction, ensuring that the system does not attempt to access or manipulate memory that is not currently available.If
CONFIG_SPARSEMEM
is not defined, these functions return 0, effectively disabling the handling of offline memory sections.https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L461
-
#ifdef CONFIG_COMPACTION
This option acts as a policy control mechanism that determines whether the memory compaction feature is included in the kernel build. By including the compaction-related code within
#ifdef CONFIG_COMPACTION
and#endif
[Line 522], the code conditionally compiles these sections based on the configuration setting. This affects the kernel's behavior regarding memory management and fragmentation handling. CONFIG_COMPACTION is defined in https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L637 and default is set toYes
orTrue
-
static bool kcompactd_node_suitable(pg_data_t *pgdat) { int zoneid; struct zone *zone; enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; /* Allocation can already succeed, check other zones */ if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, min_wmark_pages(zone), highest_zoneidx, 0)) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, highest_zoneidx)) return true; } return false; }
This function determines whether a memory node is suitable for compaction by the kcompactd daemon. It iterates through memory zones up to a configured highest index, checking if compaction is necessary and beneficial. The policy skips unpopulated zones and those where allocation can already succeed without compaction.
-
static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc, nid; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; if (write && sysctl_compaction_proactiveness) { for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); if (pgdat->proactive_compact_trigger) continue; pgdat->proactive_compact_trigger = true; trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1, pgdat->nr_zones - 1); wake_up_interruptible(&pgdat->kcompactd_wait); } } return 0; }
This function implements configuration and policy logic for proactive memory compaction. It supports the sysctl handler, and provides runtime configuration of compaction proactiveness. When the proactiveness setting is enabled, the function applies a policy to trigger proactive compaction across all online memory nodes. It does this by setting a trigger flag and waking up the kcompactd daemon for each node that hasn't already been triggered.
-
if (suitable) {
This policy is based off of the allocation order, zone characteristics, and memory fragmentation level to decide whether compaction should happen. For high-order allocations, it uses a fragmentation index and a configurable threshold to decide if compaction would be beneficial. The policy aims to balance the need for contiguous memory with system performance, avoiding unnecessary compaction that could impact system stability.
-
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx, ALLOC_CMA, wmark_target);
The __compaction_suitable function determines if memory compaction is viable for a given zone based on its current watermark levels and the requested allocation order. It calculates a watermark threshold, which varies depending on the allocation size, and includes a compaction gap for efficiency. The function then checks if the zone meets this watermark using __zone_watermark_ok, considering factors like CMA pages and zone indices.
-
static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; const int migratetype = cc->migratetype; int ret; /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { /* Let the next compaction start anew. */ reset_cached_positions(cc->zone); /* * Mark that the PG_migrate_skip information should be cleared * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. */ if (cc->direct_compaction) cc->zone->compact_blockskip_flush = true; if (cc->whole_zone) return COMPACT_COMPLETE; else return COMPACT_PARTIAL_SKIPPED; } if (cc->proactive_compaction) { int score, wmark_low; pg_data_t *pgdat; pgdat = cc->zone->zone_pgdat; if (kswapd_is_running(pgdat)) return COMPACT_PARTIAL_SKIPPED; score = fragmentation_score_zone(cc->zone); wmark_low = fragmentation_score_wmark(true); if (score > wmark_low) ret = COMPACT_CONTINUE; else ret = COMPACT_SUCCESS; goto out; } if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; /* * Always finish scanning a pageblock to reduce the possibility of * fallbacks in the future. This is particularly important when * migration source is unmovable/reclaimable but it's not worth * special casing. */ if (!pageblock_aligned(cc->migrate_pfn)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; bool can_steal; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) return COMPACT_SUCCESS;
This function has policy logic for when memory compaction should complete, considering whether migrate and free scanners have met, proactive compactions status, fragmentation scores, and availability of free pages. The function returns different COMPACT_* status codes based on these criteria, effectively deciding whether to continue compaction, consider it successful, or terminate the process. This policy balances thorough memory reorganization with efficient use of system resources during compaction.
-
#ifdef CONFIG_COMPACTION static bool suitable_migration_source(struct compact_control *cc, struct page *page) { int block_mt; if (pageblock_skip_persistent(page)) return false; if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; block_mt = get_pageblock_migratetype(page); if (cc->migratetype == MIGRATE_MOVABLE) return is_migrate_movable(block_mt); else return block_mt == cc->migratetype; }
This code snippet from the Linux kernel's memory compaction system implements policy and configuration logic for determining suitable migration sources during compaction. It's conditionally compiled based on the CONFIG_COMPACTION option, demonstrating configuration-dependent behavior. The suitable_migration_source function encapsulates policy decisions by considering factors such as persistent skip flags, compaction mode, and migration types. It applies different criteria for async direct compaction versus other modes, and implements specific rules for matching migration types, with special handling for movable pages.
-
/* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct compact_control *cc) { pg_data_t *pgdat = cc->zone->zone_pgdat; bool too_many; unsigned long active, inactive, isolated; inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_ANON); active = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_ACTIVE_ANON); isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + node_page_state(pgdat, NR_ISOLATED_ANON); /* * Allow GFP_NOFS to isolate past the limit set for regular * compaction runs. This prevents an ABBA deadlock when other * compactors have already isolated to the limit, but are * blocked on filesystem locks held by the GFP_NOFS thread. */ if (cc->gfp_mask & __GFP_FS) { inactive >>= 3; active >>= 3; } too_many = isolated > (inactive + active) / 2; if (!too_many) wake_throttle_isolated(pgdat); return too_many; }
This policy determines when too many pages have been isolated during kernel compaction and calculates a threshold based on the number of active and inactive pages relative to isolated pages. It handles GFP_NOFS allocations specifically differently so as to prevent deadlocks.
-
/* * Compound pages of >= pageblock_order should consistently be skipped until * released. It is always pointless to compact pages of such order (if they are * migratable), and the pageblocks they occupy cannot contain any free pages. */ static bool pageblock_skip_persistent(struct page *page) { if (!PageCompound(page)) return false; page = compound_head(page); if (compound_order(page) >= pageblock_order) return true; return false; }
This policy optimizes memory compaction by deferring compaction on large compound pages. It determines if they are of a size greater than or equal to pageblock_order and skips them if so. For non-compound pages, it returns false, meaning it shouldn't be deferred.
-
/* Returns true if compaction should be skipped this time */ static bool compaction_deferred(struct zone *zone, int order) { unsigned long defer_limit = 1UL << zone->compact_defer_shift; if (order < zone->compact_order_failed) return false; /* Avoid possible overflow */ if (++zone->compact_considered >= defer_limit) { zone->compact_considered = defer_limit; return false; } trace_mm_compaction_deferred(zone, order); return true; }
This policy is for deferring memory compaction in Linux. It defers compaction if the requested order is > than the previously failed order and there hasn't been more than some limit of comparisons recently, so that compaction attempts will stop if repeatedly unsuccessful.
-
if (!sysctl_compaction_proactiveness) timeout = MAX_SCHEDULE_TIMEOUT;
If proactive compaction is disabled (sysctl_compaction_proactiveness is 0), the daemon sets its timeout to MAX_SCHEDULE_TIMEOUT, effectively sleeping until explicitly woken up. If proactive compaction is enabled, it uses the default timeout to periodically wake up and check for work.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (khugepaged_has_work()) { const unsigned long scan_sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs); if (!scan_sleep_jiffies) return; khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, khugepaged_should_wakeup(), scan_sleep_jiffies); return; } if (hugepage_flags_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
khugepaged thread sleeps when khugepaged does not have work (khugepaged_scan list is empty). Sleeps for khugepaged_scan_sleep_millisecs if has work
-
if (hugepage_flags_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL,
hugeoage_flags_enabled() flag will turn on hupe page for all mappings. transparent_hugepage_flags is disabled by default. If enabled, khugepaged thread will start running.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (si->cluster_info) { if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; } else if (unlikely(!si->cluster_nr--)) {
Algorithmic policy decision: If SSD, use SSD wear-leveling friendly algorithm. Otherwise, use HDD algo which minimizes seek times. Potentially, certain access patterns may make one of these algorithms less effective (i.e. in the case of wear leveling, the swap is constantly full)
-
scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ for (; last_in_cluster <= si->highest_bit; offset++) { if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } }
Here, (when using HDDs), a policy is implemented that places the swapped page in the first available slot. This is supposed to reduce seek time in spinning drives, as it encourages having swap entries near each other.
-
/* * Even if there's no free clusters available (fragmented), * try to scan a little more quickly with lock held unless we * have scanned too many slots already. */ if (!scanned_many) { unsigned long scan_limit; if (offset < scan_base) scan_limit = scan_base; else scan_limit = si->highest_bit; for (; offset <= scan_limit && --latency_ration > 0; offset++) { if (!si->swap_map[offset]) goto checks; } }
Here we have a configuration policy where we do another smaller scan as long as we haven't exhausted our latency_ration. Another alternative could be yielding early in anticipation that we aren't going to find a free slot.
-
/* * select a random position to start with to help wear leveling * SSD */ for_each_possible_cpu(cpu) { per_cpu(*p->cluster_next_cpu, cpu) = get_random_u32_inclusive(1, p->highest_bit); }
An algorithmic (random) policy is used here to spread swap pages around an SSD to help with wear leveling instead of writing to the same area of an SSD often.
-
cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
A policy decision is made to add a cluster to the discard list on a first-come, first-served basis. However, this approach could be enhanced by prioritizing certain clusters higher on the list based on their 'importance.' This 'importance' could be defined by how closely a cluster is related to other pages in the swap. By doing so, the system can reduce seek time as mentioned on line 817.
-
if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } if (swap_offset_available_and_locked(si, offset)) goto checks; } offset = si->lowest_bit; while (offset < scan_base) { if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; }
Here, a policy decision is made to fully replenish the latency_ration with the LATENCY_LIMIT and then yield back to the scheduler if we've exhausted it. This makes it so that when scheduled again, we have the full LATENCY_LIMIT to do a scan. Alternative policies could grow/shrink this to find a better heuristic instead of fully replenishing each time.
Marked config/value as awe're replacing latency_ration with a compiletime-defined limit.
-
while (scan_swap_map_ssd_cluster_conflict(si, offset)) { /* take a break if we already got some slots */ if (n_ret) goto done; if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan;
Here, a policy decision is made to stop scanning if some slots were already found. Other policy decisions could be made to keep scanning or take into account how long the scan took or how many pages were found.
-
else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in * discarding, do discard now and reclaim them, then * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); *scan_base = this_cpu_read(*si->cluster_next_cpu); *offset = *scan_base; goto new_cluster;
This algorithmic policy discards + reclaims pages as-needed whenever there is no free cluster. Other policies could be explored that do this preemptively in order to avoid the cost of doing it here.
-
#ifdef CONFIG_THP_SWAP
This is a build-time flag that configures how hugepages are handled when swapped. When defined, it swaps them in one piece, while without it splits them into smaller units and swaps those units.
-
if (swap_flags & SWAP_FLAG_DISCARD_ONCE) p->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) p->flags &= ~SWP_AREA_DISCARD
This is a configuration policy decision where a sysadmin can pass flags to sys_swapon() to control the behavior discards are handled. If DISCARD_ONCE is set, a flag which "discard[s] swap area at swapon-time" is unset, and if DISCARD_PAGES is set, a flag which "discard[s] page-clusters after use" is unset.
-
/* * Should not even be attempting cluster allocations when huge * page swap is disabled. Warn and fail the allocation. */ if (!IS_ENABLED(CONFIG_THP_SWAP)) { VM_WARN_ON_ONCE(1); return 0; }
The policy dictates that if huge page swapping is disabled at compile time (via the CONFIG_THP_SWAP configuration option), the kernel should not attempt to allocate swap clusters for huge pages and should fail the operation with a warning.
-
#ifdef CONFIG_HIBERNATION
If CONFIG_HIBERNATION is defined, the kernel includes code to write the entire system memory state to the swapfile before powering down the system. This involves allocating swap slots for the entire memory state and ensuring that the data is properly stored.
-
if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), TTRS_UNMAPPED | TTRS_FULL);
This policy optimizes memory usage by attempting to free both the swap entry and the page cache when they are no longer needed. The system avoids keeping cached pages in memory unnecessarily, thus freeing up resources for other processes.
-
#define SWAPFILE_CLUSTER 256
This configuration defines the size of the swap cluster, which is the number of swap pages that the system tries to allocate as a unit. Is set to 256.
-
/* * Use percpu scan base for SSD to reduce lock contention on * cluster and swap cache. For HDD, sequential access is more * important. */ if (si->flags & SWP_SOLIDSTATE) scan_base = this_cpu_read(*si->cluster_next_cpu); else scan_base = si->cluster_next; offset = scan_base;
Algorithmic Policy: Check if the device is HDD or SSD. If SSD, allow different cpus to scan different parts of the swap map. If HDD, check for slots sequentially.
-
/* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) goto checks; goto scan; /* check next one */ }
This is an algorithmic policy. If you're running out of swap spaces, try to reclaim space in the cache-only swap.
-
#define LATENCY_LIMIT 256
LATENCY_LIMIT is an upper bound on how many slots can be checked by scan_swap_map_slots before it yields the cpu back. It's set to 64. This policy prevents the function from monopolizing the cpu.
-
node = numa_node_id();
This algorithmic policy is choosing which node to look at for swap devices.
In particular, it only tries swapping pages with devices that are at the same Numa Node as the core, and not any other.
-
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
This is an algorithmic policy that decides which of the swap devices should be used to get swap pages from, in that numa node. What's interesting here is that once a device is seen, it is requeued, so that the burden is spread evenly over all the devices with same priority, on that that node.
-
n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
SWAP_BATCH: This is a configuration policy (Macro) that caps the maximum number of entries that can be swapped in a single operation. It's set to 64. This is done to limit amount of swapping that happens.
-
-
elixir.bootlin.com elixir.bootlin.com
-
static const struct file_operations proc_page_owner_operations = { .read = read_page_owner, .llseek = lseek_page_owner, };
Configuration policy that defines an interface for user space to access kernel page ownership data. Configuring how users can read and seek through the page information.
-
/* * Some pages could be missed by concurrent allocation or free, * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) goto ext_put_continue; page_owner = get_page_owner(page_ext); /* * Don't print "tail" pages of high-order allocations as that * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) goto ext_put_continue;
Policy that ensures pages have ownership info, are valid and allocated.
In the first if statement it skip pages without owners, the second one checks that the pages are not currently allocated. And finally it skip tail pages.
-
/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++;
This policy ensures efficient traversal of large memory areas speeding up memory management. It avoid redundant checks, It ensures only valid memory pages are processed.
-
-
elixir.bootlin.com elixir.bootlin.com
-
static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc)
The policy in this function governs the management of shadow nodes in the Linux kernel's page cache. It dynamically calculates a maximum allowable number of shadow nodes based on the available memory, aiming to maintain an optimal balance between memory usage and the ability to detect page refaults. The policy sets this limit at approximately 1/8th of the total possible pages, adapting to different memory configurations and pressure scenarios. When the current number of shadow nodes exceeds this calculated maximum, the policy determines how many nodes should be reclaimed.
-
f (lru_gen_enabled()) return lru_gen_eviction(folio);
When enabled via lru_gen_enabled(), it alters how the kernel tracks page access patterns and manages evictions. This policy introduces the concept of generation-based tracking, where pages are grouped into generations based on their access time. It uses functions like lru_gen_eviction() and lru_gen_refault() to handle page evictions and refaults respectively. This approach allows for more nuanced decisions in page reclaim, potentially improving memory utilization and reducing I/O operations by better identifying truly cold pages and protecting hot pages from premature eviction.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (vma->vm_flags & VM_PFNMAP) { int err = 1; if (ops->pte_hole) err = ops->pte_hole(start, end, -1, walk);
-
-
elixir.bootlin.com elixir.bootlin.com
-
unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks)
Simple, but optimize by size of page table. Triggered by checking CONFIG_HAVE_MOVE_PUD and CONFIG_HAVE_MOVE_PMD
-
-
elixir.bootlin.com elixir.bootlin.com
-
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) { int ret; bool found_slot = false; struct memory_tier *memtier, *new_memtier; int adistance = memtype->adistance; unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE; lockdep_assert_held_once(&memory_tier_lock); adistance = round_down(adistance, memtier_adistance_chunk_size); /* * If the memtype is already part of a memory tier, * just return that. */ if (!list_empty(&memtype->tier_sibiling)) { list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) return memtier; } WARN_ON(1); return ERR_PTR(-EINVAL); } list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) { goto link_memtype; } else if (adistance < memtier->adistance_start) { found_slot = true; break; } } new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL); if (!new_memtier) return ERR_PTR(-ENOMEM); new_memtier->adistance_start = adistance; INIT_LIST_HEAD(&new_memtier->list); INIT_LIST_HEAD(&new_memtier->memory_types); if (found_slot) list_add_tail(&new_memtier->list, &memtier->list); else list_add_tail(&new_memtier->list, &memory_tiers); new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS; new_memtier->dev.bus = &memory_tier_subsys; new_memtier->dev.release = memory_tier_device_release; new_memtier->dev.groups = memtier_dev_groups; ret = device_register(&new_memtier->dev); if (ret) { list_del(&new_memtier->list); put_device(&new_memtier->dev); return ERR_PTR(ret); } memtier = new_memtier; link_memtype: list_add(&memtype->tier_sibiling, &memtier->memory_types); return memtier; }
This function decides whether to assign a memory device to an existing memory tier or create a new one. The decision is based on the memory device's distance and the structure of existing memory tiers.
-
static void establish_demotion_targets(void) { struct memory_tier *memtier; struct demotion_nodes *nd; int target = NUMA_NO_NODE, node; int distance, best_distance; nodemask_t tier_nodes, lower_tier; lockdep_assert_held_once(&memory_tier_lock); if (!node_demotion) return; disable_all_demotion_targets(); for_each_node_state(node, N_MEMORY) { best_distance = -1; nd = &node_demotion[node]; memtier = __node_get_memory_tier(node); if (!memtier || list_is_last(&memtier->list, &memory_tiers)) continue; /* * Get the lower memtier to find the demotion node list. */ memtier = list_next_entry(memtier, list); tier_nodes = get_memtier_nodemask(memtier); /* * find_next_best_node, use 'used' nodemask as a skip list. * Add all memory nodes except the selected memory tier * nodelist to skip list so that we find the best node from the * memtier nodelist. */ nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes); /* * Find all the nodes in the memory tier node list of same best distance. * add them to the preferred mask. We randomly select between nodes * in the preferred mask when allocating pages during demotion. */ do { target = find_next_best_node(node, &tier_nodes); if (target == NUMA_NO_NODE) break; distance = node_distance(node, target); if (distance == best_distance || best_distance == -1) { best_distance = distance; node_set(target, nd->preferred); } else { break; } } while (1); } /* * Promotion is allowed from a memory tier to higher * memory tier only if the memory tier doesn't include * compute. We want to skip promotion from a memory tier, * if any node that is part of the memory tier have CPUs. * Once we detect such a memory tier, we consider that tier * as top tiper from which promotion is not allowed. */ list_for_each_entry_reverse(memtier, &memory_tiers, list) { tier_nodes = get_memtier_nodemask(memtier); nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); if (!nodes_empty(tier_nodes)) { /* * abstract distance below the max value of this memtier * is considered toptier. */ top_tier_adistance = memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1; break; } } /* * Now build the lower_tier mask for each node collecting node mask from * all memory tier below it. This allows us to fallback demotion page * allocation to a set of nodes that is closer the above selected * perferred node. */ lower_tier = node_states[N_MEMORY]; list_for_each_entry(memtier, &memory_tiers, list) { /* * Keep removing current tier from lower_tier nodes, * This will remove all nodes in current and above * memory tier from the lower_tier mask. */ tier_nodes = get_memtier_nodemask(memtier); nodes_andnot(lower_tier, lower_tier, tier_nodes); memtier->lower_tier_mask = lower_tier; } }
This function contains algorithmic policies that select demotion targets based on node distance, memory tier relationships, and CPU presence in memory nodes, and adjusts the demotion strategy based on runtime system conditions.
-
-
elixir.bootlin.com elixir.bootlin.com
-
enum get_ksm_page_flags { GET_KSM_PAGE_NOLOCK, GET_KSM_PAGE_LOCK, GET_KSM_PAGE_TRYLOCK };
This defines the locking behavior when accessing a KSM page. The caller can choose between no locking, locking, or trying to lock without blocking, affecting how the page is accessed and modified.
-
static struct page *get_ksm_page(struct ksm_stable_node *stable_node, enum get_ksm_page_flags flags) { struct page *page; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ page = pfn_to_page(kpfn); if (READ_ONCE(page->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { /* * Another check for page->mapping != expected_mapping would * work here too. We have chosen the !PageSwapCache test to * optimize the common case, when the page is or is about to * be freed: PageSwapCache is cleared (under spin_lock_irq) * in the ref_freeze section of __remove_mapping(); but Anon * page->mapping reset to NULL later, in free_pages_prepare(). */ if (!PageSwapCache(page)) goto stale; cpu_relax(); } if (READ_ONCE(page->mapping) != expected_mapping) { put_page(page); goto stale; } if (flags == GET_KSM_PAGE_TRYLOCK) { if (!trylock_page(page)) { put_page(page); return ERR_PTR(-EBUSY); } } else if (flags == GET_KSM_PAGE_LOCK) lock_page(page); if (flags != GET_KSM_PAGE_NOLOCK) { if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); goto stale; } } return page; stale: /* * We come here from above when page->mapping or !PageSwapCache * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; }
This function checks whether a KSM page is still valid by verifying its mapping, reference count, and handling page migration. It dynamically decides whether to return the page, retry, or mark the stable node as stale.
-
struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { struct folio *folio = page_folio(page); struct anon_vma *anon_vma = folio_anon_vma(folio); struct page *new_page; if (PageKsm(page)) { if (page_stable_node(page) && !(ksm_run & KSM_RUN_UNMERGE)) return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ } else if (page->index == linear_page_index(vma, address) && anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!PageUptodate(page)) return page; /* let do_swap_page report the error */ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page && mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) { put_page(new_page); new_page = NULL; } if (new_page) { if (copy_mc_user_highpage(new_page, page, address, vma)) { put_page(new_page); memory_failure_queue(page_to_pfn(page), 0); return ERR_PTR(-EHWPOISON); } SetPageDirty(new_page); __SetPageUptodate(new_page); __SetPageLocked(new_page); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_page; }
This function decides whether a page needs to be copied based on its status as a KSM page, its association with an anonymous VMA, and its hardware-poisoned or up-to-date status.
-
static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page; }
This function dynamically decides when two pages should be merged into a KSM page. If the merge fails, it applies a fallback policy by breaking the COW relationship, ensuring that memory management remains consistent.
-
static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; }
This function checks whether the VMA is mergable and proceeds based on that condition, and it dynamically removes the reverse mapping item from the unstable tree after a successful merge.
-
static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); }
This function checks whether the ksmd should run based on the KSM_RUN_MERGE flag and whether there are any memory regions to process. The decision to enable or disable memory merging is controlled by the ksm_run configuration.
-
static unsigned int ksm_thread_sleep_millisecs = 20;
This variable sets the delay (in milliseconds) for the KSM daemon to sleep between scanning batches. It controls the scanning frequency, balancing KSM's memory merging process with system performance.
-
static unsigned int ksm_thread_pages_to_scan = 100;
This variable determines the number of pages the KSM daemon will scan in one batch. It controls the performance and efficiency of KSM's scanning process, balancing between thoroughness and system load.
-
static int ksm_max_page_sharing = 256;
This variable controls the maximum number of page slots that can share a stable node in the stable tree for KSM. It defines a limit for page sharing, affecting how KSM consolidates memory pages.
-
static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
This variable sets the delay (in milliseconds) for pruning stale stable_node_dups in the stable_node_chains. It controls how often KSM will perform cleanup operations on stale nodes.
-
-
elixir.bootlin.com elixir.bootlin.com
-
randomize_stack_top
This function uses a configuration policy to enable Address Space Layout Randomization (ASLR) for a specific process if PF_RANDOMIZE flag is set. It randomly arranges the positions of stack of a process to help defend certain attacks by making memory addresses unpredictable.
-
-
elixir.bootlin.com elixir.bootlin.com
-
/* Soft offline could migrate non-LRU movable pages */ if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) return true;
The code includes a policy for soft offline pages (MF_SOFT_OFFLINE). This is a feature where the kernel attempts to migrate pages to avoid using faulty memory areas. The policy allows non-LRU movable pages (pages that aren’t in the Least Recently Used list) to be migrated.
-
if (TestSetPageHWPoison(p)) { pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); if (flags & MF_COUNT_INCREASED) put_page(p); goto unlock_mutex; }
The page isolation policy ensures that faulty pages are marked as hardware poisoned (using TestSetPageHWPoison). The kernel checks if a page has already been isolated (marked as poisoned) before taking any further action. This prevents faulty pages from being reused by the system and isolates them to avoid further corruption.
-
-
elixir.bootlin.com elixir.bootlin.com
-
if (movable_node_is_enabled()) {
This policy, as the comment states, will ignore the kernelcore and movablecore options if movable nodes are enabled (skipping the logic below this if statement and jumping to "out2" instead). The logic for the "movable_node_is_enabled()" function is in "memory_hotplug.h".
-
if (page_poisoning_enabled() || (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())) {
This check looks at flags and configuration settings to determine if page poisoning should be enabled.
-
if (descending)
This decides whether to iterate forward or backward through the zones passed into the function. The variable "descending" is determined by the "arch_has_descending_max_zone_pfns()" function on line 1811, which is determined from configuration options.
-