99 Matching Annotations
  1. Last 7 days
    1. if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= BIG_CHUNK_GAP) { mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; return 1;

      threshold for moving middle chunk to right after first chunk

    2. new_zhdr = __z3fold_alloc(pool, sz, false);

      GPT: allocates to same pool to avoid locality

    3. unbuddied = per_cpu_ptr(pool->unbuddied, cpu); spin_lock(&pool->lock); l = &unbuddied[chunks]; zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy);

      First fit alloc

    4. /* First, try to find an unbuddied z3fold page. */ unbuddied = this_cpu_ptr(pool->unbuddied); for_each_unbuddied_list(i, chunks) { struct list_head *l = &unbuddied[i]; zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy);

      First fit alloc

    5. if (!(handle & (1 << PAGE_HEADLESS))) { slots = handle_to_slots(handle); do { unsigned long addr; read_lock(&slots->lock); addr = *(unsigned long *)handle; zhdr = (struct z3fold_header *)(addr & PAGE_MASK); locked = z3fold_page_trylock(zhdr); read_unlock(&slots->lock); if (locked) { struct page *page = virt_to_page(zhdr); if (!test_bit(PAGE_MIGRATED, &page->private)) break; z3fold_page_unlock(zhdr); } cpu_relax(); } while (true); } else { zhdr = (struct z3fold_header *)(handle & PAGE_MASK); }

      check for policy

    6. static unsigned long __encode_handle(struct z3fold_header *zhdr, struct z3fold_buddy_slots *slots, enum buddy bud) { unsigned long h = (unsigned long)zhdr; int idx = 0; /* * For a headless page, its handle is its pointer with the extra * PAGE_HEADLESS bit set */ if (bud == HEADLESS) return h | (1 << PAGE_HEADLESS); /* otherwise, return pointer to encoded handle */ idx = __idx(zhdr, bud); h += idx; if (bud == LAST) h |= (zhdr->last_chunks << BUDDY_SHIFT); write_lock(&slots->lock); slots->slot[idx] = h; write_unlock(&slots->lock); return (unsigned long)&slots->slot[idx]; }

      Def policy code i think.

    7. /* * Returns the number of free chunks in a z3fold page. * NB: can't be used with HEADLESS pages. */ static int num_free_chunks(struct z3fold_header *zhdr) { int nfree; /* * If there is a middle object, pick up the bigger free space * either before or after it. Otherwise just subtract the number * of chunks occupied by the first and the last objects. */ if (zhdr->middle_chunks != 0) { int nfree_before = zhdr->first_chunks ? 0 : zhdr->start_middle - ZHDR_CHUNKS; int nfree_after = zhdr->last_chunks ? 0 : TOTAL_CHUNKS - (zhdr->start_middle + zhdr->middle_chunks); nfree = max(nfree_before, nfree_after); } else nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; return nfree; }

      policy use code at very least. Likely to be policy x4

    8. static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks) { enum buddy bud = HEADLESS; if (zhdr->middle_chunks) { if (!zhdr->first_chunks && chunks <= zhdr->start_middle - ZHDR_CHUNKS) bud = FIRST; else if (!zhdr->last_chunks) bud = LAST; } else { if (!zhdr->first_chunks) bud = FIRST; else if (!zhdr->last_chunks) bud = LAST; else bud = MIDDLE; } return bud; } static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { void *beg = zhdr; return memmove(beg + (dst_chunk << CHUNK_SHIFT), beg + (zhdr->start_middle << CHUNK_SHIFT), zhdr->middle_chunks << CHUNK_SHIFT); } static inline bool buddy_single(struct z3fold_header *zhdr) { return !((zhdr->first_chunks && zhdr->middle_chunks) || (zhdr->first_chunks && zhdr->last_chunks) || (zhdr->middle_chunks && zhdr->last_chunks)); }

      POLICY most likely x3

    9. { struct z3fold_pool *pool = zhdr_to_pool(zhdr); void *p = zhdr; unsigned long old_handle = 0; size_t sz = 0; struct z3fold_header *new_zhdr = NULL; int first_idx = __idx(zhdr, FIRST); int middle_idx = __idx(zhdr, MIDDLE); int last_idx = __idx(zhdr, LAST); unsigned short *moved_chunks = NULL; /* * No need to protect slots here -- all the slots are "local" and * the page lock is already taken */ if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) { p += ZHDR_SIZE_ALIGNED; sz = zhdr->first_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[first_idx]; moved_chunks = &zhdr->first_chunks; } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) { p += zhdr->start_middle << CHUNK_SHIFT; sz = zhdr->middle_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[middle_idx]; moved_chunks = &zhdr->middle_chunks; } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) { p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); sz = zhdr->last_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[last_idx]; moved_chunks = &zhdr->last_chunks; } if (sz > 0) { enum buddy new_bud = HEADLESS; short chunks = size_to_chunks(sz); void *q; new_zhdr = __z3fold_alloc(pool, sz, false); if (!new_zhdr) return NULL; if (WARN_ON(new_zhdr == zhdr)) goto out_fail; new_bud = get_free_buddy(new_zhdr, chunks); q = new_zhdr; switch (new_bud) { case FIRST: new_zhdr->first_chunks = chunks; q += ZHDR_SIZE_ALIGNED; break; case MIDDLE: new_zhdr->middle_chunks = chunks; new_zhdr->start_middle = new_zhdr->first_chunks + ZHDR_CHUNKS; q += new_zhdr->start_middle << CHUNK_SHIFT; break; case LAST: new_zhdr->last_chunks = chunks; q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT); break; default: goto out_fail; } new_zhdr->foreign_handles++; memcpy(q, p, sz); write_lock(&zhdr->slots->lock); *(unsigned long *)old_handle = (unsigned long)new_zhdr + __idx(new_zhdr, new_bud); if (new_bud == LAST) *(unsigned long *)old_handle |= (new_zhdr->last_chunks << BUDDY_SHIFT); write_unlock(&zhdr->slots->lock); add_to_unbuddied(pool, new_zhdr); z3fold_page_unlock(new_zhdr); *moved_chunks = 0; } return new_zhdr; out_fail: if (new_zhdr && !put_z3fold_locked(new_zhdr)) { add_to_unbuddied(pool, new_zhdr); z3fold_page_unlock(new_zhdr); } return NULL; }

      def contains policy x2. IMPr

    10. if (!zhdr->foreign_handles && buddy_single(zhdr) && zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { if (!put_z3fold_locked(zhdr)) { clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } return; }

      stands out as potential policy

    11. #define BIG_CHUNK_GAP 3 /* Has to be called with lock held */ static int z3fold_compact_page(struct z3fold_header *zhdr) { struct page *page = virt_to_page(zhdr); if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) return 0; /* can't move middle chunk, it's used */ if (unlikely(PageIsolated(page))) return 0; if (zhdr->middle_chunks == 0) return 0; /* nothing to compact */ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { /* move to the beginning */ mchunk_memmove(zhdr, ZHDR_CHUNKS); zhdr->first_chunks = zhdr->middle_chunks; zhdr->middle_chunks = 0; zhdr->start_middle = 0; zhdr->first_num++; return 1; } /* * moving data is expensive, so let's only do that if * there's substantial gain (at least BIG_CHUNK_GAP chunks) */ if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= BIG_CHUNK_GAP) { mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; return 1; } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 && TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle + zhdr->middle_chunks) >= BIG_CHUNK_GAP) { unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks - zhdr->middle_chunks; mchunk_memmove(zhdr, new_start); zhdr->start_middle = new_start; return 1; } return 0; }

      def policy in here. IMP

    12. if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE) return -ENOSPC; if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { retry: zhdr = __z3fold_alloc(pool, size, can_sleep); if (zhdr) { bud = get_free_buddy(zhdr, chunks); if (bud == HEADLESS) { if (!put_z3fold_locked(zhdr)) z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); goto retry; } page = virt_to_page(zhdr); goto found; } bud = FIRST; }

      Leaning towards it being refgular checks and not policy

    13. static const struct movable_operations z3fold_mops = { .isolate_page = z3fold_page_isolate, .migrate_page = z3fold_page_migrate, .putback_page = z3fold_page_putback, };

      Defines type of movable ops on pages. Most likely policy influencing struct

    14. static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; }

      Interesting formula to convert size of page to no of chunks. Look into why this makes sense, especially why CHUNK_SHIFT was chosen

    15. enum z3fold_page_flags { PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, NEEDS_COMPACTING, PAGE_STALE, PAGE_CLAIMED, /* by either reclaim or free */ PAGE_MIGRATED, /* page is migrated and soon to be released */ };

      Interesting enums to note

    16. struct z3fold_buddy_slots { /* * we are using BUDDY_MASK in handle_to_buddy etc. so there should * be enough slots to hold all possible variants */ unsigned long slot[BUDDY_MASK + 1]; unsigned long pool; /* back link */ rwlock_t lock; }; #define HANDLE_FLAG_MASK (0x03)

      Buddy slots

    17. static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) { struct z3fold_header *zhdr; struct page *page; void *addr; enum buddy buddy; zhdr = get_z3fold_header(handle); addr = zhdr; page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) goto out; buddy = handle_to_buddy(handle); switch (buddy) { case FIRST: addr += ZHDR_SIZE_ALIGNED; break; case MIDDLE: addr += zhdr->start_middle << CHUNK_SHIFT; set_bit(MIDDLE_CHUNK_MAPPED, &page->private); break; case LAST: addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT); break; default: pr_err("unknown buddy id %d\n", buddy); WARN_ON(1); addr = NULL; break; } if (addr) zhdr->mapped_count++; out: put_z3fold_header(zhdr); return addr; }

      hihi

    18. #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) #define CHUNK_SIZE (1 << CHUNK_SHIFT) #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) #define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) #define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) #define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS) #define BUDDY_MASK (0x3) #define BUDDY_SHIFT 2 #define SLOTS_ALIGN (0x40)

      Policy controlling chunks

    19. #define NCHUNKS_ORDER 6

      Controls internal fragmentation.

  2. Feb 2025
    1. #define PAGE_REPORTING_DELAY (2 * HZ)

      Min delay that allows for more unreported pages to accumulate

    2. int page_reporting_register

      Registers page reporting device and checks in reporting_req to see if we have started requesting or need to wait for queue to build up

    3. /* allocate scatterlist to store pages being reported on */ sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);

      Scatterlist is allocd here. arr that will contain contiguous blocks of phy uncontiguous mem.

    4. unsigned int page_reporting_order = -1;

      Kernel param that controls number of contiguous pages. -1 is default for invalid and 0 to MAX_ORDER is log base 2 of no.of pages.

    5. void page_reporting_unregister(struct page_reporting_dev_info *prdev)

      Same importance as unregister

  3. Jan 2025
    1. if (ret) { zswap_reject_reclaim_fail++; if (ret != -EAGAIN) break; if (++failures == MAX_RECLAIM_RETRIES) break; }

      The reclaim loop can only be run for 16 times, each time it tries to swap out the pool that the OOM killer is activated. This criteria to kill the reclaim process is decided by its oom_score determined by the kernel and adjustable in sysfs.

    2. zswap_pool_put(pool);

      not a policy, simply reduces reference count to the pool

    3. static int zswap_reclaim_entry(struct zswap_pool *pool) {

      Tries to writeback an entry to swap cache and removes it from zswap. We utilize this function when zpool and zswap are full, hence 'reclaim'.

    4. static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

      Not a policy but something to note. An array of red-black zswap_trees that contain zswap entries. Each tree is indexed by the swap device/file no.

    5. static unsigned int zswap_max_pool_percent = 20;

      Max percentage of total system RAM that can be used by the pool. Also a user controlled policy recognized by the Linux kernel documentation

    6. static bool zswap_same_filled_pages_enabled = true;

      Interesting optimization. If a page is just filled with purely 1s or 0s and this bool is true then it just compresses the page to one bit.

    7. static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);

      Im just adding on here but usually you choose between zswap and zram. The latter acts as a block device to store compressed page and thus finds use in embedded Linux systems while the former requires an existing swap device to act as a fallback for removing entries. This means that the policy likely here is choosing an internal compression storage in RAM based on the system (Embedded vs Server/datacenter)

    8. cond_resched();

      Signals that this process is low priority and allows the scheduler to run higher priority processes. Not a policy but low prioritization can indicate importance of zswap policies

    9. /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; zswap_pool_reached_full = true; goto shrink; } if (zswap_pool_reached_full) { if (!zswap_can_accept()) goto shrink; else zswap_pool_reached_full = false; }

      Checks if zswap device is full and if any zpools are available to store page. The check is based on what percentage of pool and RAM the user has allocated to zswap. The true policy arises in the question of WHY the user has set the percentage to those values. The shrink function is enabled in the case that there is no space in zswap.

    10. static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;

      Dictates the type of compression algorithm to use as a string, with tradeoffs in speed vs size compressed. Can also be changed at boot time by changing boot params in sysfs

    11. static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */

      Once you hit max no of pools, you dictate the max percentage of pages that a specific pool can accept before LRU page eviction to swap device

  4. Nov 2024
    1. #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; do { struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); return addr == end; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } #endif

      seems like a check to see if pages can be grabbed. A quick skim maybe hints possible checks if huge pages can be grabbed?

    2. #ifdef CONFIG_ARCH_HAS_HUGEPD static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { unsigned long __boundary = (addr + sz) & ~(sz-1); return (__boundary - 1 < end - 1) ? __boundary : end; } static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long pte_end; struct page *page; struct folio *folio; pte_t pte; int refs; pte_end = (addr + sz) & ~(sz-1); if (pte_end < end) end = pte_end; pte = huge_ptep_get(ptep); if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { pte_t *ptep; unsigned long sz = 1UL << hugepd_shift(hugepd); unsigned long next; ptep = hugepte_offset(hugepd, addr, pdshift); do { next = hugepte_addr_end(addr, end, sz); if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) return 0; } while (ptep++, addr = next, addr != end); return 1; } #else static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_HUGEPD */ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, pages, nr); } page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pud(orig, pudp, addr, end, flags, pages, nr); } page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *page; struct folio *folio; if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = pmdp_get_lockless(pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { /* See gup_pte_range() */ if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { /* * architecture have different format for hugetlbfs * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, PUD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); if (p4d_none(p4d)) return 0; BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } static void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; pgdp = pgd_offset(current->mm, addr); do { pgd_t pgd = READ_ONCE(*pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, flags, pages, nr)) return; } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else static inline void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { }

      policy use functions for gup_huge pte policy code function above (not right above, gotta scroll probably to find it)

    3. static int internal_get_user_pages_fast(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; int locked = 0; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | FOLL_FAST_ONLY | FOLL_NOFAULT | FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so * returning -errno is not an option */ if (nr_pinned) return nr_pinned; return ret; } return ret + nr_pinned; } /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * * If the architecture does not support this function, simply return with no * pages pinned. * * Careful, careful! COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. * * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); }

      fast gup functions

    4. /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. */ if (WARN_ON(IS_ERR_VALUE(npages))) return; sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } }

      gup unpin function, not actual logic

    5. void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } }

      unpin logic but for dirty pages

    6. if ((flags & FOLL_DUMP) && (vma_is_anonymous(vma) || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return NULL;

      explained in comments

    7. #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * Fast-gup relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, fast-gup needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with fast-gup, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because fast-gup may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); if (!ptep) return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; struct folio *folio; /* * Always fallback to ordinary GUP on PROT_NONE-mapped pages: * pte_access_permitted() better should reject these pages * either way: otherwise, GUP-fast might succeed in * cases where ordinary GUP would fail due to VMA access * permissions. */ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { if (unlikely(flags & FOLL_LONGTERM)) goto pte_unmap; pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio(page, 1, flags); if (!folio) goto pte_unmap; if (unlikely(folio_is_secretmem(folio))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: if (pgmap) put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

      non concurrent fast gup approach that checks for pinned page and unmaps pte or clears it

    8. #ifdef CONFIG_HAVE_FAST_GUP /* * Used in the GUP-fast path to determine whether a pin is permitted for a * specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). We * therefore try to avoid the most egregious case of a long-term mapping doing * so. * * This function cannot be as thorough as that one as the VMA is not available * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. */ static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) { struct address_space *mapping; unsigned long mapping_flags; /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the one we disallow. */ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) return true; /* The folio is pinned, so we can safely access folio fields. */ if (WARN_ON_ONCE(folio_test_slab(folio))) return false; /* hugetlb mappings do not require dirty-tracking. */ if (folio_test_hugetlb(folio)) return true; /* * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods * cannot proceed, which means no actions performed under RCU can * proceed either. * * inodes and thus their mappings are freed under RCU, which means the * mapping cannot be freed beneath us and thus we can safely dereference * it. */ lockdep_assert_irqs_disabled(); /* * However, there may be operations which _alter_ the mapping, so ensure * we read it once and only once. */ mapping = READ_ONCE(folio->mapping); /* * The mapping may have been truncated, in any case we cannot determine * if this mapping is safe - fall back to slow path to determine how to * proceed. */ if (!mapping) return false; /* Anonymous folios pose no problem. */ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; if (mapping_flags) return mapping_flags & PAGE_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an * address_space object. The only remaining whitelisted file system is * shmem. */ return shmem_mapping(mapping); }

      policy logic. avoids locks unlike get user pages unlocked/locked which seems risky so its not supposed to be used on concurrent gup logic

    9. long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); }

      policy logic.

    10. static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; ClearPageReferenced(page); if (flags & FOLL_PIN) unpin_user_page(page); else put_page(page); } }

      policy use function that undoes mapping

    11. #ifdef CONFIG_MIGRATION /* * Returns the number of collected pages. Return value is always >= 0. */ static unsigned long collect_longterm_unpinnable_pages( struct list_head *movable_page_list, unsigned long nr_pages, struct page **pages) { unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); if (folio == prev_folio) continue; prev_folio = folio; if (folio_is_longterm_pinnable(folio)) continue; collected++; if (folio_is_device_coherent(folio)) continue; if (folio_test_hugetlb(folio)) { isolate_hugetlb(folio, movable_page_list); continue; } if (!folio_test_lru(folio) && drain_allow) { lru_add_drain_all(); drain_allow = false; } if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } return collected; }
    12. #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { struct page *page; int locked = 0; int ret; ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */

      part of policy use code likely

    13. int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; end = start + len; for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ if (!locked) { locked = 1; mmap_read_lock(mm); vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) vma = find_vma_intersection(mm, vma->vm_end, end); if (!vma) break; /* * Set [nstart; nend) to intersection of desired address * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; /* * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ } break; } nend = nstart + ret * PAGE_SIZE; ret = 0; } if (locked) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ }

      policy use function that populates pages like the func before this.

    14. long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int local_locked = 1; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * Rightly or wrongly, the VM_LOCKONFAULT case has never used * faultin_page() to break COW, so it has no work to do here. */ if (vma->vm_flags & VM_LOCKONFAULT) return nr_pages; gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; /* * We want mlock to succeed for regions that have any permissions * other than PROT_NONE. */ if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; if (locked) gup_flags |= FOLL_UNLOCKABLE; /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; }

      policy use code.

    15. long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); }

      policy logic

    16. static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int flags) { long ret, pages_done; bool must_unlock = false; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } else mmap_assert_locked(mm); if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has * carelessly failed to specify FOLL_GET), so keep doing that, but only * for FOLL_GET, not for the newer FOLL_PIN. * * FOLL_PIN always expects pages to be non-null, but no need to assert * that here, as any failures will be obvious enough. */ if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ if (!*locked) { BUG_ON(ret < 0); BUG_ON(ret >= nr_pages); } if (ret > 0) { nr_pages -= ret; pages_done += ret; if (!nr_pages) break; } if (*locked) { /* * VM_FAULT_RETRY didn't trigger or it was a * FOLL_NOWAIT. */ if (!pages_done) pages_done = ret; break; } /* * VM_FAULT_RETRY triggered, so seek to the faulting offset. * For the prefault case (!pages) we only update counts. */ if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; /* The lock was temporarily dropped, so we must unlock later */ must_unlock = true; retry: /* * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted * by fatal signals of even common signals, depending on * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) { BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; } *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); goto retry; } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) pages_done = ret; break; } nr_pages--; pages_done++; if (!nr_pages) break; if (likely(pages)) pages++; start += PAGE_SIZE; } if (must_unlock && *locked) { /* * We either temporarily dropped the lock, or the caller * requested that we both acquire and drop the lock. Either way, * we must now unlock, and notify the caller of that state. */ mmap_read_unlock(mm); *locked = 0; } return pages_done; }

      same as gup but sets/unsets mmap_lock

    17. if (!(flags & FOLL_INTERRUPTIBLE)) return false;

      fatal fault signal handler

    18. int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { struct vm_area_struct *vma; vm_fault_t ret; address = untagged_addr_remote(mm, address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = gup_vma_lookup(mm, address); if (!vma) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current)) return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; } return 0; }

      resolves user page fault. policy logic

    19. static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; start = untagged_addr_remote(mm, start); VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); do { struct page *page; unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { /* * MADV_POPULATE_(READ|WRITE) wants to handle VMA * lookups+error reporting differently. */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &page : NULL); if (ret) goto out; ctx.page_mask = 0; goto next_page; } if (!vma) { ret = -EFAULT; goto out; } ret = check_vma_flags(vma, gup_flags); if (ret) goto out; } retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, &foll_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: goto retry; case -EBUSY: case -EAGAIN: ret = 0; fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: goto out; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding * struct page. If the caller expects **pages to be * filled in, bail out now, because that can't be done * for this page. */ if (pages) { ret = PTR_ERR(page); goto out; } } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } next_page: page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; if (pages) { struct page *subpage; unsigned int j; /* * This must be a large folio (and doesn't need to * be the whole folio; it can be part of it), do * the refcount work for all the subpages too. * * NOTE: here the page may not be the head page * e.g. when start addr is not thp-size aligned. * try_grab_folio() should have taken care of tail * pages. */ if (page_increm > 1) { struct folio *folio; /* * Since we already hold refcount on the * large folio, this should never fail. */ folio = try_grab_folio(page, page_increm - 1, foll_flags); if (WARN_ON_ONCE(!folio)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ gup_put_folio(page_folio(page), 1, foll_flags); ret = -EFAULT; goto out; } } for (j = 0; j < page_increm; j++) { subpage = nth_page(page, j); pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); } } i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return i ? i : ret; }

      Literally the actual policy logic of gup. Most important piece of code right here for gup

    20. #ifdef CONFIG_STACK_GROWSUP return vma_lookup(mm, addr); #else static volatile unsigned long next_warn; struct vm_area_struct *vma; unsigned long now, next; vma = find_vma(mm, addr); if (!vma || (addr >= vma->vm_start)) return vma; /* Only warn for half-way relevant accesses */ if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; if (vma->vm_start - addr > 65536) return NULL; /* Let's not warn more than once an hour.. */ now = jiffies; next = next_warn; if (next && time_before(now, next)) return NULL; next_warn = now + 60*60*HZ; /* Let people know things may have changed. */ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n", current->comm, task_pid_nr(current), vma->vm_start, vma->vm_end, addr); dump_stack(); return NULL;

      helper func to lookup vma(virtual mem area) that warns per hour about half way relevant acc and changes in stack

    21. static bool writable_file_mapping_allowed(struct vm_area_struct *vma, unsigned long gup_flags) { /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the case we disallow. */ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true; /* * If the VMA does not require dirty tracking then no problematic write * can occur either. */ return !vma_needs_dirty_tracking(vma); }

      Def policy code. checks if we can write to a map

    22. if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; if (*flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ if (*flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist */ fault_flags |= FAULT_FLAG_TRIED; } if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * With FAULT_FLAG_RETRY_NOWAIT we'll never release the * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); *locked = 0; /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of * what has happened - we've just fully completed a page * fault, with the mmap lock released. Use -EAGAIN to show * that we want to take the mmap lock _again_. */ return -EAGAIN; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; }

      Seems it's just setting flags for page faults based on flags param

    23. /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; if (address > TASK_SIZE) pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return -EFAULT; pud = pud_offset(p4d, address); if (pud_none(*pud)) return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; entry = ptep_get(pte); if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; *page = vm_normal_page(*vma, address, entry); if (!*page) { if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) goto unmap;

      Most of these seem like sanity checks right up until line 897 i.e, 'if(!page)'* after which we seem to unmap the page.

    24. static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; /* * Call hugetlb_follow_page_mask for hugetlb vmas as it will use * special hugetlb page table walking code. This eliminates the * need to check for hugetlb entries in the general walking code. */ if (is_vm_hugetlb_page(vma)) return hugetlb_follow_page_mask(vma, address, flags, &ctx->page_mask); pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); return follow_p4d_mask(vma, address, pgd, flags, ctx); }

      places mask after following page into pte

    25. struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { struct follow_page_context ctx = { NULL }; struct page *page; if (vma_is_secretmem(vma)) return NULL; if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) return NULL; /* * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect * to fail on PROT_NONE-mapped pages. */ page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return page; }

      finds page

    26. if (flags & FOLL_SPLIT_PMD) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); ctx->page_mask = HPAGE_PMD_NR - 1; return page;

      we're finding the page again but storing page mask in ctx

    27. if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); }

      branch prediction to check if pmd is there and if it's big

    28. if (pmd_none(pmdval)) return no_page_table(vma, flags); if (!pmd_present(pmdval)) return no_page_table(vma, flags); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; }

      checks if pmd is there. im assuming it's page middle dir.

    29. /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); /* * We only care about anon pages in can_follow_write_pte() and don't * have to worry about pte_devmap() because they are never anon. */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { page = NULL; goto out; } if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap * reference. */ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); if (*pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ ret = try_grab_page(page, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); }

      finds page in pte. Judging by the complexity of the logic this is most likely policy code because we're literally getting user page

    30. if (pte_write(pte)) return true; /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ if (!page || !PageAnon(page) || !PageAnonExclusive(page)) return false; /* ... and a write-fault isn't required for other reasons. */ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) return false; return !userfaultfd_pte_wp(vma, pte);

      flag checks to see if u can write to a pte

    31. if (flags & FOLL_TOUCH) { pte_t orig_entry = ptep_get(pte); pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); }

      uses pte to mark dirty pages and finds pfn in pte

    32. void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; if (!make_dirty) { unpin_user_pages(pages, npages); return; } sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key * cases: * * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called * page_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes * on to call TestClearPageDirty(), and write the page * back. * * 2) This code sees the page as clean, so it calls * set_page_dirty(). The page stays dirty, despite being * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ if (!folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } }

      unpins and dirties page

    33. static inline struct folio *gup_folio_next(struct page **list, unsigned long npages, unsigned long i, unsigned int *ntails) { struct folio *folio = page_folio(list[i]); unsigned int nr; for (nr = i + 1; nr < npages; nr++) { if (page_folio(list[nr]) != folio) break; } *ntails = nr - i; return folio; }

      gets folio of next page along with reference to end of folio

    34. static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { struct page *next = nth_page(start, i); struct folio *folio = page_folio(next); unsigned int nr = 1; if (folio_test_large(folio)) nr = min_t(unsigned int, npages - i, folio_nr_pages(folio) - folio_page_idx(folio, next)); *ntails = nr; return folio; }

      gets the folio of the next page from start to 'i' range. also gets the tail folio/reference

    35. folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);

      function for adding reference

    36. void unpin_user_page(struct page *page) { sanity_check_pinned_pages(&page, 1); gup_put_folio(page_folio(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page);

      actual policy use logic

    37. struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_inc(folio);

      checks for code that is involved in policy but is not the actual logic

    38. else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return 0; /* * Similar to try_grab_folio(): be sure to *also* * increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); }

      Logic that actually tries to grab the folio. Also policy use code and not actual policy

    39. if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs);

      Definitely a vital and straightforward policy use section of gup that simples places a reference on the folio

    40. if (flags & FOLL_PIN) { if (is_zero_folio(folio)) return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; }

      Checks if the folio is zero/large

    41. if (folio_test_large(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1))

      maintaining reference counts. Part of policy logic most likely

    42. if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); return NULL;

      checks for longterm folio pins.

    43. if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) return NULL; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL;

      Time saving predictions(unlikely) and single time warning func(WARN_ON_ONCE) for flags. Not actual policy logic so low confidence.

    44. if (unlikely(page_folio(page) != folio)) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); goto retry;

      Uses prediction to check if a folio still points to the page. This is part of the function that tries to retrieve the folio to confirm that it is associated with a page.

    45. folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; if (unlikely(!folio_ref_try_add(folio, refs))) return NULL;

      These increment the reference count for the folio since you're returning a reference of the folio. Important function so important internal logic subsequently

    46. if (is_zero_page(page) || !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio))

      Sanity checks for pinned pages wouldn't classify as policy logic but common sense pre-checks for the actual policy. But I think it's worth tagging this to gain a sense of what is not policy code

    47. if (is_zero_page(page)) return page_folio(page); folio = try_get_folio(page, refs); if (!folio) return NULL;

      Just trying to check for zero pages and trying to retrieve folios. Unlikely policy logic

    48. if (flags & FOLL_GET) return try_get_folio(page, refs);

      Policy logic that determines and tries to retrieve folios based on given flags.

    49. if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT;

      checking for overflow in page alloc likely

    50. if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } }

      part of policy code

    51. if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags);

      policy decision to get locked page!

    52. if (folio_is_device_coherent(folio)) { /* * Migration will fail if the page is pinned, so convert * the pin on the source page to a normal reference. */ pages[i] = NULL; folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); if (migrate_device_coherent_page(&folio->page)) { ret = -EBUSY; goto err; } continue; }

      algorithm/fnc decision

    53. vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

      vm_flags

    54. if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; }

      flag

    55. if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; }

      a lot of VM fault flags that are checked when calling a function that handles page faults

    56. if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current))

      flag fault killable

    57. if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags);
    58. if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

      flag pol to check if map is unlocked (for pos remap?)

    59. if (page_increm > nr_pages) page_increm = nr_pages;

      next page logic

    60. */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; }

      page populate flag for sure

    61. if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ if (is_vm_hugetlb_page(vma)) return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could * set a breakpoint in a read-only mapping of an * executable, without corrupting the file (yet only * when that file had been opened for writing!). * Anon pages in shared mappings are surprising: now * just reject it. */ if (!is_cow_mapping(vm_flags)) return -EFAULT; } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * Is there actually any vma we can reach here which does not * have VM_MAYREAD set? */ if (!(vm_flags & VM_MAYREAD)) return -EFAULT; }

      cow mapping, sdw stack seem like imp policies which dont seem like regular flags

  5. Oct 2024
    1. if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true;

      most likely part of a policy code that includes flag based decision making

    2. if (ctx.pgmap) put_dev_pagemap(ctx.pgmap);

      Low confidence policy

    3. if (flags & FOLL_TOUCH)

      value policy!!