Hypothesis

368 Matching Annotations

Apr 2025
elixir.bootlin.com elixir.bootlin.com

topology.c - kernel/sched/topology.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

1
1. Rajitb 18 Apr 2025
  
  in Public
  
  #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) /* Build perf. domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) { has_eas = true; goto match3; } } /* No match - add perf. domains for a new rd */ has_eas |= build_perf_domains(doms_new[i]);
  
  Policy that creates performance domains if EAS is enabled. This means that the groups of CPUs inside each domain is governed by schedutil + EAS. Schedutil is the only CPU governor that fits EAS.
  
  #ldos-config #ldos-ifdef-code
Visit annotations in context

Tags

#ldos-ifdef-code

#ldos-config

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/kernel/sched/topology.c
elixir.bootlin.com elixir.bootlin.com

zswap.c - mm/zswap.c - Linux source code v6.6.42 - Bootlin

1
1. Rajitb 16 Apr 2025
  
  in Public
  
  if (!zswap_non_same_filled_pages_enabled) goto freepage;
  
  Don't cache 'bigger' unoptimized file.
  
  #ldos-flag
Visit annotations in context

Tags

#ldos-flag

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/zswap.c
elixir.bootlin.com elixir.bootlin.com

slab_common.c - mm/slab_common.c - Linux source code v6.6.42 - Bootlin

1
1. monishak 04 Apr 2025
  
  in Public
  
  int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, gfp_t gfp) {
  
  Randomizes allocation within cache
  
  #ldos-algo
Visit annotations in context

Tags

#ldos-algo

Annotators

monishak

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/slab_common.c
Mar 2025
elixir.bootlin.com elixir.bootlin.com

migrate.c - mm/migrate.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

2
1. Rajitb 21 Mar 2025
  
  in Public
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR #else #define NR_MAX_BATCHED_MIGRATION 512 #endif #define NR_MAX_MIGRATE_PAGES_RETRY 10 #define NR_MAX_MIGRATE_ASYNC_RETRY 3 #define NR_MAX_MIGRATE_SYNC_RETRY \ (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
  
  Policy that determines batch of pages that can be migrated in one migration cycle and also specifies on the number of retries that are possible for re-migration (async vs sync)
  
  #ldos-high-confidence #ldos-value
2. Rajitb 21 Mar 2025
  
  in Public
  
  int migrate_pages(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int rc, rc_gather; int nr_pages; struct folio *folio, *folio2; LIST_HEAD(folios); LIST_HEAD(ret_folios); LIST_HEAD(split_folios); struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); memset(&stats, 0, sizeof(stats)); rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; again: nr_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { /* Retried hugetlb folios will be kept in list */ if (folio_test_hugetlb(folio)) { list_move_tail(&folio->lru, &ret_folios); continue; } nr_pages += folio_nr_pages(folio); if (nr_pages >= NR_MAX_BATCHED_MIGRATION) break; } if (nr_pages >= NR_MAX_BATCHED_MIGRATION) list_cut_before(&folios, from, &folio2->lru); else list_splice_init(from, &folios); if (mode == MIGRATE_ASYNC) rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios, &split_folios, &stats, NR_MAX_MIGRATE_PAGES_RETRY); else rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios, &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; list_splice_tail(&split_folios, &ret_folios); goto out; } if (!list_empty(&split_folios)) { /* * Failure isn't counted since all split folios of a large folio * is counted as 1 failure already. And, we only try to migrate * with minimal effort, force MIGRATE_ASYNC mode and retry once. */ migrate_pages_batch(&split_folios, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); list_splice_tail_init(&split_folios, &ret_folios); } rc_gather += rc; if (!list_empty(from)) goto again; out: /* * Put the permanent failure folio back to migration list, they * will be put back to the right list by the caller. */ list_splice(&ret_folios, from); /* * Return 0 in case all split folios of fail-to-migrate large folios * are migrated successfully. */ if (list_empty(from)) rc_gather = 0; count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, stats.nr_thp_succeeded, stats.nr_thp_failed, stats.nr_thp_split, mode, reason); if (ret_succeeded) *ret_succeeded = stats.nr_succeeded; return rc_gather; }
  
  Might.... be main func for migrating pages maintained by a folio to another free folio. No indication of policy... Except for the macros. Also worth looking into how list data structs are used. Overall not an interesting func despite linux kernel documentation (not the comments in here)
  
  #ldos-low-confidence
Visit annotations in context

Tags

#ldos-value

#ldos-high-confidence

#ldos-low-confidence

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/migrate.c
elixir.bootlin.com elixir.bootlin.com

slab.c - mm/slab.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

4
1. monishak 21 Mar 2025
  
  in Public
  
  n->colour_next++; if (n->colour_next >= cachep->colour) n->colour_next = 0; offset = n->colour_next; if (offset >= cachep->colour) offset = 0; offset *= cachep->colour_off;
  
  Colors are rotated through for cache coloring and offset values for next slab is calculated.
  
  #ldos-calculation
2. monishak 21 Mar 2025
  
  in Public
  
  cachep->align = ralign; cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ if (cachep->colour_off < cachep->align) cachep->colour_off = cachep->align;
  
  defines cache coloring alignments to ensure objects likely to be accessed together aren't placed in the same cache lines
  
  #ldos-calculation
3. monishak 20 Mar 2025
  
  in Public
  
  #define REAPTIMEOUT_NODE
  
  hardcoded values for when the system needs to reap caches
  
  #ldos-value
4. monishak 20 Mar 2025
  
  in Public
  
  size_t slab_size = PAGE_SIZE << gfporder;
  
  slab size is fixed based off whatever value for the page order is passed in
  
  #ldos-calculation
Visit annotations in context

Tags

#ldos-calculation

#ldos-value

Annotators

monishak

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/slab.c
Feb 2025
elixir.bootlin.com elixir.bootlin.com

vmpressure.c - mm/vmpressure.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

3
1. monishak 28 Feb 2025
  
  in Public
  
  pressure = scale - (reclaimed * scale / scanned); pressure = pressure * 100 / scale;
  
  Calculates pressure via reclaim efficiency rather than actual memory pressure.
  
  #ldos-calculation
2. monishak 28 Feb 2025
  
  in Public
  
  static const unsigned int vmpressure_level_med = 60; static const unsigned int vmpressure_level_critical = 95;
  
  Encoded threshold values - could be machine dependent.
  
  #ldos-value
3. monishak 28 Feb 2025
  
  in Public
  
  static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
  
  changed based on machine but also application potentially. Goal is to minimize the number of false positives.
  
  #ldos-value
Visit annotations in context

Tags

#ldos-calculation

#ldos-value

Annotators

monishak

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/vmpressure.c
elixir.bootlin.com elixir.bootlin.com

ksm.c - mm/ksm.c - Linux source code v6.6.42 - Bootlin

6
1. monishak 21 Feb 2025
  
  in Public
  
  static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) {
  
  This method chooses what section to process next by going in the order of the linked list - could be improved by going off historical success rates
  
  #ldos-algo
2. monishak 19 Feb 2025
  
  in Public
  
  static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
  
  Can be used to evaluate most efficient use of ksm
  
  #ldos-calculation
3. monishak 14 Feb 2025
  
  in Public
  
  if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock);
  
  Determines where to insert the process in the list of processes.
  
  #ldos-algo
4. monishak 14 Feb 2025
  
  in Public
  
  switch (advice) { case MADV_MERGEABLE: if (vma->vm_flags & VM_MERGEABLE) return 0; if (!vma_ksm_compatible(vma)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } *vm_flags |= VM_MERGEABLE; break; case MADV_UNMERGEABLE: if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ if (vma->anon_vma) { err = unmerge_ksm_pages(vma, start, end, true); if (err) return err; } *vm_flags &= ~VM_MERGEABLE; break; }
  
  Sets whether pages should be merged or unmerged, relies on flags in order to do so.
  
  #ldos-flag #ldos-algo
5. monishak 14 Feb 2025
  
  in Public
  
  sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_interruptible_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms));
  
  Sleep duration dynamically adjusted, if this value changes while sleeping then it will wake up immediately. But uses intial hardcoded values - potential to change this.
  
  #ldos-value
6. monishak 14 Feb 2025
  
  in Public
  
  set_user_nice(current, 5);
  
  Potentially delays function by setting priority low so that when CPU is too busy, it will get delayed until more space frees up
  
  #ldos-flag
Visit annotations in context

Tags

#ldos-flag

#ldos-value

#ldos-calculation

#ldos-algo

Annotators

monishak

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/ksm.c
elixir.bootlin.com elixir.bootlin.com

z3fold.c - mm/z3fold.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

1
1. Rajitb 20 Feb 2025
  
  in Public
  
  #define NCHUNKS_ORDER 6
  
  used for bit shift ops. 2^6 = 64 bytes or chunks depending upon your problem
  
  #ldos-policy-use
Visit annotations in context

Tags

#ldos-policy-use

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/z3fold.c
elixir.bootlin.com elixir.bootlin.com

page_alloc.c - mm/page_alloc.c - Linux source code v6.6.42 - Bootlin

1
1. yushanq 13 Feb 2025
  
  in Public
  
  static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; int min_priority; bool ret = false; int retries = *compaction_retries; enum compact_priority priority = *compact_priority; if (!order) return false; if (fatal_signal_pending(current)) return false; /* * Compaction was skipped due to a lack of free order-0 * migration targets. Continue if reclaim can help. */ if (compact_result == COMPACT_SKIPPED) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } /* * Compaction managed to coalesce some page blocks, but the * allocation failed presumably due to a race. Retry some. */ if (compact_result == COMPACT_SUCCESS) { /* * !costly requests are much more important than * __GFP_RETRY_MAYFAIL costly ones because they are de * facto nofail and invoke OOM killer to move on while * costly can fail and users are ready to cope with * that. 1/4 retries is rather arbitrary but we would * need much more detailed feedback from compaction to * make a better decision. */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; if (++(*compaction_retries) <= max_retries) { ret = true; goto out; } } /* * Compaction failed. Retry with increasing priority. */ min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; ret = true; } out: trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); return ret; }
  
  This function determines whether the system should retry memory compaction during page allocation. 1. If the allocation order is 0, no need for compaction 2. If compaction was skipped because of a lack of free order-0 pages: Check if memory reclaim can help. 3. If compaction succeeded but allocation still failed: retry 1/4 times. 4. If the compaction failed. retry with increase priority
  
  #ldos-policy
Visit annotations in context

Tags

#ldos-policy

Annotators

yushanq

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/page_alloc.c
elixir.bootlin.com elixir.bootlin.com

vmstat.c - mm/vmstat.c - Linux source code v6.6.42 - Bootlin

1
1. yushanq 11 Feb 2025
  
  in Public
  
  /* * A fragmentation index only makes sense if an allocation of a requested * size would fail. If that is true, the fragmentation index indicates * whether external fragmentation or a lack of memory was the problem. * The value can be used to determine if page reclaim or compaction * should be used */ static int __fragmentation_index(unsigned int order, struct contig_page_info *info) { unsigned long requested = 1UL << order; if (WARN_ON_ONCE(order > MAX_ORDER)) return 0; if (!info->free_blocks_total) return 0; /* Fragmentation index only makes sense when a request would fail */ if (info->free_blocks_suitable) return -1000; /* * Index is between 0 and 1 so return within 3 decimal places * * 0 => allocation would fail due to lack of memory * 1 => allocation would fail due to fragmentation */ return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); }
  
  An algorithm to calculate the current zone’s external fragmentation index, where a higher value indicates greater fragmentation.
  
  #ldos-algo
Visit annotations in context

Tags

#ldos-algo

Annotators

yushanq

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/vmstat.c
elixir.bootlin.com elixir.bootlin.com

compaction.c - mm/compaction.c - Linux source code v6.6.42 - Bootlin

2
1. yushanq 11 Feb 2025
  
  in Public
  
  .mode = (prio == COMPACT_PRIO_ASYNC) ? MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
  
  A migration mode config asynchronous migration : synchronous migration
  
  #ldos-config
2. yushanq 11 Feb 2025
  
  in Public
  
  /* Returns true if compaction should be skipped this time */ static bool compaction_deferred(struct zone *zone, int order) { unsigned long defer_limit = 1UL << zone->compact_defer_shift; if (order < zone->compact_order_failed) return false; /* Avoid possible overflow */ if (++zone->compact_considered >= defer_limit) { zone->compact_considered = defer_limit; return false; } trace_mm_compaction_deferred(zone, order); return true; }
  
  Trivial policy. The function to decide if the system can defer current compaction request. defer -> if the number of page order >= threshold (defer_limit) The threshold is an integer.
  
  ldos-algo
Visit annotations in context

Tags

ldos-algo

#ldos-config

Annotators

yushanq

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/compaction.c
elixir.bootlin.com elixir.bootlin.com

compaction.h - include/linux/compaction.h - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

1
1. yushanq 11 Feb 2025
  
  in Public
  
  enum compact_priority { COMPACT_PRIO_SYNC_FULL, MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_FULL, COMPACT_PRIO_SYNC_LIGHT, MIN_COMPACT_COSTLY_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, COMPACT_PRIO_ASYNC, INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC };
  
  Define the priority of compaction strategy. 0: COMPACT_PRIO_SYNC_FULL 1: COMPACT_PRIO_SYNC_LIGHT 2: COMPACT_PRIO_ASYNC
  
  #ldos-config
Visit annotations in context

Tags

#ldos-config

Annotators

yushanq

URL

elixir.bootlin.com/linux/v6.6.42/source/include/linux/compaction.h
elixir.bootlin.com elixir.bootlin.com

page_reporting.c - mm/page_reporting.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

1
1. Rajitb 07 Feb 2025
  
  in Public
  
  unsigned int page_reporting_order = -1;
  
  Kernel param that controls number of contiguous pages. -1 is default for invalid and 0 to MAX_ORDER is log base 2 of no.of pages.
  
  #ldos-value
Visit annotations in context

Tags

#ldos-value

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/page_reporting.c
elixir.bootlin.com elixir.bootlin.com

readahead.c - mm/readahead.c - Linux source code v6.6.42 - Bootlin

7
1. monishak 06 Feb 2025
  
  in Public
  
  static void ondemand_readahead(struct readahead_control *ractl, struct folio *folio, unsigned long req_size) {
  
  Function uses heuristics to allocate readahead size when assumed/predicted to be sequential reads. However, mostly in trivial cases so may not be worth changing.
  
  #ldos-function
2. monishak 06 Feb 2025
  
  in Public
  
  expected = round_down(ra->start + ra->size - ra->async_size, 1UL << order);
  
  Assumes sequential access and increases readahead size in expectation.
  
  #ldos-value
3. monishak 06 Feb 2025
  
  in Public
  
  if (new_order < MAX_PAGECACHE_ORDER) {
  
  Adjusts new_order so that it's balanced with the readahead window size while increasing it as much as possible.
  
  #ldos-calculation
4. monishak 06 Feb 2025
  
  in Public
  
  if (size >= index) size *= 2;
  
  Assumes that there will be a long sequential read - potentially overestimating.
  
  #ldos-value
5. monishak 06 Feb 2025
  
  in Public
  
  if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur;
  
  Set values for scaling readahead window. Might be too minor to need changes though.
  
  #ldos-calculation
6. monishak 06 Feb 2025
  
  in Public
  
  unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize;
  
  These are all heuristics to determine intial readahead size. Trades potentially memory overuse for faster access. Could be changed to not use these hardcoded values.
  
  #ldos-calculation
7. monishak 06 Feb 2025
  
  in Public
  
  if (unlikely(rac->_workingset))
  
  Heuristic that says _workingset likely isn't set (pages being read aren't likely to be used soon and don't belong to physical memory). When true, stalls current task due to lack of memory (relieves memory pressure)
  
  #ldos-flag
Visit annotations in context

Tags

#ldos-calculation

#ldos-value

#ldos-flag

#ldos-function

Annotators

monishak

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/readahead.c
Jan 2025
elixir.bootlin.com elixir.bootlin.com

zswap.c - mm/zswap.c - Linux source code v6.6.42 - Bootlin

1
1. Rajitb 26 Jan 2025
  
  in Public
  
  static unsigned int zswap_max_pool_percent = 20;
  
  Max percentage of total system RAM that can be used by the pool. Also a user controlled policy recognized by the Linux kernel documentation
  
  #ldos-high-confidence #ldos-value
Visit annotations in context

Tags

#ldos-value

#ldos-high-confidence

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/zswap.c
Nov 2024
elixir.bootlin.com elixir.bootlin.com

gup.c - mm/gup.c - Linux source code v6.6.42 - Bootlin

59
1. Rajitb 11 Nov 2024
  
  in Public
  
  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; do { struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); return addr == end; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } #endif
  
  seems like a check to see if pages can be grabbed. A quick skim maybe hints possible checks if huge pages can be grabbed?
  
  #ldos-ifdef-code #ldos-function #ldos-high-confidence
2. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_ARCH_HAS_HUGEPD static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { unsigned long __boundary = (addr + sz) & ~(sz-1); return (__boundary - 1 < end - 1) ? __boundary : end; } static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long pte_end; struct page *page; struct folio *folio; pte_t pte; int refs; pte_end = (addr + sz) & ~(sz-1); if (pte_end < end) end = pte_end; pte = huge_ptep_get(ptep); if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { pte_t *ptep; unsigned long sz = 1UL << hugepd_shift(hugepd); unsigned long next; ptep = hugepte_offset(hugepd, addr, pdshift); do { next = hugepte_addr_end(addr, end, sz); if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) return 0; } while (ptep++, addr = next, addr != end); return 1; } #else static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_HUGEPD */ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, pages, nr); } page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pud(orig, pudp, addr, end, flags, pages, nr); } page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *page; struct folio *folio; if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = pmdp_get_lockless(pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { /* See gup_pte_range() */ if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { /* * architecture have different format for hugetlbfs * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, PUD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); if (p4d_none(p4d)) return 0; BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } static void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; pgdp = pgd_offset(current->mm, addr); do { pgd_t pgd = READ_ONCE(*pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, flags, pages, nr)) return; } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else static inline void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { }
  
  policy use functions for gup_huge pte policy code function above (not right above, gotta scroll probably to find it)
  
  #ldos-function #ldos-flag #ldos-low-confidence #ldos-ifdef-code #ldos-policy-use
3. Rajitb 11 Nov 2024
  
  in Public
  
  static int internal_get_user_pages_fast(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; int locked = 0; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | FOLL_FAST_ONLY | FOLL_NOFAULT | FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so * returning -errno is not an option */ if (nr_pinned) return nr_pinned; return ret; } return ret + nr_pinned; } /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * * If the architecture does not support this function, simply return with no * pages pinned. * * Careful, careful! COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. * * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); }
  
  fast gup functions
  
  #ldos-function #ldos-low-confidence #ldos-ifdef-code #ldos-flag #ldos-policy-use
4. Rajitb 11 Nov 2024
  
  in Public
  
  /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. */ if (WARN_ON(IS_ERR_VALUE(npages))) return; sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } }
  
  gup unpin function, not actual logic
  
  #ldos-function #ldos-low-confidence #ldos-flag #ldos-policy-use
5. Rajitb 11 Nov 2024
  
  in Public
  
  void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } }
  
  unpin logic but for dirty pages
  
  #ldos-function #ldos-low-confidence #ldos-policy-use
6. Rajitb 11 Nov 2024
  
  in Public
  
  if ((flags & FOLL_DUMP) && (vma_is_anonymous(vma) || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return NULL;
  
  explained in comments
  
  #ldos-low-confidence #ldos-flag #ldos-function #ldos-ifdef-code
7. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * Fast-gup relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, fast-gup needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with fast-gup, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because fast-gup may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); if (!ptep) return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; struct folio *folio; /* * Always fallback to ordinary GUP on PROT_NONE-mapped pages: * pte_access_permitted() better should reject these pages * either way: otherwise, GUP-fast might succeed in * cases where ordinary GUP would fail due to VMA access * permissions. */ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { if (unlikely(flags & FOLL_LONGTERM)) goto pte_unmap; pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio(page, 1, flags); if (!folio) goto pte_unmap; if (unlikely(folio_is_secretmem(folio))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: if (pgmap) put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
  
  non concurrent fast gup approach that checks for pinned page and unmaps pte or clears it
  
  #ldos-ifdef-code #ldos-high-confidence #ldos-function #ldos-algorithm #ldos-flag #ldos-formula
8. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_HAVE_FAST_GUP /* * Used in the GUP-fast path to determine whether a pin is permitted for a * specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). We * therefore try to avoid the most egregious case of a long-term mapping doing * so. * * This function cannot be as thorough as that one as the VMA is not available * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. */ static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) { struct address_space *mapping; unsigned long mapping_flags; /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the one we disallow. */ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) return true; /* The folio is pinned, so we can safely access folio fields. */ if (WARN_ON_ONCE(folio_test_slab(folio))) return false; /* hugetlb mappings do not require dirty-tracking. */ if (folio_test_hugetlb(folio)) return true; /* * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods * cannot proceed, which means no actions performed under RCU can * proceed either. * * inodes and thus their mappings are freed under RCU, which means the * mapping cannot be freed beneath us and thus we can safely dereference * it. */ lockdep_assert_irqs_disabled(); /* * However, there may be operations which _alter_ the mapping, so ensure * we read it once and only once. */ mapping = READ_ONCE(folio->mapping); /* * The mapping may have been truncated, in any case we cannot determine * if this mapping is safe - fall back to slow path to determine how to * proceed. */ if (!mapping) return false; /* Anonymous folios pose no problem. */ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; if (mapping_flags) return mapping_flags & PAGE_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an * address_space object. The only remaining whitelisted file system is * shmem. */ return shmem_mapping(mapping); }
  
  policy logic. avoids locks unlike get user pages unlocked/locked which seems risky so its not supposed to be used on concurrent gup logic
  
  #ldos-flag #ldos-function #ldos-high-confidence #ldos-ifdef-code
9. Rajitb 11 Nov 2024
  
  in Public
  
  long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); }
  
  policy logic.
  
  #ldos-high-confidence #ldos-flag
10. Rajitb 11 Nov 2024
  
  in Public
  
  static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; ClearPageReferenced(page); if (flags & FOLL_PIN) unpin_user_page(page); else put_page(page); } }
  
  policy use function that undoes mapping
  
  #ldos-policy-use #ldos-function #ldos-low-confidence
11. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_MIGRATION /* * Returns the number of collected pages. Return value is always >= 0. */ static unsigned long collect_longterm_unpinnable_pages( struct list_head *movable_page_list, unsigned long nr_pages, struct page **pages) { unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); if (folio == prev_folio) continue; prev_folio = folio; if (folio_is_longterm_pinnable(folio)) continue; collected++; if (folio_is_device_coherent(folio)) continue; if (folio_test_hugetlb(folio)) { isolate_hugetlb(folio, movable_page_list); continue; } if (!folio_test_lru(folio) && drain_allow) { lru_add_drain_all(); drain_allow = false; } if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } return collected; }
  
  #ldos-ifdef-code #ldos-function #ldos-low-confidence #ldos-flag
12. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { struct page *page; int locked = 0; int ret; ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */
  
  part of policy use code likely
  
  #ldos-ifdef-code #ldos-flag #ldos-function #ldos-low-confidence
13. Rajitb 11 Nov 2024
  
  in Public
  
  int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; end = start + len; for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ if (!locked) { locked = 1; mmap_read_lock(mm); vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) vma = find_vma_intersection(mm, vma->vm_end, end); if (!vma) break; /* * Set [nstart; nend) to intersection of desired address * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; /* * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ } break; } nend = nstart + ret * PAGE_SIZE; ret = 0; } if (locked) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ }
  
  policy use function that populates pages like the func before this.
  
  #ldos-function #ldos-low-confidence #ldos-flag
14. Rajitb 11 Nov 2024
  
  in Public
  
  long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int local_locked = 1; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * Rightly or wrongly, the VM_LOCKONFAULT case has never used * faultin_page() to break COW, so it has no work to do here. */ if (vma->vm_flags & VM_LOCKONFAULT) return nr_pages; gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; /* * We want mlock to succeed for regions that have any permissions * other than PROT_NONE. */ if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; if (locked) gup_flags |= FOLL_UNLOCKABLE; /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; }
  
  policy use code.
  
  #ldos-flag #ldos-function #ldos-low-confidence
15. Rajitb 10 Nov 2024
  
  in Public
  
  long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); }
  
  policy logic
  
  #ldos-high-confidence #ldos-function #ldos-flag
16. Rajitb 10 Nov 2024
  
  in Public
  
  static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int flags) { long ret, pages_done; bool must_unlock = false; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } else mmap_assert_locked(mm); if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has * carelessly failed to specify FOLL_GET), so keep doing that, but only * for FOLL_GET, not for the newer FOLL_PIN. * * FOLL_PIN always expects pages to be non-null, but no need to assert * that here, as any failures will be obvious enough. */ if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ if (!*locked) { BUG_ON(ret < 0); BUG_ON(ret >= nr_pages); } if (ret > 0) { nr_pages -= ret; pages_done += ret; if (!nr_pages) break; } if (*locked) { /* * VM_FAULT_RETRY didn't trigger or it was a * FOLL_NOWAIT. */ if (!pages_done) pages_done = ret; break; } /* * VM_FAULT_RETRY triggered, so seek to the faulting offset. * For the prefault case (!pages) we only update counts. */ if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; /* The lock was temporarily dropped, so we must unlock later */ must_unlock = true; retry: /* * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted * by fatal signals of even common signals, depending on * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) { BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; } *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); goto retry; } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) pages_done = ret; break; } nr_pages--; pages_done++; if (!nr_pages) break; if (likely(pages)) pages++; start += PAGE_SIZE; } if (must_unlock && *locked) { /* * We either temporarily dropped the lock, or the caller * requested that we both acquire and drop the lock. Either way, * we must now unlock, and notify the caller of that state. */ mmap_read_unlock(mm); *locked = 0; } return pages_done; }
  
  same as gup but sets/unsets mmap_lock
  
  #ldos-flag #ldos-formula #ldos-high-confidence #ldos-ifdef-code #ldos-function
17. Rajitb 10 Nov 2024
  
  in Public
  
  if (!(flags & FOLL_INTERRUPTIBLE)) return false;
  
  fatal fault signal handler
  
  #ldos-flag #ldos-low-confidence
18. Rajitb 10 Nov 2024
  
  in Public
  
  int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { struct vm_area_struct *vma; vm_fault_t ret; address = untagged_addr_remote(mm, address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = gup_vma_lookup(mm, address); if (!vma) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current)) return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; } return 0; }
  
  resolves user page fault. policy logic
  
  #ldos-flag #ldos-high-confidence #ldos-function
19. Rajitb 10 Nov 2024
  
  in Public
  
  static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; start = untagged_addr_remote(mm, start); VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); do { struct page *page; unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { /* * MADV_POPULATE_(READ|WRITE) wants to handle VMA * lookups+error reporting differently. */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &page : NULL); if (ret) goto out; ctx.page_mask = 0; goto next_page; } if (!vma) { ret = -EFAULT; goto out; } ret = check_vma_flags(vma, gup_flags); if (ret) goto out; } retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, &foll_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: goto retry; case -EBUSY: case -EAGAIN: ret = 0; fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: goto out; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding * struct page. If the caller expects **pages to be * filled in, bail out now, because that can't be done * for this page. */ if (pages) { ret = PTR_ERR(page); goto out; } } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } next_page: page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; if (pages) { struct page *subpage; unsigned int j; /* * This must be a large folio (and doesn't need to * be the whole folio; it can be part of it), do * the refcount work for all the subpages too. * * NOTE: here the page may not be the head page * e.g. when start addr is not thp-size aligned. * try_grab_folio() should have taken care of tail * pages. */ if (page_increm > 1) { struct folio *folio; /* * Since we already hold refcount on the * large folio, this should never fail. */ folio = try_grab_folio(page, page_increm - 1, foll_flags); if (WARN_ON_ONCE(!folio)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ gup_put_folio(page_folio(page), 1, foll_flags); ret = -EFAULT; goto out; } } for (j = 0; j < page_increm; j++) { subpage = nth_page(page, j); pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); } } i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return i ? i : ret; }
  
  Literally the actual policy logic of gup. Most important piece of code right here for gup
  
  #ldos-high-confidence #ldos-formula #ldos-algorithm #ldos-function #ldos-value #ldos-threshold
20. Rajitb 10 Nov 2024
  
  in Public
  
  #ifdef CONFIG_STACK_GROWSUP return vma_lookup(mm, addr); #else static volatile unsigned long next_warn; struct vm_area_struct *vma; unsigned long now, next; vma = find_vma(mm, addr); if (!vma || (addr >= vma->vm_start)) return vma; /* Only warn for half-way relevant accesses */ if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; if (vma->vm_start - addr > 65536) return NULL; /* Let's not warn more than once an hour.. */ now = jiffies; next = next_warn; if (next && time_before(now, next)) return NULL; next_warn = now + 60*60*HZ; /* Let people know things may have changed. */ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n", current->comm, task_pid_nr(current), vma->vm_start, vma->vm_end, addr); dump_stack(); return NULL;
  
  helper func to lookup vma(virtual mem area) that warns per hour about half way relevant acc and changes in stack
  
  #ldos-low-confidence #ldos-threshold #ldos-policy-use #ldos-value #ldos-function
21. Rajitb 10 Nov 2024
  
  in Public
  
  static bool writable_file_mapping_allowed(struct vm_area_struct *vma, unsigned long gup_flags) { /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the case we disallow. */ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true; /* * If the VMA does not require dirty tracking then no problematic write * can occur either. */ return !vma_needs_dirty_tracking(vma); }
  
  Def policy code. checks if we can write to a map
  
  #ldos-flag #ldos-function #ldos-high-confidence
22. Rajitb 10 Nov 2024
  
  in Public
  
  if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; if (*flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ if (*flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist */ fault_flags |= FAULT_FLAG_TRIED; } if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * With FAULT_FLAG_RETRY_NOWAIT we'll never release the * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); *locked = 0; /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of * what has happened - we've just fully completed a page * fault, with the mmap lock released. Use -EAGAIN to show * that we want to take the mmap lock _again_. */ return -EAGAIN; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; }
  
  Seems it's just setting flags for page faults based on flags param
  
  #ldos-low-confidence #ldos-flag #ldos-function
23. Rajitb 10 Nov 2024
  
  in Public
  
  /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; if (address > TASK_SIZE) pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return -EFAULT; pud = pud_offset(p4d, address); if (pud_none(*pud)) return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; entry = ptep_get(pte); if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; *page = vm_normal_page(*vma, address, entry); if (!*page) { if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) goto unmap;
  
  Most of these seem like sanity checks right up until line 897 i.e, 'if(!page)'* after which we seem to unmap the page.
  
  #ldos-high-confidence #ldos-function #ldos-threshold #ldos-ifdef-code
24. Rajitb 10 Nov 2024
  
  in Public
  
  static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; /* * Call hugetlb_follow_page_mask for hugetlb vmas as it will use * special hugetlb page table walking code. This eliminates the * need to check for hugetlb entries in the general walking code. */ if (is_vm_hugetlb_page(vma)) return hugetlb_follow_page_mask(vma, address, flags, &ctx->page_mask); pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); return follow_p4d_mask(vma, address, pgd, flags, ctx); }
  
  places mask after following page into pte
  
  #ldos-high-confidence #ldos-flag #ldos-function #ldos-ifdef-code
25. Rajitb 10 Nov 2024
  
  in Public
  
  struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { struct follow_page_context ctx = { NULL }; struct page *page; if (vma_is_secretmem(vma)) return NULL; if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) return NULL; /* * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect * to fail on PROT_NONE-mapped pages. */ page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return page; }
  
  finds page
  
  #ldos-flag #ldos-function #ldos-high-confidence
26. Rajitb 10 Nov 2024
  
  in Public
  
  if (flags & FOLL_SPLIT_PMD) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); ctx->page_mask = HPAGE_PMD_NR - 1; return page;
  
  we're finding the page again but storing page mask in ctx
  
  #ldos-flag #ldos-function #ldos-high-confidence
27. Rajitb 10 Nov 2024
  
  in Public
  
  if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); }
  
  branch prediction to check if pmd is there and if it's big
  
  #ldos-function #ldos-low-confidence #ldos-ifdef-code #
28. Rajitb 10 Nov 2024
  
  in Public
  
  if (pmd_none(pmdval)) return no_page_table(vma, flags); if (!pmd_present(pmdval)) return no_page_table(vma, flags); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; }
  
  checks if pmd is there. im assuming it's page middle dir.
  
  # #ldos-low-confidence #ldos-function
29. Rajitb 10 Nov 2024
  
  in Public
  
  /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); /* * We only care about anon pages in can_follow_write_pte() and don't * have to worry about pte_devmap() because they are never anon. */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { page = NULL; goto out; } if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap * reference. */ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); if (*pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ ret = try_grab_page(page, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); }
  
  finds page in pte. Judging by the complexity of the logic this is most likely policy code because we're literally getting user page
  
  #ldos-high-confidence #ldos-flag #ldos-function #ldos-ifdef-code
30. Rajitb 10 Nov 2024
  
  in Public
  
  if (pte_write(pte)) return true; /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ if (!page || !PageAnon(page) || !PageAnonExclusive(page)) return false; /* ... and a write-fault isn't required for other reasons. */ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) return false; return !userfaultfd_pte_wp(vma, pte);
  
  flag checks to see if u can write to a pte
  
  #ldos-flag
31. Rajitb 10 Nov 2024
  
  in Public
  
  if (flags & FOLL_TOUCH) { pte_t orig_entry = ptep_get(pte); pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); }
  
  uses pte to mark dirty pages and finds pfn in pte
  
  #ldos-flag #ldos-function #ldos-high-confidence
32. Rajitb 10 Nov 2024
  
  in Public
  
  void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; if (!make_dirty) { unpin_user_pages(pages, npages); return; } sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key * cases: * * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called * page_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes * on to call TestClearPageDirty(), and write the page * back. * * 2) This code sees the page as clean, so it calls * set_page_dirty(). The page stays dirty, despite being * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ if (!folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } }
  
  unpins and dirties page
  
  #ldos-function #ldos-low-confidence #ldos-policy-use
33. Rajitb 10 Nov 2024
  
  in Public
  
  static inline struct folio *gup_folio_next(struct page **list, unsigned long npages, unsigned long i, unsigned int *ntails) { struct folio *folio = page_folio(list[i]); unsigned int nr; for (nr = i + 1; nr < npages; nr++) { if (page_folio(list[nr]) != folio) break; } *ntails = nr - i; return folio; }
  
  gets folio of next page along with reference to end of folio
  
  #ldos-function #ldos-low-confidence #ldos-policy-use
34. Rajitb 10 Nov 2024
  
  in Public
  
  static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { struct page *next = nth_page(start, i); struct folio *folio = page_folio(next); unsigned int nr = 1; if (folio_test_large(folio)) nr = min_t(unsigned int, npages - i, folio_nr_pages(folio) - folio_page_idx(folio, next)); *ntails = nr; return folio; }
  
  gets the folio of the next page from start to 'i' range. also gets the tail folio/reference
  
  #ldos-policy-use #ldos-low-confidence #ldos-function
35. Rajitb 10 Nov 2024
  
  in Public
  
  folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
  
  function for adding reference
  
  #ldos-low-confidence #ldos-function #ldos-value
36. Rajitb 10 Nov 2024
  
  in Public
  
  void unpin_user_page(struct page *page) { sanity_check_pinned_pages(&page, 1); gup_put_folio(page_folio(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page);
  
  actual policy use logic
  
  #ldos-policy-use #ldos-low-confidence #ldos-function
37. Rajitb 10 Nov 2024
  
  in Public
  
  struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_inc(folio);
  
  checks for code that is involved in policy but is not the actual logic
  
  #ldos-ifdef-code #ldos-flag #ldos-function #ldos-threshold #ldos-low-confidence
38. Rajitb 10 Nov 2024
  
  in Public
  
  else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return 0; /* * Similar to try_grab_folio(): be sure to *also* * increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); }
  
  Logic that actually tries to grab the folio. Also policy use code and not actual policy
  
  #ldos-flag #ldos-function #ldos-low-confidence
39. Rajitb 10 Nov 2024
  
  in Public
  
  if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs);
  
  Definitely a vital and straightforward policy use section of gup that simples places a reference on the folio
  
  #ldos-function #ldos-policy-use #ldos-low-confidence
40. Rajitb 10 Nov 2024
  
  in Public
  
  if (flags & FOLL_PIN) { if (is_zero_folio(folio)) return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; }
  
  Checks if the folio is zero/large
  
  #ldos-flag #ldos-function #ldos-low-confidence #ldos-policy-use
41. Rajitb 10 Nov 2024
  
  in Public
  
  if (folio_test_large(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1))
  
  maintaining reference counts. Part of policy logic most likely
  
  #ldos-function #ldos-low-confidence #ldos-policy-use
42. Rajitb 10 Nov 2024
  
  in Public
  
  if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); return NULL;
  
  checks for longterm folio pins.
  
  #ldos-flag #ldos-function #ldos-low-confidence #ldos-ifdef-code
43. Rajitb 10 Nov 2024
  
  in Public
  
  if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) return NULL; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL;
  
  Time saving predictions(unlikely) and single time warning func(WARN_ON_ONCE) for flags. Not actual policy logic so low confidence.
  
  #ldos-formula #ldos-flag #ldos-low-confidence #ldos-ifdef-code
44. Rajitb 10 Nov 2024
  
  in Public
  
  if (unlikely(page_folio(page) != folio)) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); goto retry;
  
  Uses prediction to check if a folio still points to the page. This is part of the function that tries to retrieve the folio to confirm that it is associated with a page.
  
  #ldos-function #ldos-high-confidence #ldos-ifdef-code
45. Rajitb 10 Nov 2024
  
  in Public
  
  folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; if (unlikely(!folio_ref_try_add(folio, refs))) return NULL;
  
  These increment the reference count for the folio since you're returning a reference of the folio. Important function so important internal logic subsequently
  
  #ldos-function #ldos-high-confidence #ldos-ifdef-code
46. Rajitb 10 Nov 2024
  
  in Public
  
  if (is_zero_page(page) || !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio))
  
  Sanity checks for pinned pages wouldn't classify as policy logic but common sense pre-checks for the actual policy. But I think it's worth tagging this to gain a sense of what is not policy code
  
  #ldos-function #ldos-low-confidence
47. Rajitb 10 Nov 2024
  
  in Public
  
  if (is_zero_page(page)) return page_folio(page); folio = try_get_folio(page, refs); if (!folio) return NULL;
  
  Just trying to check for zero pages and trying to retrieve folios. Unlikely policy logic
  
  #ldos-formula #ldos-low-confidence
48. Rajitb 10 Nov 2024
  
  in Public
  
  if (flags & FOLL_GET) return try_get_folio(page, refs);
  
  Policy logic that determines and tries to retrieve folios based on given flags.
  
  #ldos-flag #ldos-high-confidence
49. Rajitb 03 Nov 2024
  
  in Public
  
  if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT;
  
  checking for overflow in page alloc likely
  
  #ldos-flag #ldos-formula
50. Rajitb 03 Nov 2024
  
  in Public
  
  if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } }
  
  part of policy code
  
  #ldos-flag #ldos-high-confidence
51. Rajitb 03 Nov 2024
  
  in Public
  
  if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags);
  
  policy decision to get locked page!
  
  #ldos-high-confidence #ldos-flag
52. Rajitb 03 Nov 2024
  
  in Public
  
  vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
  
  vm_flags
  
  #ldos-flag
53. Rajitb 03 Nov 2024
  
  in Public
  
  if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; }
  
  flag
  
  #ldos-flag
54. Rajitb 03 Nov 2024
  
  in Public
  
  if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; }
  
  a lot of VM fault flags that are checked when calling a function that handles page faults
  
  #ldos-flag
55. Rajitb 02 Nov 2024
  
  in Public
  
  if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags);
  
  #ldos-flag
56. Rajitb 02 Nov 2024
  
  in Public
  
  if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
  
  flag pol to check if map is unlocked (for pos remap?)
  
  #ldos-flag
57. Rajitb 02 Nov 2024
  
  in Public
  
  if (page_increm > nr_pages) page_increm = nr_pages;
  
  next page logic
  
  #ldos-flag #ldos-high-confidence
58. Rajitb 02 Nov 2024
  
  in Public
  
  */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; }
  
  page populate flag for sure
  
  #ldos-high-confidence#ldos-flag#ldos-formula
59. Rajitb 02 Nov 2024
  
  in Public
  
  if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ if (is_vm_hugetlb_page(vma)) return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could * set a breakpoint in a read-only mapping of an * executable, without corrupting the file (yet only * when that file had been opened for writing!). * Anon pages in shared mappings are surprising: now * just reject it. */ if (!is_cow_mapping(vm_flags)) return -EFAULT; } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * Is there actually any vma we can reach here which does not * have VM_MAYREAD set? */ if (!(vm_flags & VM_MAYREAD)) return -EFAULT; }
  
  cow mapping, sdw stack seem like imp policies which dont seem like regular flags
  
  #ldos-flag
Visit annotations in context

Tags

#ldos-function

#ldos-ifdef-code

#ldos-threshold

#

#ldos-algorithm

#ldos-high-confidence#ldos-flag#ldos-formula

#ldos-high-confidence

#ldos-formula

#ldos-flag

#ldos-value

#ldos-low-confidence

#ldos-policy-use

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/gup.c
Oct 2024
elixir.bootlin.com elixir.bootlin.com

gup.c - mm/gup.c - Linux source code v6.6.42 - Bootlin

1
1. Rajitb 06 Oct 2024
  
  in Public
  
  if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true;
  
  most likely part of a policy code that includes flag based decision making
  
  #ldos-flag
Visit annotations in context

Tags

#ldos-flag

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/gup.c
elixir.bootlin.com elixir.bootlin.com

memcontrol.c - mm/memcontrol.c - Linux source code v6.6.42 - Bootlin

10
1. sidsabh 04 Oct 2024
  
  in Public
  
  matching the ownership tracking * granularities between memcg and writeback in either direction
  
  Algorithmic policy: the memcg and writeback ownerships have to match. This mechanism implemented via the mem_cgroup_track_foreign_dirty_slowpath and mem_cgroup_flush_foreign functions.
  
  #ldos-algo
2. sidsabh 04 Oct 2024
  
  in Public
  
  #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
  
  Maximum loops when reclaiming memory for a soft threshold from a cgroup. These value configurations are set once via #define.
  
  ldos #ldos-config
3. sidsabh 04 Oct 2024
  
  in Public
  
  if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) return ep; if (parent_effective > siblings_protected && parent_usage > siblings_protected && usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; unclaimed *= usage - protected; unclaimed /= parent_usage - siblings_protected; ep += unclaimed; } return ep;
  
  Heuristic algorithm for calculations of the effective protection of an individual cgroup. Used to check if the cgroup tree is using memory consumption in normal range by referencing the cgroup, its parent, its sibling, and rest of tree
  
  #ldos #ldos-algo
4. sidsabh 04 Oct 2024
  
  in Public
  
  atomic_add(nr_bytes, &old->nr_charged_bytes);
  
  Configuration policy for the tradeoff for flushing per-memcg from old object stock. Currently choosing to fully limit enforcement accuracy while having no CPU contention by writing to a centralized value.
  
  #ldos #ldos-config
5. sidsabh 04 Oct 2024
  
  in Public
  
  #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024
  
  These #define statements are for the target number of events before the system triggers action for threshold events and soft limit events, respectively. These are memory pressure events. THRESHOLDS_EVENTS_TARGET is set to 128, meaning threshold events are processed more frequently (finer grain) than soft limit events, which are only triggered every 1024 events.
  
  #ldos #ldos-config
6. sidsabh 04 Oct 2024
  
  in Public
  
  if (total >= (excess >> 2) || (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) break;
  
  For the soft limit reclaim of cgroup memory, we continuously loop through the cgroup and find a victim process to shrink. If we didn't find a victim, we will try again up to a certain threshold (#define MEM_CGROUP_MAX_RECLAIM_LOOPS) or quit if we found that we've reclaimed more than the exponentially decreasing original excess: algo left shifts the excess every failed loop.
  
  #ldos #ldos-algo #ldos-config
7. sidsabh 04 Oct 2024
  
  in Public
  
  #define FLUSH_TIME (2UL*HZ)
  
  This is a configuration policy that sets the interval for periodic flushing of memory statistics. The flushing is performed every 2 seconds (2UL * HZ), allowing the system to balance between the cost of frequent stat updates and keeping the statistics reasonably fresh.
  
  #ldos-config #ldos
8. sidsabh 04 Oct 2024
  
  in Public
  
  #define MEMCG_DELAY_PRECISION_SHIFT 20 #define MEMCG_DELAY_SCALING_SHIFT 14
  
  These two #define statements are part of the configuration policy for controlling the increasing penalty applied to memory overage. They ensure the system doesn’t penalize too harshly in minor cases but still exponentially increases delay for excessive usage.
  
  #ldos #ldos-config
9. sidsabh 04 Oct 2024
  
  in Public
  
  /* * Don't sleep if the amount of jiffies this memcg owes us is so low * that it's not even worth doing, in an attempt to be nice to those who * go only a small amount over their memory.high value and maybe haven't * been aggressively reclaimed enough yet. */ if (penalty_jiffies <= HZ / 100) goto out;
  
  Both the predicate and action of an algorithmic policy are defined here. If the calculated penality_jiffies (delay for over allocated cgrouup) is less than 1/100th of a second, don't throttle the cgroup.
  
  #ldos #ldos-algo
10. sidsabh 04 Oct 2024
  
  in Public
  
  #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
  
  This line defines the maximum sleep time (delay) for a memory cgroup that has breached its memory.high limit. This is a configuration policy because it sets a fixed upper limit (2 seconds).
  
  #ldos #ldos-config
Visit annotations in context

Tags

#ldos

ldos

#ldos-algo

#ldos-config

Annotators

sidsabh

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/memcontrol.c
elixir.bootlin.com elixir.bootlin.com

vmscan.c - mm/vmscan.c - Linux source code v6.6.42 - Bootlin

10
1. kelly_w 04 Oct 2024
  
  in Public
  
  static inline bool should_continue_reclaim(struct pglist_data *pgdat,
  
  This function decides whether there should be another round of reclamation. Stop reclaiming as long as space is sufficient for compaction or page allocation
  
  #ldos-algo
2. kelly_w 04 Oct 2024
  
  in Public
  
  static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) { /* * If reclaim is making progress greater than 12% efficiency then * wake all the NOPROGRESS throttled tasks. */ if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; if (waitqueue_active(wqh)) wake_up(wqh); return; } /* * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages * under writeback and marked for immediate reclaim at the tail of the * LRU. */ if (current_is_kswapd() || cgroup_reclaim(sc)) return; /* Throttle if making no progress at high prioities. */ if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); }
  
  Managing throttling: wakes up throttled tasks when reclaim is making progress at some efficiency. Throttle reclaim if no progress (in reclaim_throttle(), throttling under no progress could also be skipped under some conditions).
  
  #ldos-config
3. kelly_w 04 Oct 2024
  
  in Public
  
  /* * We make inactive:active ratio decisions based on the node's * composition of memory, but a restrictive reclaim_idx or a * memory.low cgroup setting can exempt large amounts of * memory from reclaim. Neither of which are very common, so * instead of doing costly eligibility calculations of the * entire cgroup subtree up front, we assume the estimates are * good, and retry with forcible deactivation if that fails. */ if (sc->skipped_deactivate) { sc->priority = initial_priority; sc->force_deactivate = 1; sc->skipped_deactivate = 0; goto retry; } /* Untapped cgroup reserves? Don't OOM, retry. */ if (sc->memcg_low_skipped) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; }
  
  Deactivation may be skipped if inactive ratio is not low or due to memory.Low limit. In this case, simply retry again with forcible deactivation/reclaim flags.
  
  #ldos-algo
4. kelly_w 04 Oct 2024
  
  in Public
  
  scan_balance = SCAN_FRACT; /* * Calculate the pressure balance between anon and file pages. * * The amount of pressure we put on each LRU is inversely * proportional to the cost of reclaiming each list, as * determined by the share of pages that are refaulting, times * the relative IO cost of bringing back a swapped out * anonymous page vs reloading a filesystem page (swappiness). * * Although we limit that influence to ensure no list gets * left behind completely: at least a third of the pressure is * applied, before swappiness. * * With swappiness at 100, anon and file have equal IO cost. */ total_cost = sc->anon_cost + sc->file_cost; anon_cost = total_cost + sc->anon_cost; file_cost = total_cost + sc->file_cost; total_cost = anon_cost + file_cost; ap = swappiness * (total_cost + 1); ap /= anon_cost + 1; fp = (200 - swappiness) * (total_cost + 1); fp /= file_cost + 1; fraction[0] = ap; fraction[1] = fp; denominator = ap + fp;
  
  Calculates the proportion of anon and file pages to be scanned based on swappiness. The whole funtion determines number of folios to be scanned for each type.
  
  #ldos-config
5. kelly_w 04 Oct 2024
  
  in Public
  
  if (young * MIN_NR_GENS > total) return true; if (old * (MIN_NR_GENS + 2) < total) return true;
  
  Decides whether aging is needed. Aging is needed if the lruvec is short on cold folios. If returns true, the scanning of this lruvec is skipped.
  
  #ldos-config
6. kelly_w 04 Oct 2024
  
  in Public
  
  if (!swappiness) type = LRU_GEN_FILE; else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) type = LRU_GEN_ANON; else if (swappiness == 1) type = LRU_GEN_FILE; else if (swappiness == 200) type = LRU_GEN_ANON; else type = get_type_to_scan(lruvec, swappiness, &tier);
  
  Makes a decision on which type to scan based on swappiness, for the generational lruvecs
  
  #ldos-config
7. kelly_w 04 Oct 2024
  
  in Public
  
  */ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); if (current_is_kswapd()) set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); } /* * Stall direct reclaim for IO completions if the lruvec is * node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) goto again;
  
  stall reclaim if lruvec is congested, which is determined by whether all dirty pages in the cgroup or node were marked for writeback
  
  #ldos-algo
8. kelly_w 03 Oct 2024
  
  in Public
  
  static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; unsigned long inactive_ratio; unsigned long gb; inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) inactive_ratio = int_sqrt(10 * gb); else inactive_ratio = 1; return inactive * inactive_ratio < active; }
  
  Defines/calculates a target active-to-inactive ratio. There are various places where this function is called to decide whether the active list should be shrinked and whether inactive list could be expanded.
  
  #ldos-algo
9. kelly_w 03 Oct 2024
  
  in Public
  
  nr = xchg_nr_deferred(shrinker, shrinkctl); if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); } else { /* * These objects don't require any IO to create. Trim * them aggressively under memory pressure to keep * them from causing refetches in the IO caches. */ delta = freeable / 2; } total_scan = nr >> priority; total_scan += delta; total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one * pass to avoid too frequent shrinker calls, but if the slab has less * than batch_size objects in total and we are really tight on memory, * we will try to reclaim all available objects, otherwise we can end * up failing allocations although there are plenty of reclaimable * objects spread over several slabs with usage less than the * batch_size. * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || total_scan >= freeable) {
  
  Determine the number of slab objects to be scanned (total_scan) based on 'freeable' and 'priority'. Scans are done in batch sizes unless memory is tight. Batch size is set by shrinker, or defaulted to SHRINK_BATCH at line 832
  
  #ldos-algo #ldos-config
10. kelly_w 03 Oct 2024
  
  in Public
  
  if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { nr_rotated += folio_nr_pages(folio); list_add(&folio->lru, &l_active); continue; }
  
  The goal is to identify executable code regions and keep them in memory under moderate memory pressure. The heuristic is to check VM_EXEC flag and whether folio is file-backed, and add identified folio back to active list.
  
  #ldos-algo
Visit annotations in context

Tags

#ldos-algo

#ldos-config

Annotators

kelly_w

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/vmscan.c
elixir.bootlin.com elixir.bootlin.com

zswap.c - mm/zswap.c - Linux source code v6.6.42 - Bootlin

3
1. weijiec 04 Oct 2024
  
  in Public
  
  #define ZSWAP_NR_ZPOOLS 32
  
  More configuration policy, as per the comment, this is an empirical number.
  
  #ldos #ldos-config
2. weijiec 04 Oct 2024
  
  in Public
  
  static unsigned int zswap_max_pool_percent = 20;
  
  maximum percentage of memory that the compresses pool can occupy
  
  #ldos #ldos-config
3. weijiec 04 Oct 2024
  
  in Public
  
  static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
  
  Since zswap will be part of decisions on the swap flow the this config is policy.
  
  #ldos #ldos-config
Visit annotations in context

Tags

#ldos

#ldos-config

Annotators

weijiec

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/zswap.c
elixir.bootlin.com elixir.bootlin.com

percpu.c - mm/percpu.c - Linux source code v6.6.42 - Bootlin

10
1. weijiec 04 Oct 2024
  
  in Public
  
  if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list) if (pos != chunk) { need_balance = true; break; } } else if (pcpu_should_reclaim_chunk(chunk)) { pcpu_isolate_chunk(chunk); need_balance = true; }
  
  When freeing memory determine whether to balance or not.
  
  #ldos #ldos-algo
2. weijiec 04 Oct 2024
  
  in Public
  
  static void pcpu_balance_workfn(struct work_struct *work)
  
  Manage free chunks and populated pages. It does so by first reclaiming all fully free chunks regardless of the number of populated pages. Then rebalancing and finally cleaning.
  
  #ldos #ldos-algo
3. weijiec 04 Oct 2024
  
  in Public
  
  if (freed_page_start < freed_page_end) {
  
  Batching TLB flushes to amortize cost. The threashold is also a heuristic.
  
  #ldos #ldos-algo
4. weijiec 04 Oct 2024
  
  in Public
  
  int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
  
  Static allocation policy is used here for the first chunk per cpu.
  
  #ldos #ldos-algo
5. weijiec 04 Oct 2024
  
  in Public
  
  if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) { reintegrate = true; break; }
  
  Heuristic to stop reclaiming if empty fall below threshold that would cause atomic alloc failures.
  
  #ldos #ldos-algo
6. weijiec 04 Oct 2024
  
  in Public
  
  static void pcpu_balance_populated(void)
  
  Maintain a certain amount of populated pages to satisfy atomic allocations.
  
  #ldos #ldos-algo
7. weijiec 04 Oct 2024
  
  in Public
  
  if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
  
  Heuristic to determine when to rebalance when allocating pages. If the number of pages are below the PCPU_EMPTY_POP_PAGES_LOW threashold then rebalancer is called.
  
  #ldos #ldos-algo
8. weijiec 04 Oct 2024
  
  in Public
  
  if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
  
  As the comment said this is a heuristic to stop finding chunks when true.
  
  #ldos #ldos-algo
9. weijiec 04 Oct 2024
  
  in Public
  
  static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits)
  
  Update metadate hints (for policy).
  
  #ldos #ldos-algo
10. weijiec 04 Oct 2024
  
  in Public
  
  if (contig == block->contig_hint) { if (block->contig_hint_start && (!start || __ffs(start) > __ffs(block->contig_hint_start))) { /* start has a better alignment so use it */ block->contig_hint_start = start; if (start < block->scan_hint_start && block->contig_hint > block->scan_hint) block->scan_hint = 0; } else if (start > block->scan_hint_start || block->contig_hint > block->scan_hint) { /* * Knowing contig == contig_hint, update the scan_hint * if it is farther than or larger than the current * scan_hint. */ block->scan_hint_start = start; block->scan_hint = contig; }
  
  Heuristic when contig hints are equal to choose the "best" starting offset.
  
  #ldos #ldos-algo
Visit annotations in context

Tags

#ldos

#ldos-algo

Annotators

weijiec

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/percpu.c
elixir.bootlin.com elixir.bootlin.com

page_alloc.c - mm/page_alloc.c - Linux source code v6.6.42 - Bootlin

3
1. yichi170 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. */ if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } #endif
  
  This is a configuration policy. It will retry if zone has deferred pages.
  
  #ldos #ldos-config
2. yichi170 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } #endif
  
  This is a configuration policy. It will retry if zone has deferred pages.
  
  #ldos #ldos-config
3. yichi170 04 Oct 2024
  
  in Public
  
  batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; /* * Clamp the batch to a 2^n - 1 value. Having a power * of 2 value was found to be more likely to have * suboptimal cache aliasing properties in some cases. * * For example if 2 tasks are alternately allocating * batches of pages, one task can end up with a lot * of pages of one half of the possible page colors * and the other with pages of the other colors. */ batch = rounddown_pow_of_two(batch + batch/2) - 1;
  
  Determine the number of pages for batch allocating based on a heuristic. Using a (2^n - 1) to minimize cache aliasing issues.
  
  but I think it may also be categorized as a configuration policy because the code execution depends on CONFIG_MMU.
  
  #ldos #ldos-algo
Visit annotations in context

Tags

#ldos

#ldos-algo

#ldos-config

Annotators

yichi170

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/page_alloc.c
elixir.bootlin.com elixir.bootlin.com

oom_kill.c - mm/oom_kill.c - Linux source code v6.6.42 - Bootlin

4
1. sidsabh 04 Oct 2024
  
  in Public
  
  if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory");
  
  We found the process that has the highest "badness" score and will be terminated after we called select_bad_process. This is the action side of the previously mentioned predicate (maximal "badness").
  
  #ldos-algo
2. sidsabh 04 Oct 2024
  
  in Public
  
  #define MAX_OOM_REAP_RETRIES 10 static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; struct mm_struct *mm = tsk->signal->oom_mm; /* Retry the mmap_read_trylock(mm) a few times */ while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || test_bit(MMF_OOM_SKIP, &mm->flags)) goto done;
  
  The reaper will try 10 times (with a delay after each attempt) to reap the memory from the process. This value configuration is defined via a #define statement. This will fail if the process doesn't give up its mmap lock.
  
  #ldos-config #ldos
3. sidsabh 04 Oct 2024
  
  in Public
  
  #define OOM_REAPER_DELAY (2*HZ) static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) return; get_task_struct(tsk); timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; add_timer(&tsk->oom_reaper_timer); }
  
  The program here sends a signal to the selected process that will be killed to solve the out of memory issue. It will then wait for a configuration OOM_REAPER_DELAY before forcefully killing it by invoking the reaper thread on the process. It's currently set to 2*HZ, meaning that we wait for 2 seconds. This time period policy is defined as configuration via #define.
  
  #ldos-config #ldos
4. sidsabh 04 Oct 2024
  
  in Public
  
  /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; points += adj;
  
  The kernel calls out of memory when the RAM passes full utilization limit. This code calculates a predicate for choosing the maximal "bad" process to reap to fix the out of memory issue. This code defines the predicate as a heuristic calculation; it calculates "badness" as the proportion of RAM that the sum of the task's RSS, pagetable, and swap space use.
  
  #ldos-algo #ldos
Visit annotations in context

Tags

#ldos

#ldos-algo

#ldos-config

Annotators

sidsabh

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/oom_kill.c
elixir.bootlin.com elixir.bootlin.com

slub.c - mm/slub.c - Linux source code v6.6.42 - Bootlin

5
1. himanshur 04 Oct 2024
  
  in Public
  
  if (s->flags & __CMPXCHG_DOUBLE) { ret = __update_freelist_fast(slab, freelist_old, counters_old, freelist_new, counters_new); } else { ret = __update_freelist_slow(slab, freelist_old, counters_old, freelist_new, counters_new); }
  
  This policy is very similar to annotated code below. The description is reproduced here:
  
  This policy determines if the system has support for compare and exchange. If so, it will use the "__update_freelist_fast()" function, which uses a compare and exchange internally. Otherwise, it will use "__update_freelist_slow()", which uses a lock (specifically a bit-based spinlock) internally.
  
  #ldos #ldos-config #ldos-flag
2. himanshur 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_SLAB_FREELIST_HARDENED encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr); #else encoded = (unsigned long)ptr; #endif
  
  This policy uses the configuration setting "CONFIG_SLAB_FREELIST_HARDENED" to determine whether to obfuscate a SLUB free list pointer or not for increased security at the cost of some performance.
  
  #ldos #ldos-config #ldos-flag
3. himanshur 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_SLAB_FREELIST_HARDENED decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr)); #else decoded = (void *)ptr.v; #endif
  
  This policy is based on the configuration setting indicated by "CONFIG_SLAB_FREELIST_HARDENED", which hardens the slab free list. If the free list is hardened, then free list pointers will be obfuscated; this policy just undoes the obfuscation in that case. In the case where free list pointers are not obfuscated, this function just returns the unmodified pointer value.
  
  #ldos #ldos-config #ldos-flag
4. himanshur 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Pre-initialize the random sequence cache */ static int init_cache_random_seq(struct kmem_cache *s) { unsigned int count = oo_objects(s->oo); int err; /* Bailout if already initialised */ if (s->random_seq) return 0; err = cache_random_seq_create(s, count, GFP_KERNEL); if (err) { pr_err("SLUB: Unable to initialize free list for %s\n", s->name); return err; } /* Transform to an offset on the set of pages */ if (s->random_seq) { unsigned int i; for (i = 0; i < count; i++) s->random_seq[i] *= s->size; } return 0; } /* Initialize each random sequence freelist per cache */ static void __init init_freelist_randomization(void) { struct kmem_cache *s; mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) init_cache_random_seq(s); mutex_unlock(&slab_mutex); } /* Get the next entry on the pre-computed freelist randomized */ static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab, unsigned long *pos, void *start, unsigned long page_limit, unsigned long freelist_count) { unsigned int idx; /* * If the target page allocation failed, the number of objects on the * page might be smaller than the usual size defined by the cache. */ do { idx = s->random_seq[*pos]; *pos += 1; if (*pos >= freelist_count) *pos = 0; } while (unlikely(idx >= page_limit)); return (char *)start + idx; } /* Shuffle the single linked freelist based on a random pre-computed sequence */ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) { void *start; void *cur; void *next; unsigned long idx, pos, page_limit, freelist_count; if (slab->objects < 2 || !s->random_seq) return false; freelist_count = oo_objects(s->oo); pos = get_random_u32_below(freelist_count); page_limit = slab->objects * s->size; start = fixup_red_left(s, slab_address(slab)); /* First entry is used as the base of the freelist */ cur = next_freelist_entry(s, slab, &pos, start, page_limit, freelist_count); cur = setup_object(s, cur); slab->freelist = cur; for (idx = 1; idx < slab->objects; idx++) { next = next_freelist_entry(s, slab, &pos, start, page_limit, freelist_count); next = setup_object(s, next); set_freepointer(s, cur, next); cur = next; } set_freepointer(s, cur, NULL); return true; } #else static inline int init_cache_random_seq(struct kmem_cache *s) { return 0; } static inline void init_freelist_randomization(void) { } static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) { return false; } #endif /* CONFIG_SLAB_FREELIST_RANDOM */
  
  This policy looks at the "CONFIG_SLAB_FREELIST_RANDOM" config setting. If it is set, the policy defines functions to randomize the free list order when creating new pages (for security purposes). If it is not set, the functions involved in randomizing the free list are empty, effectively turning off free list randomization.
  
  #ldos #ldos-config #ldos-flag
5. himanshur 04 Oct 2024
  
  in Public
  
  if (s->flags & __CMPXCHG_DOUBLE) { ret = __update_freelist_fast(slab, freelist_old, counters_old, freelist_new, counters_new); } else { unsigned long flags; local_irq_save(flags); ret = __update_freelist_slow(slab, freelist_old, counters_old, freelist_new, counters_new); local_irq_restore(flags); }
  
  This policy determines if the system has support for compare and exchange. If so, it will use the "__update_freelist_fast()" function, which uses a compare and exchange internally. Otherwise, it will use "__update_freelist_slow()", which uses a lock (specifically a bit-based spinlock) internally.
  
  #ldos #ldos-config #ldos-flag
Visit annotations in context

Tags

#ldos

#ldos-flag

#ldos-config

Annotators

himanshur

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/slub.c
elixir.bootlin.com elixir.bootlin.com

vmstat.c - mm/vmstat.c - Linux source code v6.6.42 - Bootlin

2
1. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); static void sum_vm_events(unsigned long *ret) { int cpu; int i; memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); for_each_online_cpu(cpu) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) ret[i] += this->event[i]; } } /* * Accumulate the vm event counters across all CPUs. * The result is unavoidably approximate - it can change * during and after execution of this function. */ void all_vm_events(unsigned long *ret) { cpus_read_lock(); sum_vm_events(ret); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(all_vm_events); /* * Fold the foreign cpu events into our own. * * This is adding to the events on one processor * but keeps the global counts constant. */ void vm_events_fold_cpu(int cpu) { struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); int i; for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { count_vm_events(i, fold_state->event[i]); fold_state->event[i] = 0; } } #endif /* CONFIG_VM_EVENT_COUNTERS */
  
  When CONFIG_VM_EVENT_COUNTERS is enabled, the kernel compiles the code that collects and manages virtual memory (vm) event counters. These counters track events like page faults, page allocations, and swap operations. The functions sum_vm_events, all_vm_events, and vm_events_fold_cpu are responsble for accumulating these statistics across all CPUs.
  
  If CONFIG_VM_EVENT_COUNTERS is not enabled, this code is excluded from the build. This means the kernel won't collect these detailed VM statistics, reducing memory usage and avoiding the overhead associated with tracking them.
  
  #ldos-config
2. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_NUMA int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; /* zero numa counters within a zone */ static void zero_zone_numa_counters(struct zone *zone) { int item, cpu; for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) { atomic_long_set(&zone->vm_numa_event[item], 0); for_each_online_cpu(cpu) { per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item] = 0; } } } /* zero numa counters of all the populated zones */ static void zero_zones_numa_counters(void) { struct zone *zone; for_each_populated_zone(zone) zero_zone_numa_counters(zone); } /* zero global numa counters */ static void zero_global_numa_counters(void) { int item; for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) atomic_long_set(&vm_numa_event[item], 0); } static void invalid_numa_statistics(void) { zero_zones_numa_counters(); zero_global_numa_counters(); } static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; mutex_lock(&vm_numa_stat_lock); if (write) oldval = sysctl_vm_numa_stat; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (ret || !write) goto out; if (oldval == sysctl_vm_numa_stat) goto out; else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) { static_branch_enable(&vm_numa_stat_key); pr_info("enable numa statistics\n"); } else { static_branch_disable(&vm_numa_stat_key); invalid_numa_statistics(); pr_info("disable numa statistics, and clear numa counters\n"); } out: mutex_unlock(&vm_numa_stat_lock); return ret; } #endif
  
  This conditionally includes the NUMA-specific code based on whether the CONFIG_NUMA option is enabled during kernel compilation. Within this conditional block, the sysctl_vm_numa_stat_handler function provides a runtime configuration mechanism through the sysctl_vm_numa_stat variable. This function allows system administrators to enable or disable the collection of NUMA statistics at runtime. When the value of sysctl_vm_numa_stat changes, the function changes the vm_numa_stat_key to start or stop the statistics collection and clears the NUMA counters if disabled.
  
  #ldos-config
Visit annotations in context

Tags

#ldos-config

Annotators

rwik2000_UTCS

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/vmstat.c
elixir.bootlin.com elixir.bootlin.com

madvise.c - mm/madvise.c - Linux source code v6.6.42 - Bootlin

5
1. temi14 04 Oct 2024
  
  in Public
  
  if (start != vma->vm_start) { error = split_vma(&vmi, vma, start, 1); if (error) return error; } if (end != vma->vm_end) { error = split_vma(&vmi, vma, end, 0); if (error) return error; }
  
  This is an algorithmic policy. The VMA is split at the start address if the specified start is different from the current beginning of the VMA, or if the specified end address is different from the current end of the VMA, the VMA is split.
  
  #ldos-algo
2. temi14 04 Oct 2024
  
  in Public
  
  if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); }
  
  This is algorithmic policy. If this particular madvise behavior is MADV_SOFT_OFFLINE, the code decides to performs soft offlining of a page by marking it as unavailable for future use without reclaiming it.
  
  #ldos-algo
3. temi14 04 Oct 2024
  
  in Public
  
  if (folio_test_large(folio)) { int err; if (folio_estimated_sharers(folio) != 1) break; if (!folio_trylock(folio)) break; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); if (err) break; start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; arch_enter_lazy_mmu_mode(); pte--; addr -= PAGE_SIZE; continue; }
  
  This is algorithmic policy. This code handles the splitting of large folio pages. It checks if the folio can be safely split, attempts to split it into smaller folios, and then appropriately updates the relevant PTEs and MMU state.
  
  #ldos-algo
4. temi14 04 Oct 2024
  
  in Public
  
  if (!pte_present(ptent)) { swp_entry_t entry; entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { nr_swap--; free_swap_and_cache(entry); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; }
  
  This is an algorithmic policy. It checks if the entry is a valid swap entry and manages its resources accordingly. If the entry is poisoned, it clears the PTE but does not free associated resources, and if it is not a swap entry then all resources are freed.
  
  #ldos-algo
5. temi14 04 Oct 2024
  
  in Public
  
  static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; }
  
  This is a configuration policy. Cold page ranges can not be advised to the kernel if the VM_LOCKED or VM_PFNMAP or VM_HUGETLB flags are set.
  
  #ldos-config
Visit annotations in context

Tags

#ldos-algo

#ldos-config

Annotators

temi14

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/madvise.c
elixir.bootlin.com elixir.bootlin.com

page-writeback.c - mm/page-writeback.c - Linux source code v6.6.42 - Bootlin

8
1. YungTa.Chang 04 Oct 2024
  
  in Public
  
  if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); }
  
  This is a configuration policy that does a more accurate calculation on the number of reclaimable pages and dirty pages when the threshold for the dirty pages in the writeback context is lower than 2 times the maximal error of a stat counter.
  
  #ldos #ldos-flag #ldos-policy-use #ldos-config
2. YungTa.Chang 04 Oct 2024
  
  in Public
  
  static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause
  
  This function is an algorithmic policy that determines the minimum throttle time for a process between consecutive writeback operations for dirty pages based on heuristics. It is used for balancing the load of the I/O subsystems so that there will not be excessive I/O operations that impact the performance of the system.
  
  #ldos #ldos-flag #ldos-policy-use #ldos-algo
3. YungTa.Chang 04 Oct 2024
  
  in Public
  
  if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb);
  
  This is a configuration policy that determines whether to start background writeout. The code here indicates that if laptop_mode, which will reduce disk activity for power saving, is not set, then when the number of dirty pages reaches the bg_thresh threshold, the system starts writing back pages.
  
  #ldos #ldos-flag #ldos-policy-use #ldos-algo
4. YungTa.Chang 04 Oct 2024
  
  in Public
  
  if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1);
  
  This implements a configuration policy that determines the interval for the kernel to wake up and check for dirty pages that need to be written back to disk.
  
  #ldos #ldos-flag #ldos-policy-use #ldos-config
5. YungTa.Chang 04 Oct 2024
  
  in Public
  
  limit -= (limit - thresh) >> 5;
  
  This is a configuration policy that determines how much should the limit be updated. The limit controls the amount of dirty memory allowed in the system.
  
  #ldos #ldos-flag #ldos-policy-use #ldos-config
6. YungTa.Chang 03 Oct 2024
  
  in Public
  
  if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { unsigned long intv; unsigned long m_intv; free_running: intv = dirty_poll_interval(dirty, thresh); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) m_intv = dirty_poll_interval(m_dirty, m_thresh); current->nr_dirtied_pause = min(intv, m_intv); break; } /* Start writeback even when in laptop mode */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); mem_cgroup_flush_foreign(wb); /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ if (!strictlimit) { wb_dirty_limits(gdtc); if ((current->flags & PF_LOCAL_THROTTLE) && gdtc->wb_dirty < dirty_freerun_ceiling(gdtc->wb_thresh, gdtc->wb_bg_thresh)) /* * LOCAL_THROTTLE tasks must not be throttled * when below the per-wb freerun ceiling. */ goto free_running; } dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); wb_position_ratio(gdtc); sdtc = gdtc; if (mdtc) { /* * If memcg domain is in effect, calculate its * pos_ratio. @wb should satisfy constraints from * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ if (!strictlimit) { wb_dirty_limits(mdtc); if ((current->flags & PF_LOCAL_THROTTLE) && mdtc->wb_dirty < dirty_freerun_ceiling(mdtc->wb_thresh, mdtc->wb_bg_thresh)) /* * LOCAL_THROTTLE tasks must not be * throttled when below the per-wb * freerun ceiling. */ goto free_running; } dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); wb_position_ratio(mdtc); if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; }
  
  This is an algorithmic policy that determines whether the process can run freely or a throttle is needed to control the rate of the writeback by checking if the number of dirty pages exceed the average of the global threshold and background threshold.
  
  #ldos #ldos-flag #ldos-policy-use #ldos-algo
7. YungTa.Chang 03 Oct 2024
  
  in Public
  
  shift = dirty_ratelimit / (2 * step + 1); if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8); else step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step;
  
  This is a configuration policy that determines how much we should increase/decrease the dirty_ratelimit, which controls the rate that processors write dirty pages back to storage.
  
  #ldos #ldos-flag #ldos-policy-use #ldos-config
8. YungTa.Chang 03 Oct 2024
  
  in Public
  
  ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16;
  
  This is a configuration policy that dynamically determines the rate that kernel can write dirty pages back to storage in a single writeback cycle.
  
  #ldos #ldos-flag #ldos-policy-use #ldos-config
Visit annotations in context

Tags

#ldos

#ldos-algo

#ldos-config

#ldos-flag

#ldos-policy-use

Annotators

YungTa.Chang

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/page-writeback.c
elixir.bootlin.com elixir.bootlin.com

rmap.c - mm/rmap.c - Linux source code v6.6.42 - Bootlin

5
1. temi14 04 Oct 2024
  
  in Public
  
  if (lru_gen_enabled() && pte_young(ptep_get(pvmw.pte))) { lru_gen_look_around(&pvmw); referenced++; }
  
  These lines show configuration policy. The lru_gen_enabled() function checks if the multi-gen LRU is enabled and this is set with the CONFIG_LRU_GEN_ENABLED kernel configuration. This is a determining factor into how the folio referenced bit is updated, so it is a configuration policy
  
  #ldos-config
2. temi14 04 Oct 2024
  
  in Public
  
  pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
  
  This is an algorithmic policy. Before it was called, a pte was cleared. A call to this function puts PTE_MARKER_UFFD_WP on a page, if needed. Here the predicate would be that the pte was cleared, and the action is the updating of the marker. (The function should not be called if the pte was not cleared).There are more policies within the function to determine if the marker is needed.
  
  #ldos-algo
3. temi14 04 Oct 2024
  
  in Public
  
  if (pending != flushed) { arch_flush_tlb_batched_pending(mm);
  
  This is an algorithmic policy. This code decides whether to perform a TLB flush based on whether there are pending flushes that haven't been completed.
  
  #ldos-algo
4. temi14 04 Oct 2024
  
  in Public
  
  if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) referenced++;
  
  This is configuration policy. The CONFIG_TRANSPARENT_HUGEPAGE kernel configuration determines how and if the page reference bit is incremented.
  
  #ldos-config
5. temi14 04 Oct 2024
  
  in Public
  
  if (!dst->anon_vma && src->anon_vma && anon_vma->num_children < 2 && anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma;
  
  This is an algorithmic policy. This code checks if dst currently does not have an anonymous VMA and if src does. It checks that the anon VMA being considered has fewer than two children and no active VMAs. If all conditions are met, it reuses the existing anonymous VMA rather than allocating a new one.
  
  #ldos-algo
Visit annotations in context

Tags

#ldos-algo

#ldos-config

Annotators

temi14

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/rmap.c
elixir.bootlin.com elixir.bootlin.com

compaction.c - mm/compaction.c - Linux source code v6.6.42 - Bootlin

19
1. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  if (should_proactive_compact_node(pgdat)) { unsigned int prev_score, score; prev_score = fragmentation_score_node(pgdat); proactive_compact_node(pgdat); score = fragmentation_score_node(pgdat); /* * Defer proactive compaction if the fragmentation * score did not go down i.e. no progress made. */ if (unlikely(score >= prev_score)) timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT; }
  
  This part of the code checks whether proactive compaction should be triggered using the should_proactive_compact_node(pgdat) predicate. If proactive compaction is necessary, it stores the node's fragmentation score (prev_score), performs proactive compaction (proactive_compact_node(pgdat)), and then checks the new fragmentation score (score). If the score does not decrease after compaction (indicating no progress), the system defers further proactive compaction by increasing the timeout (timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT). This helps avoid repeated compaction attempts without progress.
  
  #ldos-algo
2. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) static ssize_t compact_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int nid = dev->id; if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { /* Flush pending updates to the LRU lists */ lru_add_drain_all(); compact_node(nid); } return count; } static DEVICE_ATTR_WO(compact); int compaction_register_node(struct node *node) { return device_create_file(&node->dev, &dev_attr_compact); } void compaction_unregister_node(struct node *node) { device_remove_file(&node->dev, &dev_attr_compact); } #endif /* CONFIG_SYSFS && CONFIG_NUMA */
  
  When these options are enabled, the functions compact_store, compaction_register_node, and compaction_unregister_node are included in the build. The compact_store function allows users to trigger memory compaction on specific NUMA nodes by writing to the sysfs interface. Inside this function, it checks if the node ID is valid and online, drains pending updates to the LRU lists with lru_add_drain_all(), and then calls compact_node(nid) to perform compaction on that node.
  
  Including these functions provides fine-grained control over memory management in NUMA system. If even one of the flag is not eneabled, these functions are excluded from the build, and the per-node compaction interface is unavailable.
  
  #ldos-config
3. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  if (prio > MIN_COMPACT_PRIORITY && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; } status = compact_zone_order(zone, order, gfp_mask, prio, alloc_flags, ac->highest_zoneidx, capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ if (status == COMPACT_SUCCESS) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller * will repeat this with true if allocation indeed * succeeds in this zone. */ compaction_defer_reset(zone, order, false); break; } if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE || status == COMPACT_PARTIAL_SKIPPED)) /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up * succeeding after all, it will be reset. */ defer_compaction(zone, order); /* * We might have stopped compacting due to need_resched() in * async compaction, or due to a fatal signal detected. In that * case do not try further zones */ if ((prio == COMPACT_PRIO_ASYNC && need_resched()) || fatal_signal_pending(current)) break;
  
  The kernel attempts to compact memory across different zones, guided by several key predicates. First, if compaction for a zone has been deferred and the priority is above MIN_COMPACT_PRIORITY, the function skips the zone, marking the result as COMPACT_DEFERRED. For each zone, it calls compact_zone_order and updates the overall status. If compaction succeeds (COMPACT_SUCCESS), the kernel resets the deferred state for the zone and stops further compaction attempts, expecting allocation to succeed. If compaction is either complete or partially skipped in non-async mode, the zone is marked for deferred compaction, preventing further unnecessary attempts. If compaction is interrupted due to rescheduling or a fatal signal, the function exits early to avoid system overload.
  
  #ldos-algo
4. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  if (cc->nr_freepages > 0) { unsigned long free_pfn = release_freepages(&cc->freepages); cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); /* The cached pfn is always the first in a pageblock */ free_pfn = pageblock_start_pfn(free_pfn); /* * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() */ if (free_pfn > cc->zone->compact_cached_free_pfn) cc->zone->compact_cached_free_pfn = free_pfn; }
  
  This part ensures efficient release of free pages back into the system. If there are free pages to be released (cc->nr_freepages > 0), the function release_freepages is called, and the first page frame number (PFN) in the pageblock is recorded. The cached PFN is then updated to point to the start of the newly freed pageblock. A critical check (VM_BUG_ON) ensures that the process does not operate on invalid or uninitialized page ranges.
  
  This ensures that the cached free PFN is always updated in a conservative manner i.e. only moving backward, not forward. This avoids skipping any potential free pages. This helps optimize subsequent compaction attempts by ensuring that future operations start from the correct position.
  
  #ldos-algo
5. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  if (err) { putback_movable_pages(&cc->migratepages); /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it */ if (err == -ENOMEM && !compact_scanners_met(cc)) { ret = COMPACT_CONTENDED; goto out; } /* * If an ASYNC or SYNC_LIGHT fails to migrate a page * within the pageblock_order-aligned block and * fast_find_migrateblock may be used then scan the * remainder of the pageblock. This will mark the * pageblock "skip" to avoid rescanning in the near * future. This will isolate more pages than necessary * for the request but avoid loops due to * fast_find_migrateblock revisiting blocks that were * recently partially scanned. */ if (!pageblock_aligned(cc->migrate_pfn) && !cc->ignore_skip_hint && !cc->finish_pageblock && (cc->mode < MIGRATE_SYNC)) { cc->finish_pageblock = true; /* * Draining pcplists does not help THP if * any page failed to migrate. Even after * drain, the pageblock will not be free. */ if (cc->order == COMPACTION_HPAGE_ORDER) last_migrated_pfn = 0; goto rescan; } }
  
  This block of code handles errors during page migration by first resetting the state to prevent inconsistencies. If a migration error occurs, the kernel puts back the movable pages. If the error is due to memory shortage (-ENOMEM), the process checks if the migration and free page scanners have met. If not, compaction is terminated early with COMPACT_CONTENDED. For ASYNC or SYNC_LIGHT modes, if the migration fails within an unaligned pageblock, the kernel marks the block for skipping to avoid unnecessary rescanning. Thus, improving efficiency by isolating more pages than required and preventing revisits to partially scanned blocks. For transparent huge pages (THP), it ensures the failed pageblocks are not reused.
  
  #ldos-algo
6. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && !free_area_empty(area, MIGRATE_CMA)) return COMPACT_SUCCESS; #endif
  
  This block allows the kernel's memory compaction algorithm to consider MIGRATE_CMA pages as fallback options for MIGRATE_MOVABLE allocations when the Contiguous Memory Allocator (CMA) is enabled (CONFIG_CMA is defined).
  
  When CONFIG_CMA is enabled, the compaction process becomes more flexible by allowing the use of free CMA pages if standard movable pages are unavailable, potentially improving allocation success rates.
  
  If CONFIG_CMA is disabled, this fallback mechanism is omitted, ensuring that CMA regions remain dedicated to their primary purpose of providing contiguous memory blocks for devices that require them.
  
  https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L895
  
  #ldos-config
7. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  /* * compaction_suitable: Is this suitable to run compaction on this zone now? */ bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) { enum compact_result compact_result; bool suitable; suitable = __compaction_suitable(zone, order, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation * * index of -1000 would imply allocations might succeed depending on * watermarks, but we already failed the high-order watermark check * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * * Only compact if a failure would be due to fragmentation. Also * ignore fragindex for non-costly orders where the alternative to * a successful reclaim/compaction is OOM. Fragindex and the * vm.extfrag_threshold sysctl is meant as a heuristic to prevent * excessive compaction for costly orders, but it should not be at the * expense of system stability. */ if (suitable) { compact_result = COMPACT_CONTINUE; if (order > PAGE_ALLOC_COSTLY_ORDER) { int fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) { suitable = false; compact_result = COMPACT_NOT_SUITABLE_ZONE; } } } else { compact_result = COMPACT_SKIPPED; } trace_mm_compaction_suitable(zone, order, compact_result); return suitable; }
  
  This function decides whether to initiate memory compaction on a particular zone based on system conditions like the fragmentation index and the external fragmentation threshold (sysctl_extfrag_threshold). It determines whether compaction would be effective or should be skipped. For high-order allocations, if the fragmentation index is below the threshold, the function opts to avoid unnecessary compaction, conserving system resources and preventing potential performance degradation.
  
  This enhances linux's memory management efficiency by preventing futile compaction attempts; such as in low-memory situations. Initiating compaction only when fragmentation is the primary issue, helps the system optimize CPU utilization. This should improve memory allocation success rates.
  
  #ldos-algo
8. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  static unsigned long fast_find_migrateblock(struct compact_control *cc) { unsigned int limit = freelist_scan_limit(cc); unsigned int nr_scanned = 0; unsigned long distance; unsigned long pfn = cc->migrate_pfn; unsigned long high_pfn; int order; bool found_block = false; /* Skip hints are relied on to avoid repeats on the fast search */ if (cc->ignore_skip_hint) return pfn; /* * If the pageblock should be finished then do not select a different * pageblock. */ if (cc->finish_pageblock) return pfn; /* * If the migrate_pfn is not at the start of a zone or the start * of a pageblock then assume this is a continuation of a previous * scan restarted due to COMPACT_CLUSTER_MAX. */ if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) return pfn; /* * For smaller orders, just linearly scan as the number of pages * to migrate should be relatively small and does not necessarily * justify freeing up a large block for a small allocation. */ if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) return pfn; /* * Only allow kcompactd and direct requests for movable pages to * quickly clear out a MOVABLE pageblock for allocation. This * reduces the risk that a large movable pageblock is freed for * an unmovable/reclaimable small allocation. */ if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) return pfn; /* * When starting the migration scanner, pick any pageblock within the * first half of the search space. Otherwise try and pick a pageblock * within the first eighth to reduce the chances that a migration * target later becomes a source. */ distance = (cc->free_pfn - cc->migrate_pfn) >> 1; if (cc->migrate_pfn != cc->zone->zone_start_pfn) distance >>= 2; high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); for (order = cc->order - 1; order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit; order--) { struct free_area *area = &cc->zone->free_area[order]; struct list_head *freelist; unsigned long flags; struct page *freepage; if (!area->nr_free) continue; spin_lock_irqsave(&cc->zone->lock, flags); freelist = &area->free_list[MIGRATE_MOVABLE]; list_for_each_entry(freepage, freelist, buddy_list) { unsigned long free_pfn; if (nr_scanned++ >= limit) { move_freelist_tail(freelist, freepage); break; } free_pfn = page_to_pfn(freepage); if (free_pfn < high_pfn) { /* * Avoid if skipped recently. Ideally it would * move to the tail but even safe iteration of * the list assumes an entry is deleted, not * reordered. */ if (get_pageblock_skip(freepage)) continue; /* Reorder to so a future search skips recent pages */ move_freelist_tail(freelist, freepage); update_fast_start_pfn(cc, free_pfn); pfn = pageblock_start_pfn(free_pfn); if (pfn < cc->zone->zone_start_pfn) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; break; } } spin_unlock_irqrestore(&cc->zone->lock, flags); } cc->total_migrate_scanned += nr_scanned; /* * If fast scanning failed then use a cached entry for a page block * that had free pages as the basis for starting a linear scan. */ if (!found_block) { cc->fast_search_fail++; pfn = reinit_migrate_pfn(cc); } return pfn; }
  
  This function helps determine when and how to perform a fast search for a suitable migration block during memory compaction. It evaluates predicates such as skipping the fast search if skip hints are ignored (if (cc->ignore_skip_hint) return pfn;), if the current scan should be finished (if (cc->finish_pageblock) return pfn;), if the migration pointer isn't at the start of a pageblock (if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) return pfn;), or if the allocation order is smlal (if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) return pfn;). It also restricts fast searches to movable pages during direct compaction for higher-order allocations (if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) return pfn;). Based on these conditions, the function performs actions like initiating a fast search under specific circumstances to quickly locate suitable blocks, limiting the number of pages scanned to conserve CPU resources (if (nr_scanned++ >= limit) { move_freelist_tail(freelist, freepage); break; }), and adjusting scanning strategies based on previous successes or failures to enhance adaptability (if (!found_block) { cc->fast_search_fail++; pfn = reinit_migrate_pfn(cc); }).
  
  #ldos-algo
9. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_SPARSEMEM
  
  #ifdef CONFIG_SPARSEMEM conditionally includes or excludes the implementation of the skip_offline_sections and skip_offline_sections_reverse functions based on whether the CONFIG_SPARSEMEM option is enabled in the kernel configuration. CONFIG_SPARSEMEM enables the the kernel to support sparse memory systems where memory sections can be dynamically online or offline. They provide logic to skip over offline memory sections during operations like memory compaction, ensuring that the system does not attempt to access or manipulate memory that is not currently available.
  
  If CONFIG_SPARSEMEM is not defined, these functions return 0, effectively disabling the handling of offline memory sections.
  
  https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L461
  
  #ldos-config
10. rwik2000_UTCS 04 Oct 2024
  
  in Public
  
  #ifdef CONFIG_COMPACTION
  
  This option acts as a policy control mechanism that determines whether the memory compaction feature is included in the kernel build. By including the compaction-related code within #ifdef CONFIG_COMPACTION and #endif [Line 522], the code conditionally compiles these sections based on the configuration setting. This affects the kernel's behavior regarding memory management and fragmentation handling. CONFIG_COMPACTION is defined in https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L637 and default is set to Yes or True
  
  #ldos-config
11. anishp123 03 Oct 2024
  
  in Public
  
  static bool kcompactd_node_suitable(pg_data_t *pgdat) { int zoneid; struct zone *zone; enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; /* Allocation can already succeed, check other zones */ if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, min_wmark_pages(zone), highest_zoneidx, 0)) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, highest_zoneidx)) return true; } return false; }
  
  This function determines whether a memory node is suitable for compaction by the kcompactd daemon. It iterates through memory zones up to a configured highest index, checking if compaction is necessary and beneficial. The policy skips unpopulated zones and those where allocation can already succeed without compaction.
  
  ldos-algo
12. anishp123 03 Oct 2024
  
  in Public
  
  static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc, nid; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; if (write && sysctl_compaction_proactiveness) { for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); if (pgdat->proactive_compact_trigger) continue; pgdat->proactive_compact_trigger = true; trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1, pgdat->nr_zones - 1); wake_up_interruptible(&pgdat->kcompactd_wait); } } return 0; }
  
  This function implements configuration and policy logic for proactive memory compaction. It supports the sysctl handler, and provides runtime configuration of compaction proactiveness. When the proactiveness setting is enabled, the function applies a policy to trigger proactive compaction across all online memory nodes. It does this by setting a trigger flag and waking up the kcompactd daemon for each node that hasn't already been triggered.
  
  ldos-config ldos-algo
13. anishp123 03 Oct 2024
  
  in Public
  
  if (suitable) {
  
  This policy is based off of the allocation order, zone characteristics, and memory fragmentation level to decide whether compaction should happen. For high-order allocations, it uses a fragmentation index and a configurable threshold to decide if compaction would be beneficial. The policy aims to balance the need for contiguous memory with system performance, avoiding unnecessary compaction that could impact system stability.
  
  ldos-algo
14. anishp123 03 Oct 2024
  
  in Public
  
  watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx, ALLOC_CMA, wmark_target);
  
  The __compaction_suitable function determines if memory compaction is viable for a given zone based on its current watermark levels and the requested allocation order. It calculates a watermark threshold, which varies depending on the allocation size, and includes a compaction gap for efficiency. The function then checks if the zone meets this watermark using __zone_watermark_ok, considering factors like CMA pages and zone indices.
  
  ldos-algo
15. anishp123 03 Oct 2024
  
  in Public
  
  static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; const int migratetype = cc->migratetype; int ret; /* Compaction run completes if the migrate and free scanner meet */ if (compact_scanners_met(cc)) { /* Let the next compaction start anew. */ reset_cached_positions(cc->zone); /* * Mark that the PG_migrate_skip information should be cleared * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. */ if (cc->direct_compaction) cc->zone->compact_blockskip_flush = true; if (cc->whole_zone) return COMPACT_COMPLETE; else return COMPACT_PARTIAL_SKIPPED; } if (cc->proactive_compaction) { int score, wmark_low; pg_data_t *pgdat; pgdat = cc->zone->zone_pgdat; if (kswapd_is_running(pgdat)) return COMPACT_PARTIAL_SKIPPED; score = fragmentation_score_zone(cc->zone); wmark_low = fragmentation_score_wmark(true); if (score > wmark_low) ret = COMPACT_CONTINUE; else ret = COMPACT_SUCCESS; goto out; } if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; /* * Always finish scanning a pageblock to reduce the possibility of * fallbacks in the future. This is particularly important when * migration source is unmovable/reclaimable but it's not worth * special casing. */ if (!pageblock_aligned(cc->migrate_pfn)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; bool can_steal; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) return COMPACT_SUCCESS;
  
  This function has policy logic for when memory compaction should complete, considering whether migrate and free scanners have met, proactive compactions status, fragmentation scores, and availability of free pages. The function returns different COMPACT_* status codes based on these criteria, effectively deciding whether to continue compaction, consider it successful, or terminate the process. This policy balances thorough memory reorganization with efficient use of system resources during compaction.
  
  ldos-algo
16. anishp123 03 Oct 2024
  
  in Public
  
  #ifdef CONFIG_COMPACTION static bool suitable_migration_source(struct compact_control *cc, struct page *page) { int block_mt; if (pageblock_skip_persistent(page)) return false; if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; block_mt = get_pageblock_migratetype(page); if (cc->migratetype == MIGRATE_MOVABLE) return is_migrate_movable(block_mt); else return block_mt == cc->migratetype; }
  
  This code snippet from the Linux kernel's memory compaction system implements policy and configuration logic for determining suitable migration sources during compaction. It's conditionally compiled based on the CONFIG_COMPACTION option, demonstrating configuration-dependent behavior. The suitable_migration_source function encapsulates policy decisions by considering factors such as persistent skip flags, compaction mode, and migration types. It applies different criteria for async direct compaction versus other modes, and implements specific rules for matching migration types, with special handling for movable pages.
  
  ldos-algo ldos-config
17. anishp123 03 Oct 2024
  
  in Public
  
  /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct compact_control *cc) { pg_data_t *pgdat = cc->zone->zone_pgdat; bool too_many; unsigned long active, inactive, isolated; inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_ANON); active = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_ACTIVE_ANON); isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + node_page_state(pgdat, NR_ISOLATED_ANON); /* * Allow GFP_NOFS to isolate past the limit set for regular * compaction runs. This prevents an ABBA deadlock when other * compactors have already isolated to the limit, but are * blocked on filesystem locks held by the GFP_NOFS thread. */ if (cc->gfp_mask & __GFP_FS) { inactive >>= 3; active >>= 3; } too_many = isolated > (inactive + active) / 2; if (!too_many) wake_throttle_isolated(pgdat); return too_many; }
  
  This policy determines when too many pages have been isolated during kernel compaction and calculates a threshold based on the number of active and inactive pages relative to isolated pages. It handles GFP_NOFS allocations specifically differently so as to prevent deadlocks.
  
  ldos-algo
18. anishp123 03 Oct 2024
  
  in Public
  
  /* * Compound pages of >= pageblock_order should consistently be skipped until * released. It is always pointless to compact pages of such order (if they are * migratable), and the pageblocks they occupy cannot contain any free pages. */ static bool pageblock_skip_persistent(struct page *page) { if (!PageCompound(page)) return false; page = compound_head(page); if (compound_order(page) >= pageblock_order) return true; return false; }
  
  This policy optimizes memory compaction by deferring compaction on large compound pages. It determines if they are of a size greater than or equal to pageblock_order and skips them if so. For non-compound pages, it returns false, meaning it shouldn't be deferred.
  
  ldos-algo
19. anishp123 03 Oct 2024
  
  in Public
  
  /* Returns true if compaction should be skipped this time */ static bool compaction_deferred(struct zone *zone, int order) { unsigned long defer_limit = 1UL << zone->compact_defer_shift; if (order < zone->compact_order_failed) return false; /* Avoid possible overflow */ if (++zone->compact_considered >= defer_limit) { zone->compact_considered = defer_limit; return false; } trace_mm_compaction_deferred(zone, order); return true; }
  
  This policy is for deferring memory compaction in Linux. It defers compaction if the requested order is > than the previously failed order and there hasn't been more than some limit of comparisons recently, so that compaction attempts will stop if repeatedly unsuccessful.
  
  ldos-algo
Visit annotations in context

Tags

ldos-algo

ldos-config

#ldos-algo

#ldos-config

Annotators

rwik2000_UTCS

anishp123

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/compaction.c
elixir.bootlin.com elixir.bootlin.com

khugepaged.c - mm/khugepaged.c - Linux source code v6.6.42 - Bootlin

2
1. kelly_w 04 Oct 2024
  
  in Public
  
  if (khugepaged_has_work()) { const unsigned long scan_sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs); if (!scan_sleep_jiffies) return; khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, khugepaged_should_wakeup(), scan_sleep_jiffies); return; } if (hugepage_flags_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
  
  khugepaged thread sleeps when khugepaged does not have work (khugepaged_scan list is empty). Sleeps for khugepaged_scan_sleep_millisecs if has work
  
  #ldos-config
2. kelly_w 04 Oct 2024
  
  in Public
  
  if (hugepage_flags_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL,
  
  hugeoage_flags_enabled() flag will turn on hupe page for all mappings. transparent_hugepage_flags is disabled by default. If enabled, khugepaged thread will start running.
  
  #ldos-config
Visit annotations in context

Tags

#ldos-config

Annotators

kelly_w

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/khugepaged.c
elixir.bootlin.com elixir.bootlin.com

swapfile.c - mm/swapfile.c - Linux source code v6.6.42 - Bootlin

10
1. nickorlo 04 Oct 2024
  
  in Public
  
  if (si->cluster_info) { if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; } else if (unlikely(!si->cluster_nr--)) {
  
  Algorithmic policy decision: If SSD, use SSD wear-leveling friendly algorithm. Otherwise, use HDD algo which minimizes seek times. Potentially, certain access patterns may make one of these algorithms less effective (i.e. in the case of wear leveling, the swap is constantly full)
  
  #ldos #ldos-policy-use #ldos-algo
2. nickorlo 04 Oct 2024
  
  in Public
  
  scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ for (; last_in_cluster <= si->highest_bit; offset++) { if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } }
  
  Here, (when using HDDs), a policy is implemented that places the swapped page in the first available slot. This is supposed to reduce seek time in spinning drives, as it encourages having swap entries near each other.
  
  #ldos #ldos-config #ldos-policy-use
3. nickorlo 04 Oct 2024
  
  in Public
  
  /* * Even if there's no free clusters available (fragmented), * try to scan a little more quickly with lock held unless we * have scanned too many slots already. */ if (!scanned_many) { unsigned long scan_limit; if (offset < scan_base) scan_limit = scan_base; else scan_limit = si->highest_bit; for (; offset <= scan_limit && --latency_ration > 0; offset++) { if (!si->swap_map[offset]) goto checks; } }
  
  Here we have a configuration policy where we do another smaller scan as long as we haven't exhausted our latency_ration. Another alternative could be yielding early in anticipation that we aren't going to find a free slot.
  
  #ldos #ldos-config #ldos-policy-use
4. nickorlo 04 Oct 2024
  
  in Public
  
  /* * select a random position to start with to help wear leveling * SSD */ for_each_possible_cpu(cpu) { per_cpu(*p->cluster_next_cpu, cpu) = get_random_u32_inclusive(1, p->highest_bit); }
  
  An algorithmic (random) policy is used here to spread swap pages around an SSD to help with wear leveling instead of writing to the same area of an SSD often.
  
  #ldos #ldos-policy-use #ldos-algo
5. nickorlo 04 Oct 2024
  
  in Public
  
  cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
  
  A policy decision is made to add a cluster to the discard list on a first-come, first-served basis. However, this approach could be enhanced by prioritizing certain clusters higher on the list based on their 'importance.' This 'importance' could be defined by how closely a cluster is related to other pages in the swap. By doing so, the system can reduce seek time as mentioned on line 817.
  
  #ldos #ldos-config #ldos-policy-use
6. nickorlo 04 Oct 2024
  
  in Public
  
  if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } if (swap_offset_available_and_locked(si, offset)) goto checks; } offset = si->lowest_bit; while (offset < scan_base) { if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; }
  
  Here, a policy decision is made to fully replenish the latency_ration with the LATENCY_LIMIT and then yield back to the scheduler if we've exhausted it. This makes it so that when scheduled again, we have the full LATENCY_LIMIT to do a scan. Alternative policies could grow/shrink this to find a better heuristic instead of fully replenishing each time.
  
  Marked config/value as awe're replacing latency_ration with a compiletime-defined limit.
  
  #ldos #ldos-policy-use #ldos-config #ldos-value
7. nickorlo 04 Oct 2024
  
  in Public
  
  while (scan_swap_map_ssd_cluster_conflict(si, offset)) { /* take a break if we already got some slots */ if (n_ret) goto done; if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan;
  
  Here, a policy decision is made to stop scanning if some slots were already found. Other policy decisions could be made to keep scanning or take into account how long the scan took or how many pages were found.
  
  #ldos #ldos-config #ldos-policy-use
8. nickorlo 04 Oct 2024
  
  in Public
  
  else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in * discarding, do discard now and reclaim them, then * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); *scan_base = this_cpu_read(*si->cluster_next_cpu); *offset = *scan_base; goto new_cluster;
  
  This algorithmic policy discards + reclaims pages as-needed whenever there is no free cluster. Other policies could be explored that do this preemptively in order to avoid the cost of doing it here.
  
  #ldos #ldos-policy-use #ldos-algo
9. nickorlo 03 Oct 2024
  
  in Public
  
  #ifdef CONFIG_THP_SWAP
  
  This is a build-time flag that configures how hugepages are handled when swapped. When defined, it swaps them in one piece, while without it splits them into smaller units and swaps those units.
  
  #ldos #ldos-config #ldos-policy-use
10. nickorlo 03 Oct 2024
  
  in Public
  
  if (swap_flags & SWAP_FLAG_DISCARD_ONCE) p->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) p->flags &= ~SWP_AREA_DISCARD
  
  This is a configuration policy decision where a sysadmin can pass flags to sys_swapon() to control the behavior discards are handled. If DISCARD_ONCE is set, a flag which "discard[s] swap area at swapon-time" is unset, and if DISCARD_PAGES is set, a flag which "discard[s] page-clusters after use" is unset.
  
  #ldos #ldos-config #ldos-policy-use
Visit annotations in context

Tags

#ldos

#ldos-value

#ldos-policy-use

#ldos-algo

#ldos-config

Annotators

nickorlo

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/swapfile.c
elixir.bootlin.com elixir.bootlin.com

page_owner.c - mm/page_owner.c - Linux source code v6.6.42 - Bootlin

3
1. salvadorrh10 03 Oct 2024
  
  in Public
  
  static const struct file_operations proc_page_owner_operations = { .read = read_page_owner, .llseek = lseek_page_owner, };
  
  Configuration policy that defines an interface for user space to access kernel page ownership data. Configuring how users can read and seek through the page information.
  
  #ldos-config
2. salvadorrh10 03 Oct 2024
  
  in Public
  
  /* * Some pages could be missed by concurrent allocation or free, * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) goto ext_put_continue; page_owner = get_page_owner(page_ext); /* * Don't print "tail" pages of high-order allocations as that * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) goto ext_put_continue;
  
  Policy that ensures pages have ownership info, are valid and allocated.
  
  In the first if statement it skip pages without owners, the second one checks that the pages are not currently allocated. And finally it skip tail pages.
  
  #ldos-algo
3. salvadorrh10 03 Oct 2024
  
  in Public
  
  /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++;
  
  This policy ensures efficient traversal of large memory areas speeding up memory management. It avoid redundant checks, It ensures only valid memory pages are processed.
  
  #ldos-algo
Visit annotations in context

Tags

#ldos-algo

#ldos-config

Annotators

salvadorrh10

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/page_owner.c
elixir.bootlin.com elixir.bootlin.com

workingset.c - mm/workingset.c - Linux source code v6.6.42 - Bootlin

2
1. anishp123 03 Oct 2024
  
  in Public
  
  static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc)
  
  The policy in this function governs the management of shadow nodes in the Linux kernel's page cache. It dynamically calculates a maximum allowable number of shadow nodes based on the available memory, aiming to maintain an optimal balance between memory usage and the ability to detect page refaults. The policy sets this limit at approximately 1/8th of the total possible pages, adapting to different memory configurations and pressure scenarios. When the current number of shadow nodes exceeds this calculated maximum, the policy determines how many nodes should be reclaimed.
  
  ldos-algo
2. anishp123 03 Oct 2024
  
  in Public
  
  f (lru_gen_enabled()) return lru_gen_eviction(folio);
  
  When enabled via lru_gen_enabled(), it alters how the kernel tracks page access patterns and manages evictions. This policy introduces the concept of generation-based tracking, where pages are grouped into generations based on their access time. It uses functions like lru_gen_eviction() and lru_gen_refault() to handle page evictions and refaults respectively. This approach allows for more nuanced decisions in page reclaim, potentially improving memory utilization and reducing I/O operations by better identifying truly cold pages and protecting hot pages from premature eviction.
  
  ldos-algo
Visit annotations in context

Tags

ldos-algo

Annotators

anishp123

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/workingset.c
elixir.bootlin.com elixir.bootlin.com

pagewalk.c - mm/pagewalk.c - Linux source code v6.6.42 - Bootlin

1
1. jp65254 03 Oct 2024
  
  in Public
  
  if (vma->vm_flags & VM_PFNMAP) { int err = 1; if (ops->pte_hole) err = ops->pte_hole(start, end, -1, walk);
  
  #ldos-config
Visit annotations in context

Tags

#ldos-config

Annotators

jp65254

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/pagewalk.c
elixir.bootlin.com elixir.bootlin.com

mremap.c - mm/mremap.c - Linux source code v6.6.42 - Bootlin

1
1. jp65254 03 Oct 2024
  
  in Public
  
  unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks)
  
  Simple, but optimize by size of page table. Triggered by checking CONFIG_HAVE_MOVE_PUD and CONFIG_HAVE_MOVE_PMD
  
  #ldos-config
Visit annotations in context

Tags

#ldos-config

Annotators

jp65254

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/mremap.c
elixir.bootlin.com elixir.bootlin.com

memory-tiers.c - mm/memory-tiers.c - Linux source code v6.6.42 - Bootlin

2
1. neb825 03 Oct 2024
  
  in Public
  
  static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) { int ret; bool found_slot = false; struct memory_tier *memtier, *new_memtier; int adistance = memtype->adistance; unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE; lockdep_assert_held_once(&memory_tier_lock); adistance = round_down(adistance, memtier_adistance_chunk_size); /* * If the memtype is already part of a memory tier, * just return that. */ if (!list_empty(&memtype->tier_sibiling)) { list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) return memtier; } WARN_ON(1); return ERR_PTR(-EINVAL); } list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) { goto link_memtype; } else if (adistance < memtier->adistance_start) { found_slot = true; break; } } new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL); if (!new_memtier) return ERR_PTR(-ENOMEM); new_memtier->adistance_start = adistance; INIT_LIST_HEAD(&new_memtier->list); INIT_LIST_HEAD(&new_memtier->memory_types); if (found_slot) list_add_tail(&new_memtier->list, &memtier->list); else list_add_tail(&new_memtier->list, &memory_tiers); new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS; new_memtier->dev.bus = &memory_tier_subsys; new_memtier->dev.release = memory_tier_device_release; new_memtier->dev.groups = memtier_dev_groups; ret = device_register(&new_memtier->dev); if (ret) { list_del(&new_memtier->list); put_device(&new_memtier->dev); return ERR_PTR(ret); } memtier = new_memtier; link_memtype: list_add(&memtype->tier_sibiling, &memtier->memory_types); return memtier; }
  
  This function decides whether to assign a memory device to an existing memory tier or create a new one. The decision is based on the memory device's distance and the structure of existing memory tiers.
  
  #ldos #ldos-algo
2. neb825 03 Oct 2024
  
  in Public
  
  static void establish_demotion_targets(void) { struct memory_tier *memtier; struct demotion_nodes *nd; int target = NUMA_NO_NODE, node; int distance, best_distance; nodemask_t tier_nodes, lower_tier; lockdep_assert_held_once(&memory_tier_lock); if (!node_demotion) return; disable_all_demotion_targets(); for_each_node_state(node, N_MEMORY) { best_distance = -1; nd = &node_demotion[node]; memtier = __node_get_memory_tier(node); if (!memtier || list_is_last(&memtier->list, &memory_tiers)) continue; /* * Get the lower memtier to find the demotion node list. */ memtier = list_next_entry(memtier, list); tier_nodes = get_memtier_nodemask(memtier); /* * find_next_best_node, use 'used' nodemask as a skip list. * Add all memory nodes except the selected memory tier * nodelist to skip list so that we find the best node from the * memtier nodelist. */ nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes); /* * Find all the nodes in the memory tier node list of same best distance. * add them to the preferred mask. We randomly select between nodes * in the preferred mask when allocating pages during demotion. */ do { target = find_next_best_node(node, &tier_nodes); if (target == NUMA_NO_NODE) break; distance = node_distance(node, target); if (distance == best_distance || best_distance == -1) { best_distance = distance; node_set(target, nd->preferred); } else { break; } } while (1); } /* * Promotion is allowed from a memory tier to higher * memory tier only if the memory tier doesn't include * compute. We want to skip promotion from a memory tier, * if any node that is part of the memory tier have CPUs. * Once we detect such a memory tier, we consider that tier * as top tiper from which promotion is not allowed. */ list_for_each_entry_reverse(memtier, &memory_tiers, list) { tier_nodes = get_memtier_nodemask(memtier); nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); if (!nodes_empty(tier_nodes)) { /* * abstract distance below the max value of this memtier * is considered toptier. */ top_tier_adistance = memtier->adistance_start + MEMTIER_CHUNK_SIZE - 1; break; } } /* * Now build the lower_tier mask for each node collecting node mask from * all memory tier below it. This allows us to fallback demotion page * allocation to a set of nodes that is closer the above selected * perferred node. */ lower_tier = node_states[N_MEMORY]; list_for_each_entry(memtier, &memory_tiers, list) { /* * Keep removing current tier from lower_tier nodes, * This will remove all nodes in current and above * memory tier from the lower_tier mask. */ tier_nodes = get_memtier_nodemask(memtier); nodes_andnot(lower_tier, lower_tier, tier_nodes); memtier->lower_tier_mask = lower_tier; } }
  
  This function contains algorithmic policies that select demotion targets based on node distance, memory tier relationships, and CPU presence in memory nodes, and adjusts the demotion strategy based on runtime system conditions.
  
  #ldos #ldos-algo
Visit annotations in context

Tags

#ldos

#ldos-algo

Annotators

neb825

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/memory-tiers.c
elixir.bootlin.com elixir.bootlin.com

ksm.c - mm/ksm.c - Linux source code v6.6.42 - Bootlin

2
1. neb825 03 Oct 2024
  
  in Public
  
  enum get_ksm_page_flags { GET_KSM_PAGE_NOLOCK, GET_KSM_PAGE_LOCK, GET_KSM_PAGE_TRYLOCK };
  
  This defines the locking behavior when accessing a KSM page. The caller can choose between no locking, locking, or trying to lock without blocking, affecting how the page is accessed and modified.
  
  #ldos #ldos-config #ldos-flag
2. neb825 03 Oct 2024
  
  in Public
  
  static struct page *get_ksm_page(struct ksm_stable_node *stable_node, enum get_ksm_page_flags flags) { struct page *page; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ page = pfn_to_page(kpfn); if (READ_ONCE(page->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { /* * Another check for page->mapping != expected_mapping would * work here too. We have chosen the !PageSwapCache test to * optimize the common case, when the page is or is about to * be freed: PageSwapCache is cleared (under spin_lock_irq) * in the ref_freeze section of __remove_mapping(); but Anon * page->mapping reset to NULL later, in free_pages_prepare(). */ if (!PageSwapCache(page)) goto stale; cpu_relax(); } if (READ_ONCE(page->mapping) != expected_mapping) { put_page(page); goto stale; } if (flags == GET_KSM_PAGE_TRYLOCK) { if (!trylock_page(page)) { put_page(page); return ERR_PTR(-EBUSY); } } else if (flags == GET_KSM_PAGE_LOCK) lock_page(page); if (flags != GET_KSM_PAGE_NOLOCK) { if (READ_ONCE(page->mapping) != expected_mapping) { unlock_page(page); put_page(page); goto stale; } } return page; stale: /* * We come here from above when page->mapping or !PageSwapCache * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; }
  
  This function checks whether a KSM page is still valid by verifying its mapping, reference count, and handling page migration. It dynamically decides whether to return the page, retry, or mark the stable node as stale.
  
  #ldos #ldos-algo
Visit annotations in context

Tags

#ldos

#ldos-flag

#ldos-algo

#ldos-config

Annotators

neb825

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/ksm.c

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators