121 Matching Annotations

Apr 2025
elixir.bootlin.com elixir.bootlin.com

topology.c - kernel/sched/topology.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

15
1. Rajitb 18 Apr 2025
  
  in Public
  
  /* * Initialize sched groups cpu_capacity. * * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. * Typically cpu_capacity for all the groups in a sched domain will be same * unless there are asymmetries in the topology. If there are asymmetries, * group having more cpu_capacity will pickup more load compared to the * group having less cpu_capacity. */ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; struct cpumask *mask = sched_domains_tmpmask2; WARN_ON(!sg); do { int cpu, cores = 0, max_cpu = -1; sg->group_weight = cpumask_weight(sched_group_span(sg)); cpumask_copy(mask, sched_group_span(sg)); for_each_cpu(cpu, mask) { cores++; #ifdef CONFIG_SCHED_SMT cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); #endif } sg->cores = cores; if (!(sd->flags & SD_ASYM_PACKING)) goto next; for_each_cpu(cpu, sched_group_span(sg)) { if (max_cpu < 0) max_cpu = cpu; else if (sched_asym_prefer(cpu, max_cpu)) max_cpu = cpu; } sg->asym_prefer_cpu = max_cpu; next: sg = sg->next; } while (sg != sd->groups); if (cpu != group_balance_cpu(sg)) return; update_group_capacity(sd, cpu); } /*
  
  Little to no policy worth talking about today.
2. Rajitb 18 Apr 2025
  
  in Public
  
  for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) continue; /* Do not attempt EAS if schedutil is not being used. */ policy = cpufreq_cpu_get(i); if (!policy) goto free; gov = policy->governor; cpufreq_cpu_put(policy); if (gov != &schedutil_gov) { if (rd->pd) pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", cpumask_pr_args(cpu_map)); goto free; } /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) goto free; tmp->next = pd; pd = tmp; /* * Count performance domains and performance states for the * complexity check. */ nr_pd++; nr_ps += em_pd_nr_perf_states(pd->em_pd); } /* Bail out if the Energy Model complexity is too high. */ if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", cpumask_pr_args(cpu_map)); goto free; }
  
  Some policy going on here: 1. You iterate through all CPUs in cpu_map attempting to set up EAS, after the earlier checks for EAS, asymmetric CPU capacity and frequency invariance.
  
  If energy model of this perf domain is feasible, add it to root.
3. Rajitb 18 Apr 2025
  
  in Public
  
  /* EAS is enabled for asymmetric CPU capacity topologies. */ if (!per_cpu(sd_asym_cpucapacity, cpu)) { if (sched_debug()) { pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", cpumask_pr_args(cpu_map)); } goto free; } /* EAS definitely does *not* handle SMT */ if (sched_smt_active()) { pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", cpumask_pr_args(cpu_map)); goto free; } if (!arch_scale_freq_invariant()) { if (sched_debug()) { pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", cpumask_pr_args(cpu_map)); } goto free; }
  
  The checks aren't policy in themselves, but they indicate the requirements that must be met for the perf domains to be built, as mentioned above. I believe that makes this a policy-ish decision.
4. Rajitb 18 Apr 2025
  
  in Public
  
  #ifdef HAVE_RT_PUSH_IPI
  
  What does this even mean?
5. Rajitb 18 Apr 2025
  
  in Public
  
  /* * A system can have three types of NUMA topology: * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes * NUMA_BACKPLANE: nodes can reach other nodes through a backplane * * The difference between a glueless mesh topology and a backplane * topology lies in whether communication between not directly * connected nodes goes through intermediary nodes (where programs * could run), or through backplane controllers. This affects * placement of programs. * * The type of topology can be discerned with the following tests: * - If the maximum distance between any nodes is 1 hop, the system * is directly connected. * - If for two nodes A and B, located N > 1 hops away from each other, * there is an intermediary node C, which is < N hops away from both * nodes A and B, the system is a glueless mesh. */ static void init_numa_topology_type(int offline_node)
  
  Look at this later on.
6. Rajitb 18 Apr 2025
  
  in Public
  
  static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) {
  
  Look at this later on.
7. Rajitb 18 Apr 2025
  
  in Public
  
  sd_span = sched_domain_span(sd); cpumask_and(sd_span, cpu_map, tl->mask(cpu)); sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), "CPU capacity asymmetry not supported on SMT\n"); /* * Convert topological properties into behaviour. */ /* Don't attempt to spread across CPUs of different capacities. */ if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) sd->child->flags &= ~SD_PREFER_SIBLING; if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; sd->cache_nice_tries = 1; #ifdef CONFIG_NUMA } else if (sd->flags & SD_NUMA) { sd->cache_nice_tries = 2; sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { sd->flags &= ~(SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE); } #endif } else { sd->cache_nice_tries = 1; } /* * For all levels sharing cache; connect a sched_domain_shared * instance. */ if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->shared = *per_cpu_ptr(sdd->sds, sd_id); atomic_inc(&sd->shared->ref); atomic_set(&sd->shared->nr_busy_cpus, sd_weight); } sd->private = sdd;
  
  Policies: Asymmetry capacity/workload not supported on SMT because not having same capacity screws up SMT?
  
  If the domain shares CPU capacity (SD_SHARE_CPUCAPACITY), imbalance_pct is set to 110; otherwise, if it shares cache (SD_SHARE_PKG_RESOURCES), imbalance_pct is set to 117 and cache_nice_tries to 1. (What exactly is imbalance_pct?)
8. Rajitb 18 Apr 2025
  
  in Public
  
  /* * Topology list, bottom-up. */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_CLUSTER { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, #endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, };
  
  List of supported topology levels: SMT, cluster (CLS), MC, and DIE. I think this is important, but not policy. Check whether it is a straightforward ordered list (the code comment says bottom-up).
9. Rajitb 18 Apr 2025
  
  in Public
  
  #ifdef CONFIG_NUMA
  
  Start from here for NUMA topology.
10. Rajitb 18 Apr 2025
  
  in Public
  
  #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) void sched_init_numa(int offline_node)
  
  Another policy (maybe) code to look at?
11. Rajitb 18 Apr 2025
  
  in Public
  
  #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) /* Build perf. domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) { has_eas = true; goto match3; } } /* No match - add perf. domains for a new rd */ has_eas |= build_perf_domains(doms_new[i]);
  
  Policy that creates performance domains if EAS is enabled. This means that the groups of CPUs inside each domain are governed by schedutil + EAS. Schedutil is the only CPU governor that fits EAS.
  
  #ldos-config #ldos-ifdef-code
12. Rajitb 18 Apr 2025
  
  in Public
  
  #define EM_MAX_COMPLEXITY 2048
  
  Why was this number chosen?
13. Rajitb 18 Apr 2025
  
  in Public
  
  #ifdef CONFIG_PROC_SYSCTL
  
  Allows parameters to be modified through procfs. I haven't figured out what the parameters are.
  
  Comments for this do it more justice than i could: The sysctl interface provides a means of dynamically changing certain kernel parameters and variables on the fly without requiring a recompile of the kernel or reboot of the system. The primary interface is through /proc/sys.
14. Rajitb 18 Apr 2025
  
  in Public
  
  #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); static unsigned int sysctl_sched_energy_aware = 1; static DEFINE_MUTEX(sched_energy_mutex); static bool sched_energy_update;
  
  Config, knobs for EAS.
15. Rajitb 18 Apr 2025
  
  in Public
  
  /* * Set up scheduler domains and groups. For now this just excludes isolated * CPUs, but could be used to exclude other special cases in the future. */ int __init sched_init_domains(const struct cpumask *cpu_map) { int err; zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); arch_update_cpu_topology(); asym_cpu_capacity_scan(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); return err; }
  
  Highest order in tracing function calls. Used in SMP in scheduling.
  
  Can use as starting point. No policy.
Visit annotations in context

Tags

#ldos-ifdef-code

#ldos-config

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/kernel/sched/topology.c
elixir.bootlin.com elixir.bootlin.com

zswap.c - mm/zswap.c - Linux source code v6.6.42 - Bootlin

1
1. Rajitb 16 Apr 2025
  
  in Public
  
  if (!zswap_non_same_filled_pages_enabled) goto freepage;
  
  Don't store the 'bigger', non-optimizable pages (pages that are not same-filled) when zswap_non_same_filled_pages_enabled is off.
  
  #ldos-flag
Visit annotations in context

Tags

#ldos-flag

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/zswap.c
elixir.bootlin.com elixir.bootlin.com

migrate.c - mm/migrate.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

4
1. Rajitb 04 Apr 2025
  
  in Public
  
  int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) { pg_data_t *pgdat = NODE_DATA(node); int isolated; int nr_remaining; unsigned int nr_succeeded; LIST_HEAD(migratepages); int nr_pages = thp_nr_pages(page); /* * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ if (page_mapcount(page) != 1 && page_is_file_lru(page) && (vma->vm_flags & VM_EXEC)) goto out; /* * Also do not migrate dirty pages as not all filesystems can move * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. */ if (page_is_file_lru(page) && PageDirty(page)) goto out; isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) goto out; list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, NULL, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED, &nr_succeeded); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), -nr_pages); putback_lru_page(page); } isolated = 0; } if (nr_succeeded) { count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node)) mod_node_page_state(pgdat, PGPROMOTE_SUCCESS, nr_succeeded); } BUG_ON(!list_empty(&migratepages)); return isolated; out: put_page(page); return 0; }
  
  Migrate misplaced pages to the required node. Used through node/NUMA balancing. Policy ish with isolate. Order is isolate -> migrate else putback.
2. Rajitb 04 Apr 2025
  
  in Public
  
  static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int nr_pages = thp_nr_pages(page); int order = compound_order(page); VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); /* Do not migrate THP mapped by multiple processes */ if (PageTransHuge(page) && total_mapcount(page) > 1) return 0; /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, nr_pages)) { int z; if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)) return 0; for (z = pgdat->nr_zones - 1; z >= 0; z--) { if (managed_zone(pgdat->node_zones + z)) break; } /* * If there are no managed zones, it should not proceed * further. */ if (z < 0) return 0; wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); return 0; } if (!isolate_lru_page(page)) return 0; mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), nr_pages); /* * Isolating the page has taken another reference, so the * caller's reference can be safely dropped without the page * disappearing underneath us during migration. */ put_page(page); return 1; }
  
  Most likely some resemblance of policy.
3. Rajitb 04 Apr 2025
  
  in Public
  
  #ifdef CONFIG_NUMA_BALANCING
  
  automatic page migration to nodes that are closer to CPU
4. Rajitb 04 Apr 2025
  
  in Public
  
  */ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, unsigned long nr_pages, const void __user * __user *pages, const int __user *nodes, int __user *status, int flags) { compat_uptr_t __user *compat_pages = (void __user *)pages; int current_node = NUMA_NO_NODE; LIST_HEAD(pagelist); int start, i; int err = 0, err1; lru_cache_disable(); for (i = start = 0; i < nr_pages; i++) { const void __user *p; int node; err = -EFAULT; if (in_compat_syscall()) { compat_uptr_t cp; if (get_user(cp, compat_pages + i)) goto out_flush; p = compat_ptr(cp); } else { if (get_user(p, pages + i)) goto out_flush; } if (get_user(node, nodes + i)) goto out_flush; err = -ENODEV; if (node < 0 || node >= MAX_NUMNODES) goto out_flush; if (!node_state(node, N_MEMORY)) goto out_flush; err = -EACCES; if (!node_isset(node, task_nodes)) goto out_flush; if (current_node == NUMA_NO_NODE) { current_node = node; start = i; } else if (node != current_node) { err = move_pages_and_store_status(mm, current_node, &pagelist, status, start, i, nr_pages); if (err) goto out; start = i; current_node = node; } /* * Errors in the page lookup or isolation are not fatal and we simply * report them via status */ err = add_page_for_migration(mm, p, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); if (err > 0) { /* The page is successfully queued for migration */ continue; } /* * The move_pages() man page does not have an -EEXIST choice, so * use -EFAULT instead. */ if (err == -EEXIST) err = -EFAULT; /* * If the page is already on the target node (!err), store the * node, otherwise, store the err. */ err = store_status(status, i, err ? 
: current_node, 1); if (err) goto out_flush; err = move_pages_and_store_status(mm, current_node, &pagelist, status, start, i, nr_pages); if (err) { /* We have accounted for page i */ if (err > 0) err--; goto out; } current_node = NUMA_NO_NODE; } out_flush: /* Make sure we do not overwrite the existing error */ err1 = move_pages_and_store_status(mm, current_node, &pagelist, status, start, i, nr_pages); if (err >= 0) err = err1; out: lru_cache_enable(); return err; }
  
  Goes through nr_pages entries, reading each page's target node and checking that the page can be moved to that node.
Visit annotations in context

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/migrate.c
Mar 2025
elixir.bootlin.com elixir.bootlin.com

migrate.c - mm/migrate.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

3
1. Rajitb 21 Mar 2025
  
  in Public
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR #else #define NR_MAX_BATCHED_MIGRATION 512 #endif #define NR_MAX_MIGRATE_PAGES_RETRY 10 #define NR_MAX_MIGRATE_ASYNC_RETRY 3 #define NR_MAX_MIGRATE_SYNC_RETRY \ (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
  
  Policy that determines the batch of pages that can be migrated in one migration cycle, and also specifies the number of retries allowed for re-migration (async vs. sync).
  
  #ldos-high-confidence #ldos-value
2. Rajitb 21 Mar 2025
  
  in Public
  
  static int kernel_move_pages(pid_t pid, unsigned long nr_pages, const void __user * __user *pages, const int __user *nodes, int __user *status, int flags) { struct mm_struct *mm; int err; nodemask_t task_nodes; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; mm = find_mm_struct(pid, &task_nodes); if (IS_ERR(mm)) return PTR_ERR(mm); if (nodes) err = do_pages_move(mm, task_nodes, nr_pages, pages, nodes, status, flags); else err = do_pages_stat(mm, nr_pages, pages, status); mmput(mm);
  
  If you look below, this is a syscall for moving pages in userspace. Not a policy but good for testing func. calls by using it for workloads.
3. Rajitb 21 Mar 2025
  
  in Public
  
  int migrate_pages(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int rc, rc_gather; int nr_pages; struct folio *folio, *folio2; LIST_HEAD(folios); LIST_HEAD(ret_folios); LIST_HEAD(split_folios); struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); memset(&stats, 0, sizeof(stats)); rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; again: nr_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { /* Retried hugetlb folios will be kept in list */ if (folio_test_hugetlb(folio)) { list_move_tail(&folio->lru, &ret_folios); continue; } nr_pages += folio_nr_pages(folio); if (nr_pages >= NR_MAX_BATCHED_MIGRATION) break; } if (nr_pages >= NR_MAX_BATCHED_MIGRATION) list_cut_before(&folios, from, &folio2->lru); else list_splice_init(from, &folios); if (mode == MIGRATE_ASYNC) rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios, &split_folios, &stats, NR_MAX_MIGRATE_PAGES_RETRY); else rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios, &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; list_splice_tail(&split_folios, &ret_folios); goto out; } if (!list_empty(&split_folios)) { /* * Failure isn't counted since all split folios of a large folio * is counted as 1 failure already. And, we only try to migrate * with minimal effort, force MIGRATE_ASYNC mode and retry once. 
*/ migrate_pages_batch(&split_folios, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); list_splice_tail_init(&split_folios, &ret_folios); } rc_gather += rc; if (!list_empty(from)) goto again; out: /* * Put the permanent failure folio back to migration list, they * will be put back to the right list by the caller. */ list_splice(&ret_folios, from); /* * Return 0 in case all split folios of fail-to-migrate large folios * are migrated successfully. */ if (list_empty(from)) rc_gather = 0; count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, stats.nr_thp_succeeded, stats.nr_thp_failed, stats.nr_thp_split, mode, reason); if (ret_succeeded) *ret_succeeded = stats.nr_succeeded; return rc_gather; }
  
  Might.... be main func for migrating pages maintained by a folio to another free folio. No indication of policy... Except for the macros. Also worth looking into how list data structs are used. Overall not an interesting func despite linux kernel documentation (not the comments in here)
  
  #ldos-low-confidence
Visit annotations in context

Tags

#ldos-value

#ldos-low-confidence

#ldos-high-confidence

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/migrate.c
Feb 2025
elixir.bootlin.com elixir.bootlin.com

z3fold.c - mm/z3fold.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

18
1. Rajitb 28 Feb 2025
  
  in Public
  
  if (can_sleep) { lock_page(page); __SetPageMovable(page, &z3fold_mops); unlock_page(page); } else { WARN_ON(!trylock_page(page)); __SetPageMovable(page, &z3fold_mops); unlock_page(page); }
  
  why are we making this page migratable? and why can we lock if we can sleep?
2. Rajitb 28 Feb 2025
  
  in Public
  
  static int z3fold_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode)
  
  Tries to move contents from one page to another
3. Rajitb 28 Feb 2025
  
  in Public
  
  static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; }
  
  Interesting formula to convert a size in bytes to a number of chunks, rounding up. Look into why this makes sense, especially why CHUNK_SHIFT was chosen. The func is not policy, but insisting on a particular chunk size and quantity might be.
4. Rajitb 28 Feb 2025
  
  in Public
  
  enum z3fold_page_flags { PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, NEEDS_COMPACTING, PAGE_STALE, PAGE_CLAIMED, /* by either reclaim or free */ PAGE_MIGRATED, /* page is migrated and soon to be released */ };
  
  Interesting enums to note. A question is if these affect the inner policies of z3fold?
5. Rajitb 28 Feb 2025
  
  in Public
  
  migrate_enable(); if (!zhdr) { int cpu; /* look for _exact_ match on other cpus' lists */ for_each_online_cpu(cpu) { struct list_head *l; unbuddied = per_cpu_ptr(pool->unbuddied, cpu); spin_lock(&pool->lock); l = &unbuddied[chunks]; zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy); if (!zhdr || !z3fold_page_trylock(zhdr)) { spin_unlock(&pool->lock); zhdr = NULL; continue; } list_del_init(&zhdr->buddy); zhdr->cpu = -1; spin_unlock(&pool->lock); page = virt_to_page(zhdr); if (test_bit(NEEDS_COMPACTING, &page->private) || test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; if (can_sleep) cond_resched(); continue; } kref_get(&zhdr->refcount); break; } } if (zhdr && !zhdr->slots) { zhdr->slots = alloc_slots(pool, GFP_ATOMIC); if (!zhdr->slots) goto out_fail; } return zhdr;
  
  Now we check only for unbuddied pages of size chunks on other cpus. We keep retrying and only fail if the slots/buddies could not be allocd
6. Rajitb 28 Feb 2025
  
  in Public
  
  unbuddied = this_cpu_ptr(pool->unbuddied); for_each_unbuddied_list(i, chunks) { struct list_head *l = &unbuddied[i]; zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy);
  
  We iterate through the list of unbuddied z3fold headers starting from chunks to NCHUNKS
7. Rajitb 28 Feb 2025
  
  in Public
  
  migrate_disable();
  
  Pseudo or minor policy at best. z3fold alloc checks if we can find the z3fold header and by extension the unbuddied page of size min(0,chunks + i), i = 0 to NCHUNKS.
  
  What's interesting yet simple is that you search for the unbuddied page on your own CPU before searching for it on the others.
8. Rajitb 24 Feb 2025
  
  in Public
  
  static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) { struct z3fold_header *zhdr; struct page *page; void *addr; enum buddy buddy; zhdr = get_z3fold_header(handle); addr = zhdr; page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) goto out; buddy = handle_to_buddy(handle); switch (buddy) { case FIRST: addr += ZHDR_SIZE_ALIGNED; break; case MIDDLE: addr += zhdr->start_middle << CHUNK_SHIFT; set_bit(MIDDLE_CHUNK_MAPPED, &page->private); break; case LAST: addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT); break; default: pr_err("unknown buddy id %d\n", buddy); WARN_ON(1); addr = NULL; break; } if (addr) zhdr->mapped_count++; out: put_z3fold_header(zhdr); return addr; }
  
  You get the header from get_z3fold_header() and extract the buddy number from the handle via handle_to_buddy(). That likely uses the buddy shift/mask macros at the top of the file. But this is not a policy, so don't bother.
9. Rajitb 21 Feb 2025
  
  in Public
  
  { struct z3fold_pool *pool = zhdr_to_pool(zhdr); void *p = zhdr; unsigned long old_handle = 0; size_t sz = 0; struct z3fold_header *new_zhdr = NULL; int first_idx = __idx(zhdr, FIRST); int middle_idx = __idx(zhdr, MIDDLE); int last_idx = __idx(zhdr, LAST); unsigned short *moved_chunks = NULL; /* * No need to protect slots here -- all the slots are "local" and * the page lock is already taken */ if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) { p += ZHDR_SIZE_ALIGNED; sz = zhdr->first_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[first_idx]; moved_chunks = &zhdr->first_chunks; } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) { p += zhdr->start_middle << CHUNK_SHIFT; sz = zhdr->middle_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[middle_idx]; moved_chunks = &zhdr->middle_chunks; } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) { p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); sz = zhdr->last_chunks << CHUNK_SHIFT; old_handle = (unsigned long)&zhdr->slots->slot[last_idx]; moved_chunks = &zhdr->last_chunks; } if (sz > 0) { enum buddy new_bud = HEADLESS; short chunks = size_to_chunks(sz); void *q; new_zhdr = __z3fold_alloc(pool, sz, false); if (!new_zhdr) return NULL; if (WARN_ON(new_zhdr == zhdr)) goto out_fail; new_bud = get_free_buddy(new_zhdr, chunks); q = new_zhdr; switch (new_bud) { case FIRST: new_zhdr->first_chunks = chunks; q += ZHDR_SIZE_ALIGNED; break; case MIDDLE: new_zhdr->middle_chunks = chunks; new_zhdr->start_middle = new_zhdr->first_chunks + ZHDR_CHUNKS; q += new_zhdr->start_middle << CHUNK_SHIFT; break; case LAST: new_zhdr->last_chunks = chunks; q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT); break; default: goto out_fail; } new_zhdr->foreign_handles++; memcpy(q, p, sz); write_lock(&zhdr->slots->lock); *(unsigned long *)old_handle = (unsigned long)new_zhdr + __idx(new_zhdr, new_bud); if (new_bud == LAST) *(unsigned long *)old_handle |= 
(new_zhdr->last_chunks << BUDDY_SHIFT); write_unlock(&zhdr->slots->lock); add_to_unbuddied(pool, new_zhdr); z3fold_page_unlock(new_zhdr); *moved_chunks = 0; } return new_zhdr; out_fail: if (new_zhdr && !put_z3fold_locked(new_zhdr)) { add_to_unbuddied(pool, new_zhdr); z3fold_page_unlock(new_zhdr); } return NULL; }
  
  Policy that finds a buddy to compact
10. Rajitb 20 Feb 2025
  
  in Public
  
  #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) #define CHUNK_SIZE (1 << CHUNK_SHIFT) #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) #define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) #define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) #define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS) #define BUDDY_MASK (0x3) #define BUDDY_SHIFT 2 #define SLOTS_ALIGN (0x40)
  
  Policy-controlling macros that determine how chunks are allocated, moved, removed, combined, etc. In order: CHUNK_SHIFT: log2 of the chunk size (PAGE_SHIFT - NCHUNKS_ORDER), i.e. the shift that converts between bytes and chunks; CHUNK_SIZE: size of one chunk = 64 bytes (for 4KB pages); ZHDR_SIZE_ALIGNED: header size rounded up to a multiple of CHUNK_SIZE; ZHDR_CHUNKS: number of chunks the header occupies; TOTAL_CHUNKS: total number of chunks in a 4KB page; NCHUNKS: number of chunks excluding the header; BUDDY_MASK (0x3): 2 bits for the first/middle/last buddy; BUDDY_SHIFT 2: bits 3:2 confirm which buddies are present (TODO verify); SLOTS_ALIGN (0x40): decimal 64.
11. Rajitb 20 Feb 2025
  
  in Public
  
  #define NCHUNKS_ORDER 6
  
  used for bit shift ops. 2^6 = 64 bytes or chunks depending upon your problem
  
  #ldos-policy-use
12. Rajitb 14 Feb 2025
  
  in Public
  
  if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= BIG_CHUNK_GAP) { mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; return 1;
  
  threshold for moving middle chunk to right after first chunk
13. Rajitb 14 Feb 2025
  
  in Public
  
  new_zhdr = __z3fold_alloc(pool, sz, false);
  
  GPT: allocates to same pool to avoid locality
14. Rajitb 13 Feb 2025
  
  in Public
  
  static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks) { enum buddy bud = HEADLESS; if (zhdr->middle_chunks) { if (!zhdr->first_chunks && chunks <= zhdr->start_middle - ZHDR_CHUNKS) bud = FIRST; else if (!zhdr->last_chunks) bud = LAST; } else { if (!zhdr->first_chunks) bud = FIRST; else if (!zhdr->last_chunks) bud = LAST; else bud = MIDDLE; } return bud; } static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { void *beg = zhdr; return memmove(beg + (dst_chunk << CHUNK_SHIFT), beg + (zhdr->start_middle << CHUNK_SHIFT), zhdr->middle_chunks << CHUNK_SHIFT); } static inline bool buddy_single(struct z3fold_header *zhdr) { return !((zhdr->first_chunks && zhdr->middle_chunks) || (zhdr->first_chunks && zhdr->last_chunks) || (zhdr->middle_chunks && zhdr->last_chunks)); }
  
  POLICY most likely x3
15. Rajitb 13 Feb 2025
  
  in Public
  
  if (!zhdr->foreign_handles && buddy_single(zhdr) && zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { if (!put_z3fold_locked(zhdr)) { clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } return; }
  
  stands out as potential policy
16. Rajitb 13 Feb 2025
  
  in Public
  
  #define BIG_CHUNK_GAP 3 /* Has to be called with lock held */ static int z3fold_compact_page(struct z3fold_header *zhdr) { struct page *page = virt_to_page(zhdr); if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) return 0; /* can't move middle chunk, it's used */ if (unlikely(PageIsolated(page))) return 0; if (zhdr->middle_chunks == 0) return 0; /* nothing to compact */ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { /* move to the beginning */ mchunk_memmove(zhdr, ZHDR_CHUNKS); zhdr->first_chunks = zhdr->middle_chunks; zhdr->middle_chunks = 0; zhdr->start_middle = 0; zhdr->first_num++; return 1; } /* * moving data is expensive, so let's only do that if * there's substantial gain (at least BIG_CHUNK_GAP chunks) */ if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= BIG_CHUNK_GAP) { mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; return 1; } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 && TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle + zhdr->middle_chunks) >= BIG_CHUNK_GAP) { unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks - zhdr->middle_chunks; mchunk_memmove(zhdr, new_start); zhdr->start_middle = new_start; return 1; } return 0; }
  
  def policy in here. IMP
17. Rajitb 13 Feb 2025
  
  in Public
  
  if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE) return -ENOSPC; if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { retry: zhdr = __z3fold_alloc(pool, size, can_sleep); if (zhdr) { bud = get_free_buddy(zhdr, chunks); if (bud == HEADLESS) { if (!put_z3fold_locked(zhdr)) z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); goto retry; } page = virt_to_page(zhdr); goto found; } bud = FIRST; }
  
  Leaning towards it being regular checks and not policy.
18. Rajitb 13 Feb 2025
  
  in Public
  
  static const struct movable_operations z3fold_mops = { .isolate_page = z3fold_page_isolate, .migrate_page = z3fold_page_migrate, .putback_page = z3fold_page_putback, };
  
  Defines type of movable ops on pages. Most likely policy influencing struct
Visit annotations in context

Tags

#ldos-policy-use

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/z3fold.c
elixir.bootlin.com elixir.bootlin.com

page_reporting.c - mm/page_reporting.c - Linux source code v6.6.42 - Bootlin Elixir Cross Referencer

5
1. Rajitb 07 Feb 2025
  
  in Public
  
  #define PAGE_REPORTING_DELAY (2 * HZ)
  
  Min delay that allows for more unreported pages to accumulate
2. Rajitb 07 Feb 2025
  
  in Public
  
  int page_reporting_register
  
  Registers page reporting device and checks in reporting_req to see if we have started requesting or need to wait for queue to build up
3. Rajitb 07 Feb 2025
  
  in Public
  
  /* allocate scatterlist to store pages being reported on */ sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
  
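  The scatterlist is allocated here: an array with PAGE_REPORTING_CAPACITY entries, each of which will describe one physically contiguous block of pages; the blocks themselves need not be contiguous with one another.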
4. Rajitb 07 Feb 2025
  
  in Public
  
  unsigned int page_reporting_order = -1;
  
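  Kernel parameter that sets the minimum order (size) of free page blocks that get reported. -1 is the default and means "not set/invalid"; valid values run from 0 to MAX_ORDER and are the log base 2 of the number of contiguous pages in a block.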
  
  #ldos-value
5. Rajitb 03 Feb 2025
  
  in Public
  
  void page_reporting_unregister(struct page_reporting_dev_info *prdev)
  
  Same importance as page_reporting_register (its counterpart).
Visit annotations in context

Tags

#ldos-value

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/page_reporting.c
Jan 2025
elixir.bootlin.com elixir.bootlin.com

zswap.c - mm/zswap.c - Linux source code v6.6.42 - Bootlin

11
1. Rajitb 30 Jan 2025
  
  in Public
  
  if (ret) { zswap_reject_reclaim_fail++; if (ret != -EAGAIN) break; if (++failures == MAX_RECLAIM_RETRIES) break; }
  
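  The reclaim loop stops immediately on any error other than -EAGAIN, and gives up after MAX_RECLAIM_RETRIES (16) failures that return -EAGAIN. The same retry limit is used by the page allocator before it activates the OOM killer; which process the OOM killer then kills is decided by its oom_score, which is determined by the kernel and adjustable per process through oom_score_adj in procfs.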
2. Rajitb 30 Jan 2025
  
  in Public
  
  zswap_pool_put(pool);
  
  not a policy, simply reduces reference count to the pool
3. Rajitb 30 Jan 2025
  
  in Public
  
  static int zswap_reclaim_entry(struct zswap_pool *pool) {
  
  Tries to writeback an entry to swap cache and removes it from zswap. We utilize this function when zpool and zswap are full, hence 'reclaim'.
4. Rajitb 30 Jan 2025
  
  in Public
  
  static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
  
  Not a policy but something to note. An array of red-black zswap_trees that contain zswap entries. Each tree is indexed by the swap device/file no.
5. Rajitb 26 Jan 2025
  
  in Public
  
  static unsigned int zswap_max_pool_percent = 20;
  
  Max percentage of total system RAM that can be used by the pool. Also a user controlled policy recognized by the Linux kernel documentation
  
  #ldos-high-confidence #ldos-value
6. Rajitb 26 Jan 2025
  
  in Public
  
  static bool zswap_same_filled_pages_enabled = true;
  
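  Interesting optimization. If a page is filled with one repeated value (all zeros being the common case) and this bool is true, zswap skips compression entirely and stores only that repeated value, with a compressed length of zero, instead of the page contents.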
7. Rajitb 26 Jan 2025
  
  in Public
  
  static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
  
  I'm just adding on here, but usually you choose between zswap and zram. The latter acts as a block device that stores compressed pages and thus finds use in embedded Linux systems, while the former requires an existing swap device to act as a fallback when entries are evicted. This means the likely policy here is choosing an in-RAM compressed store based on the system (embedded vs. server/datacenter).
8. Rajitb 25 Jan 2025
  
  in Public
  
  cond_resched();
  
  A voluntary reschedule point: if the scheduler has another task waiting to run, this loop gives up the CPU here, so a long reclaim loop does not hog the processor. Not a policy in itself, but the willingness to yield hints at how the kernel prioritizes zswap reclaim work.
9. Rajitb 25 Jan 2025
  
  in Public
  
  /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; zswap_pool_reached_full = true; goto shrink; } if (zswap_pool_reached_full) { if (!zswap_can_accept()) goto shrink; else zswap_pool_reached_full = false; }
  
  Checks if zswap device is full and if any zpools are available to store page. The check is based on what percentage of pool and RAM the user has allocated to zswap. The true policy arises in the question of WHY the user has set the percentage to those values. The shrink function is enabled in the case that there is no space in zswap.
10. Rajitb 25 Jan 2025
  
  in Public
  
  static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
  
  Names the compression algorithm to use, as a string, with tradeoffs between speed and compression ratio. It can be set at boot time on the kernel command line (zswap.compressor=...) or changed at runtime through sysfs (/sys/module/zswap/parameters/compressor).
11. Rajitb 25 Jan 2025
  
  in Public
  
  static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
  
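  Hysteresis threshold: once the pool has hit its maximum size (max_pool_percent), zswap refuses new pages and keeps writing entries back to the swap device until the pool shrinks below this percentage (default 90%) of the maximum pool size; only then does it start accepting pages again.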
Visit annotations in context

Tags

#ldos-value

#ldos-high-confidence

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/zswap.c
Nov 2024
elixir.bootlin.com elixir.bootlin.com

gup.c - mm/gup.c - Linux source code v6.6.42 - Bootlin

61
1. Rajitb 11 Nov 2024
  
  in Public
  
  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; do { struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); return addr == end; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, 
struct page **pages, int *nr) { BUILD_BUG(); return 0; } #endif
  
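  Seems to be the path for grabbing device-mapped (devmap) huge pages: for each page in the range it checks that a dev_pagemap exists, that PCI P2PDMA pages are only taken when FOLL_PCI_P2PDMA is set, and that the page reference can be grabbed; if any check fails, the pages taken so far are released again. The #else variants are build-time stubs for configurations without devmap/THP support.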
  
  #ldos-ifdef-code #ldos-function #ldos-high-confidence
2. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_ARCH_HAS_HUGEPD
  /*
   * Return where the walk over the hugepte containing @addr has to stop: the
   * next @sz-aligned boundary above @addr, or @end if that comes first.
   * The visible caller passes @sz as 1UL << hugepd_shift(), i.e. a power of two.
   */
  static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                        unsigned long sz)
  {
      unsigned long __boundary = (addr + sz) & ~(sz-1);

      /*
       * Both sides are compared as "value - 1" so that a boundary which
       * wrapped around to 0 compares as the largest value and @end wins.
       */
      return (__boundary - 1 < end - 1) ? __boundary : end;
  }

  /*
   * Try to take references on the pages mapped by one hugepte for the part of
   * [addr, end) that lies inside that hugepte, appending them at pages[*nr].
   *
   * Returns 1 on success (and advances *nr by the number of references taken)
   * or 0 on failure.  Every failure after try_grab_folio() succeeded first
   * drops the references again with gup_put_folio().
   */
  static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                         unsigned long end, unsigned int flags,
                         struct page **pages, int *nr)
  {
      unsigned long pte_end;
      struct page *page;
      struct folio *folio;
      pte_t pte;
      int refs;

      /* Never look past the end of the hugepte that contains @addr. */
      pte_end = (addr + sz) & ~(sz-1);
      if (pte_end < end)
          end = pte_end;

      pte = huge_ptep_get(ptep);

      if (!pte_access_permitted(pte, flags & FOLL_WRITE))
          return 0;

      /* hugepages are never "special" */
      VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

      /* First subpage to return: offset of @addr inside the huge page. */
      page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
      refs = record_subpages(page, addr, end, pages + *nr);

      folio = try_grab_folio(page, refs, flags);
      if (!folio)
          return 0;

      /* Re-read the entry: bail out if it changed while we grabbed the folio. */
      if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      if (!folio_fast_pin_allowed(folio, flags)) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      *nr += refs;
      folio_set_referenced(folio);
      return 1;
  }

  /*
   * Walk the hugeptes of a hugepd directory covering [addr, end) and hand each
   * one to gup_hugepte().  Returns 1 if every step succeeded, 0 as soon as one
   * fails.  gup_hugepte() is passed @end and clamps to the current hugepte
   * itself; @next only drives the loop.
   */
  static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
                         unsigned int pdshift, unsigned long end, unsigned int flags,
                         struct page **pages, int *nr)
  {
      pte_t *ptep;
      unsigned long sz = 1UL << hugepd_shift(hugepd);
      unsigned long next;

      ptep = hugepte_offset(hugepd, addr, pdshift);
      do {
          next = hugepte_addr_end(addr, end, sz);
          if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
              return 0;
      } while (ptep++, addr = next, addr != end);

      return 1;
  }
  #else
  /* Without CONFIG_ARCH_HAS_HUGEPD this always reports failure (0). */
  static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
                                unsigned int pdshift, unsigned long end, unsigned int flags,
                                struct page **pages, int *nr)
  {
      return 0;
  }
  #endif /* CONFIG_ARCH_HAS_HUGEPD */

  /*
   * Take references on the pages of a huge PMD entry for [addr, end).
   *
   * @orig is the PMD value the caller read earlier; @pmdp is re-read after the
   * folio was grabbed to detect a concurrent change.  Device-mapped entries
   * are refused for FOLL_LONGTERM and otherwise delegated to
   * __gup_device_huge_pmd().  Returns 1 on success (advancing *nr), 0 on
   * failure.
   */
  static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                          unsigned long end, unsigned int flags,
                          struct page **pages, int *nr)
  {
      struct page *page;
      struct folio *folio;
      int refs;

      if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
          return 0;

      if (pmd_devmap(orig)) {
          if (unlikely(flags & FOLL_LONGTERM))
              return 0;
          return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
                                       pages, nr);
      }

      page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
      refs = record_subpages(page, addr, end, pages + *nr);

      folio = try_grab_folio(page, refs, flags);
      if (!folio)
          return 0;

      /* The entry changed under us: drop the references and fail. */
      if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      if (!folio_fast_pin_allowed(folio, flags)) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      *nr += refs;
      folio_set_referenced(folio);
      return 1;
  }

  /*
   * PUD-level counterpart of gup_huge_pmd(): same checks and same return
   * convention (1 on success with *nr advanced, 0 on failure), using the
   * pud_*() accessors and __gup_device_huge_pud() for device mappings.
   */
  static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                          unsigned long end, unsigned int flags,
                          struct page **pages, int *nr)
  {
      struct page *page;
      struct folio *folio;
      int refs;

      if (!pud_access_permitted(orig, flags & FOLL_WRITE))
          return 0;

      if (pud_devmap(orig)) {
          if (unlikely(flags & FOLL_LONGTERM))
              return 0;
          return __gup_device_huge_pud(orig, pudp, addr, end, flags,
                                       pages, nr);
      }

      page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
      refs = record_subpages(page, addr, end, pages + *nr);

      folio = try_grab_folio(page, refs, flags);
      if (!folio)
          return 0;

      /* The entry changed under us: drop the references and fail. */
      if (unlikely(pud_val(orig) != pud_val(*pudp))) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      if (!folio_fast_pin_allowed(folio, flags)) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      *nr += refs;
      folio_set_referenced(folio);
      return 1;
  }

  /*
   * PGD-level counterpart of gup_huge_pmd().  There is no device-mapping
   * branch here: BUILD_BUG_ON() asserts at build time that pgd_devmap() is
   * false.  Note that, unlike the PMD/PUD variants, the unshare check runs
   * before folio_fast_pin_allowed().  Returns 1 on success (advancing *nr),
   * 0 on failure.
   */
  static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                          unsigned long end, unsigned int flags,
                          struct page **pages, int *nr)
  {
      int refs;
      struct page *page;
      struct folio *folio;

      if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
          return 0;

      BUILD_BUG_ON(pgd_devmap(orig));

      page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
      refs = record_subpages(page, addr, end, pages + *nr);

      folio = try_grab_folio(page, refs, flags);
      if (!folio)
          return 0;

      /* The entry changed under us: drop the references and fail. */
      if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      if (!folio_fast_pin_allowed(folio, flags)) {
          gup_put_folio(folio, refs, flags);
          return 0;
      }

      *nr += refs;
      folio_set_referenced(folio);
      return 1;
  }

  /*
   * Walk the PMD entries under @pudp that cover [addr, end).  Each entry is
   * read once without locks and dispatched to the huge-PMD, hugepd or PTE
   * level handler.  Returns 1 if the whole range was handled, 0 at the first
   * entry that is not present or whose handler fails.
   */
  static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
                           unsigned int flags, struct page **pages, int *nr)
  {
      unsigned long next;
      pmd_t *pmdp;

      pmdp = pmd_offset_lockless(pudp, pud, addr);
      do {
          pmd_t pmd = pmdp_get_lockless(pmdp);

          next = pmd_addr_end(addr, end);
          if (!pmd_present(pmd))
              return 0;

          if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
                       pmd_devmap(pmd))) {
              /* See gup_pte_range() */
              if (pmd_protnone(pmd))
                  return 0;

              if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
                                pages, nr))
                  return 0;

          } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
              /*
               * Some architectures use a different format for a
               * hugetlbfs pmd than for a THP pmd.
               */
              if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
                               PMD_SHIFT, next, flags, pages, nr))
                  return 0;
          } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
              return 0;
      } while (pmdp++, addr = next, addr != end);

      return 1;
  }

  /*
   * Walk the PUD entries under @p4dp that cover [addr, end), dispatching each
   * to the huge-PUD, hugepd or PMD-range handler.  Returns 1 if the whole
   * range was handled, 0 at the first non-present entry or failing handler.
   */
  static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
                           unsigned int flags, struct page **pages, int *nr)
  {
      unsigned long next;
      pud_t *pudp;

      pudp = pud_offset_lockless(p4dp, p4d, addr);
      do {
          pud_t pud = READ_ONCE(*pudp);

          next = pud_addr_end(addr, end);
          if (unlikely(!pud_present(pud)))
              return 0;
          if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
              if (!gup_huge_pud(pud, pudp, addr, next, flags,
                                pages, nr))
                  return 0;
          } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
              if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
                               PUD_SHIFT, next, flags, pages, nr))
                  return 0;
          } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
              return 0;
      } while (pudp++, addr = next, addr != end);

      return 1;
  }

  /*
   * Walk the P4D entries under @pgdp that cover [addr, end).  Huge P4D
   * entries are ruled out at build time; an entry is either a hugepd or is
   * descended into with gup_pud_range().  Returns 1 if the whole range was
   * handled, 0 at the first empty entry or failing handler.
   */
  static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
                           unsigned int flags, struct page **pages, int *nr)
  {
      unsigned long next;
      p4d_t *p4dp;

      p4dp = p4d_offset_lockless(pgdp, pgd, addr);
      do {
          p4d_t p4d = READ_ONCE(*p4dp);

          next = p4d_addr_end(addr, end);
          if (p4d_none(p4d))
              return 0;
          BUILD_BUG_ON(p4d_huge(p4d));
          if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
              if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
                               P4D_SHIFT, next, flags, pages, nr))
                  return 0;
          } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
              return 0;
      } while (p4dp++, addr = next, addr != end);

      return 1;
  }

  /*
   * Top of the lockless walk: iterate over the PGD entries of current->mm
   * that cover [addr, end).  There is no return value; the walk simply stops
   * at the first empty entry or failing handler, and the caller learns how
   * far it got from *nr as left by the lower levels.
   */
  static void gup_pgd_range(unsigned long addr, unsigned long end,
                            unsigned int flags, struct page **pages, int *nr)
  {
      unsigned long next;
      pgd_t *pgdp;

      pgdp = pgd_offset(current->mm, addr);
      do {
          pgd_t pgd = READ_ONCE(*pgdp);

          next = pgd_addr_end(addr, end);
          if (pgd_none(pgd))
              return;
          if (unlikely(pgd_huge(pgd))) {
              if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
                                pages, nr))
                  return;
          } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
              if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
                               PGDIR_SHIFT, next, flags, pages, nr))
                  return;
          } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
              return;
      } while (pgdp++, addr = next, addr != end);
  }
  #else
  /*
   * Fallback for the other branch of a conditional that is opened outside
   * this excerpt: the walk does nothing, so *nr is left untouched.
   */
  static inline void gup_pgd_range(unsigned long addr, unsigned long end,
                                   unsigned int flags, struct page **pages, int *nr)
  {
  }
  
  Policy-use functions for the gup huge-PTE policy code that appears earlier in the file (not directly above; you will probably need to scroll to find it).
  
  #ldos-function #ldos-flag #ldos-low-confidence #ldos-ifdef-code #ldos-policy-use
3. Rajitb 11 Nov 2024
  
  in Public
  
  /*
   * Common implementation behind get_user_pages_fast(),
   * get_user_pages_fast_only() and pin_user_pages_fast().
   *
   * The range is first attempted with lockless_pages_from_mm().  Unless
   * FOLL_FAST_ONLY is set, whatever could not be obtained that way is then
   * requested from __gup_longterm_locked().
   *
   * Returns the number of pages obtained, or a negative errno when nothing
   * was obtained:
   *   -EINVAL     @gup_flags contains a flag outside the accepted set
   *   -EOVERFLOW  start + (nr_pages << PAGE_SHIFT) overflows
   *   -EFAULT     the range ends above TASK_SIZE_MAX or fails access_ok()
   * With FOLL_FAST_ONLY the lockless count is returned as is, which may be 0.
   */
  static int internal_get_user_pages_fast(unsigned long start,
                                          unsigned long nr_pages,
                                          unsigned int gup_flags,
                                          struct page **pages)
  {
      unsigned long len, end;
      unsigned long nr_pinned;
      int locked = 0;
      int ret;

      if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
                                     FOLL_FORCE | FOLL_PIN | FOLL_GET |
                                     FOLL_FAST_ONLY | FOLL_NOFAULT |
                                     FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
          return -EINVAL;

      if (gup_flags & FOLL_PIN)
          mm_set_has_pinned_flag(&current->mm->flags);

      /* Only the slow-path fallback may take mmap_lock. */
      if (!(gup_flags & FOLL_FAST_ONLY))
          might_lock_read(&current->mm->mmap_lock);

      /* Strip the address tag and round down to a page boundary. */
      start = untagged_addr(start) & PAGE_MASK;
      len = nr_pages << PAGE_SHIFT;
      if (check_add_overflow(start, len, &end))
          return -EOVERFLOW;
      if (end > TASK_SIZE_MAX)
          return -EFAULT;
      if (unlikely(!access_ok((void __user *)start, len)))
          return -EFAULT;

      nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
      if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
          return nr_pinned;

      /* Slow path: try to get the remaining pages with get_user_pages */
      start += nr_pinned << PAGE_SHIFT;
      pages += nr_pinned;
      ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
                                  pages, &locked,
                                  gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
      if (ret < 0) {
          /*
           * The caller has to unpin the pages we already pinned so
           * returning -errno is not an option
           */
          if (nr_pinned)
              return nr_pinned;
          return ret;
      }
      return ret + nr_pinned;
  }

  /**
   * get_user_pages_fast_only() - pin user pages in memory
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying pin behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long.
   *
   * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
   * the regular GUP.
   *
   * If the architecture does not support this function, simply return with no
   * pages pinned.
   *
   * Careful, careful! COW breaking can go either way, so a non-write
   * access can get ambiguous page results. If you call this function without
   * 'write' set, you'd better be sure that you're ok with that ambiguity.
   *
   * Return: -EINVAL if is_valid_gup_args() rejects the arguments, otherwise
   * the result of internal_get_user_pages_fast().
   */
  int get_user_pages_fast_only(unsigned long start, int nr_pages,
                               unsigned int gup_flags, struct page **pages)
  {
      /*
       * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
       * because gup fast is always a "pin with a +1 page refcount" request.
       *
       * FOLL_FAST_ONLY is required in order to match the API description of
       * this routine: no fall back to regular ("slow") GUP.
       */
      if (!is_valid_gup_args(pages, NULL, &gup_flags,
                             FOLL_GET | FOLL_FAST_ONLY))
          return -EINVAL;

      return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
  }
  EXPORT_SYMBOL_GPL(get_user_pages_fast_only);

  /**
   * get_user_pages_fast() - pin user pages in memory
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying pin behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long.
   *
   * Attempt to pin user pages in memory without taking mm->mmap_lock.
   * If not successful, it will fall back to taking the lock and
   * calling get_user_pages().
   *
   * Returns number of pages pinned. This may be fewer than the number requested.
   * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
   * -errno.
   */
  int get_user_pages_fast(unsigned long start, int nr_pages,
                          unsigned int gup_flags, struct page **pages)
  {
      /*
       * The caller may or may not have explicitly set FOLL_GET; either way is
       * OK. However, internally (within mm/gup.c), gup fast variants must set
       * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
       * request.
       */
      if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
          return -EINVAL;
      return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
  }
  EXPORT_SYMBOL_GPL(get_user_pages_fast);

  /**
   * pin_user_pages_fast() - pin user pages in memory without taking locks
   *
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying pin behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long.
   *
   * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
   * get_user_pages_fast() for documentation on the function arguments, because
   * the arguments here are identical.
   *
   * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
   * see Documentation/core-api/pin_user_pages.rst for further details.
   *
   * Note that if a zero_page is amongst the returned pages, it will not have
   * pins in it and unpin_user_page() will not remove pins from it.
   */
  int pin_user_pages_fast(unsigned long start, int nr_pages,
                          unsigned int gup_flags, struct page **pages)
  {
      if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
          return -EINVAL;
      return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
  }
  EXPORT_SYMBOL_GPL(pin_user_pages_fast);

  /**
   * pin_user_pages_remote() - pin pages of a remote process
   *
   * @mm:         mm_struct of target mm
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying lookup behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long.
   * @locked:     pointer to lock flag indicating whether lock is held and
   *              subsequently whether VM_FAULT_RETRY functionality can be
   *              utilised. Lock must initially be held.
   *
   * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
   * get_user_pages_remote() for documentation on the function arguments, because
   * the arguments here are identical.
   *
   * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
   * see Documentation/core-api/pin_user_pages.rst for details.
   *
   * Note that if a zero_page is amongst the returned pages, it will not have
   * pins in it and unpin_user_page*() will not remove pins from it.
   *
   * Note that rejected arguments yield 0 here, not -EINVAL.
   */
  long pin_user_pages_remote(struct mm_struct *mm,
                             unsigned long start, unsigned long nr_pages,
                             unsigned int gup_flags, struct page **pages,
                             int *locked)
  {
      /* Stand-in lock flag ("held") for callers that pass @locked == NULL. */
      int local_locked = 1;

      if (!is_valid_gup_args(pages, locked, &gup_flags,
                             FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
          return 0;
      return __gup_longterm_locked(mm, start, nr_pages, pages,
                                   locked ? locked : &local_locked,
                                   gup_flags);
  }
  EXPORT_SYMBOL(pin_user_pages_remote);

  /**
   * pin_user_pages() - pin user pages in memory for use by other devices
   *
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying lookup behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long.
   *
   * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
   * FOLL_PIN is set.
   *
   * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
   * see Documentation/core-api/pin_user_pages.rst for details.
   *
   * Note that if a zero_page is amongst the returned pages, it will not have
   * pins in it and unpin_user_page*() will not remove pins from it.
   *
   * Note that rejected arguments yield 0 here, not -EINVAL.
   */
  long pin_user_pages(unsigned long start, unsigned long nr_pages,
                      unsigned int gup_flags, struct page **pages)
  {
      int locked = 1;

      if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
          return 0;
      return __gup_longterm_locked(current->mm, start, nr_pages,
                                   pages, &locked, gup_flags);
  }
  EXPORT_SYMBOL(pin_user_pages);

  /*
   * pin_user_pages_unlocked() is the FOLL_PIN variant of
   * get_user_pages_unlocked(). Behavior is the same, except that this one sets
   * FOLL_PIN and rejects FOLL_GET.
   *
   * Note that if a zero_page is amongst the returned pages, it will not have
   * pins in it and unpin_user_page*() will not remove pins from it.
   *
   * The lock flag starts at 0 and FOLL_UNLOCKABLE is requested, so locking is
   * left to __gup_longterm_locked().  Rejected arguments yield 0.
   */
  long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                               struct page **pages, unsigned int gup_flags)
  {
      int locked = 0;

      if (!is_valid_gup_args(pages, NULL, &gup_flags,
                             FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
          return 0;

      return __gup_longterm_locked(current->mm, start, nr_pages, pages,
                                   &locked, gup_flags);
  }
  
  fast gup functions
  
  #ldos-function #ldos-low-confidence #ldos-ifdef-code #ldos-flag #ldos-policy-use
4. Rajitb 11 Nov 2024
  
  in Public
  
  /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. */ if (WARN_ON(IS_ERR_VALUE(npages))) return; sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } }
  
  gup unpin function, not actual logic
  
  #ldos-function #ldos-low-confidence #ldos-flag #ldos-policy-use
5. Rajitb 11 Nov 2024
  
  in Public
  
  /*
   * Release FOLL_PIN references on @npages pages starting at @page, one folio
   * batch at a time as reported by gup_folio_range_next().
   *
   * When @make_dirty is true, every folio that is not already dirty is marked
   * dirty under the folio lock before its references are dropped.
   */
  void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                        bool make_dirty)
  {
      unsigned long done = 0;

      while (done < npages) {
          unsigned int batch;
          struct folio *cur;

          cur = gup_folio_range_next(page, npages, done, &batch);

          if (make_dirty) {
              if (!folio_test_dirty(cur)) {
                  folio_lock(cur);
                  folio_mark_dirty(cur);
                  folio_unlock(cur);
              }
          }

          gup_put_folio(cur, batch, FOLL_PIN);
          done += batch;
      }
  }
  
  unpin logic but for dirty pages
  
  #ldos-function #ldos-low-confidence #ldos-policy-use
6. Rajitb 11 Nov 2024
  
  in Public
  
  if ((flags & FOLL_DUMP) && (vma_is_anonymous(vma) || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return NULL;
  
  explained in comments
  
  #ldos-low-confidence #ldos-flag #ldos-function #ldos-ifdef-code
7. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * Fast-gup relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, fast-gup needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with fast-gup, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because fast-gup may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); if (!ptep) return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; struct folio *folio; /* * Always fallback to ordinary GUP on PROT_NONE-mapped pages: * pte_access_permitted() better should reject these pages * either way: otherwise, GUP-fast might succeed in * cases where ordinary GUP would fail due to VMA access * permissions. 
*/ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { if (unlikely(flags & FOLL_LONGTERM)) goto pte_unmap; pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio(page, 1, flags); if (!folio) goto pte_unmap; if (unlikely(folio_is_secretmem(folio))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: if (pgmap) put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. 
*/ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
  
  non concurrent fast gup approach that checks for pinned page and unmaps pte or clears it
  
  #ldos-ifdef-code #ldos-high-confidence #ldos-function #ldos-algorithm #ldos-flag #ldos-formula
8. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_HAVE_FAST_GUP

  /*
   * Decide, on the GUP-fast path, whether a pin may be taken on @folio.
   *
   * The caller must already hold a reference on the folio, the lowest page
   * table level must still point to it, and interrupts must be disabled.
   *
   * Writing to pinned, file-backed, dirty-tracked folios is inherently
   * problematic (see the comment describing writable_file_mapping_allowed()),
   * so the most egregious case - a long-term pin taken for writing - is
   * avoided here.
   *
   * No VMA is available on the fast path, so this check cannot be as thorough
   * as the slow-path one.  Known-good cases are whitelisted; anything doubtful
   * returns false so that the caller falls back to the slow path.
   */
  static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
  {
      const unsigned int longterm_write_pin =
          FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE;
      struct address_space *as;
      unsigned long tag_bits;

      /*
       * Only the combination of all three flags is of concern: without a pin
       * no problematic write can occur, and the long-term pin is the case
       * being disallowed.
       */
      if ((flags & longterm_write_pin) != longterm_write_pin)
          return true;

      /* The folio is pinned, so its fields are safe to inspect. */

      if (WARN_ON_ONCE(folio_test_slab(folio)))
          return false;

      /* Dirty tracking is not needed for hugetlb mappings. */
      if (folio_test_hugetlb(folio))
          return true;

      /*
       * GUP-fast runs with IRQs disabled.  RCU grace periods cannot proceed
       * in that state, so nothing that is done under RCU can proceed either.
       * Inodes, and with them their mappings, are freed under RCU; the
       * mapping therefore cannot be freed beneath us and is safe to
       * dereference.
       */
      lockdep_assert_irqs_disabled();

      /*
       * The mapping may still be _altered_ concurrently, so read it exactly
       * once.
       */
      as = READ_ONCE(folio->mapping);

      /*
       * A NULL mapping (it may have been truncated) gives us nothing to
       * judge by: let the slow path work out how to proceed.
       */
      if (!as)
          return false;

      /* Tagged mapping pointer: fine if, and only if, the folio is anonymous. */
      tag_bits = (unsigned long)as & PAGE_MAPPING_FLAGS;
      if (tag_bits)
          return tag_bits & PAGE_MAPPING_ANON;

      /*
       * What remains is a non-NULL pointer to an address_space object, and
       * shmem is the only file system left on the whitelist.
       */
      return shmem_mapping(as);
  }
  
  Policy logic. Unlike get_user_pages_unlocked()/get_user_pages_locked(), this avoids taking locks, which seems risky, so it is not supposed to be used in concurrent GUP logic.
  
  #ldos-flag #ldos-function #ldos-high-confidence #ldos-ifdef-code
9. Rajitb 11 Nov 2024
  
  in Public
  
  /*
   * get_user_pages() - look up @nr_pages user pages of current->mm from @start.
   *
   * FOLL_TOUCH is handed to is_valid_gup_args() together with @gup_flags.
   * The lock flag passed down starts out as 1.
   *
   * Returns -EINVAL when is_valid_gup_args() rejects the arguments, otherwise
   * whatever __get_user_pages_locked() returns.
   */
  long get_user_pages(unsigned long start, unsigned long nr_pages,
                      unsigned int gup_flags, struct page **pages)
  {
      int mmap_locked = 1;
      long result;

      if (is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
          result = __get_user_pages_locked(current->mm, start, nr_pages,
                                           pages, &mmap_locked, gup_flags);
      else
          result = -EINVAL;

      return result;
  }
  
  policy logic.
  
  #ldos-high-confidence #ldos-flag
10. Rajitb 11 Nov 2024
  
  in Public
  
  /*
   * Roll back the entries recorded in @pages since position @nr_start.
   *
   * Entries are released from the end: *@nr is decremented until it equals
   * @nr_start.  Each page has its referenced flag cleared and is then released
   * with unpin_user_page() for FOLL_PIN requests or put_page() otherwise.
   */
  static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
                                              unsigned int flags,
                                              struct page **pages)
  {
      for (; (*nr) - nr_start; ) {
          struct page *victim;

          *nr -= 1;
          victim = pages[*nr];

          ClearPageReferenced(victim);
          if (!(flags & FOLL_PIN))
              put_page(victim);
          else
              unpin_user_page(victim);
      }
  }
  
  policy use function that undoes mapping
  
  #ldos-policy-use #ldos-function #ldos-low-confidence
11. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_MIGRATION
  /*
   * Count the folios behind @pages that are not long-term pinnable and put
   * those that can be isolated on @movable_page_list.
   *
   * Consecutive entries belonging to the same folio are handled once.  Device
   * coherent folios are counted but not isolated.  lru_add_drain_all() is
   * called at most once, the first time a folio is found off the LRU.
   *
   * Returns the number of collected folios; the value is always >= 0.
   */
  static unsigned long collect_longterm_unpinnable_pages(
                      struct list_head *movable_page_list,
                      unsigned long nr_pages,
                      struct page **pages)
  {
      unsigned long idx;
      unsigned long nr_collected = 0;
      struct folio *last_seen = NULL;
      bool may_drain = true;

      for (idx = 0; idx < nr_pages; idx++) {
          struct folio *cur = page_folio(pages[idx]);

          /* Same folio as the previous entry: already dealt with. */
          if (cur == last_seen)
              continue;
          last_seen = cur;

          if (folio_is_longterm_pinnable(cur))
              continue;

          nr_collected++;

          if (folio_is_device_coherent(cur))
              continue;

          if (folio_test_hugetlb(cur)) {
              isolate_hugetlb(cur, movable_page_list);
              continue;
          }

          if (!folio_test_lru(cur) && may_drain) {
              lru_add_drain_all();
              may_drain = false;
          }

          if (folio_isolate_lru(cur)) {
              list_add_tail(&cur->lru, movable_page_list);
              node_stat_mod_folio(cur,
                                  NR_ISOLATED_ANON + folio_is_file_lru(cur),
                                  folio_nr_pages(cur));
          }
      }

      return nr_collected;
  }
  
  #ldos-ifdef-code #ldos-function #ldos-low-confidence #ldos-flag
12. Rajitb 11 Nov 2024
  
  in Public
  
  #ifdef CONFIG_ELF_CORE
  /*
   * get_dump_page() - look up the single page of current->mm at @addr.
   *
   * Requests exactly one page with FOLL_FORCE | FOLL_DUMP | FOLL_GET and a
   * lock flag of 0.  Returns the page when exactly one page was obtained and
   * NULL for any other result.
   */
  struct page *get_dump_page(unsigned long addr)
  {
      struct page *dump_page;
      int mmap_locked = 0;
      int got;

      got = __get_user_pages_locked(current->mm, addr, 1, &dump_page,
                                    &mmap_locked,
                                    FOLL_FORCE | FOLL_DUMP | FOLL_GET);
      if (got != 1)
          return NULL;

      return dump_page;
  }
  #endif /* CONFIG_ELF_CORE */
  
  part of policy use code likely
  
  #ldos-ifdef-code #ldos-flag #ldos-function #ldos-low-confidence
13. Rajitb 11 Nov 2024
  
  in Public
  
  /*
   * __mm_populate() - fault in the pages of [start, start + len) in current->mm.
   *
   * Iterates over the VMAs intersecting the range and calls
   * populate_vma_page_range() on each piece.  VMAs flagged VM_IO or VM_PFNMAP
   * are skipped.  mmap_lock is taken for reading on first use and re-taken
   * whenever populate_vma_page_range() reports (through @locked) that it was
   * dropped.
   *
   * @ignore_errors: when non-zero, a failure on one VMA is discarded and the
   *                 walk continues with the next VMA.
   *
   * Returns 0, or the negative error code that stopped the walk.
   */
  int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
  {
      struct mm_struct *mm = current->mm;
      unsigned long end, nstart, nend;
      struct vm_area_struct *vma = NULL;
      int locked = 0;
      long ret = 0;

      end = start + len;

      for (nstart = start; nstart < end; nstart = nend) {
          /*
           * We want to fault in pages for [nstart; end) address range.
           * Find first corresponding VMA.
           */
          if (!locked) {
              locked = 1;
              mmap_read_lock(mm);
              vma = find_vma_intersection(mm, nstart, end);
          } else if (nstart >= vma->vm_end)
              vma = find_vma_intersection(mm, vma->vm_end, end);

          if (!vma)
              break;
          /*
           * Set [nstart; nend) to intersection of desired address
           * range with the first VMA. Also, skip undesirable VMA types.
           */
          nend = min(end, vma->vm_end);
          if (vma->vm_flags & (VM_IO | VM_PFNMAP))
              continue;
          if (nstart < vma->vm_start)
              nstart = vma->vm_start;
          /*
           * Now fault in a range of pages. populate_vma_page_range()
           * double checks the vma flags, so that it won't mlock pages
           * if the vma was already munlocked.
           */
          ret = populate_vma_page_range(vma, nstart, nend, &locked);
          if (ret < 0) {
              if (ignore_errors) {
                  ret = 0;
                  continue;	/* continue at next VMA */
              }
              break;
          }
          /* Resume right after the pages that were actually handled. */
          nend = nstart + ret * PAGE_SIZE;
          ret = 0;
      }
      if (locked)
          mmap_read_unlock(mm);
      return ret;	/* 0 or negative error code */
  }
  
  policy use function that populates pages like the func before this.
  
  #ldos-function #ldos-low-confidence #ldos-flag
14. Rajitb 11 Nov 2024
  
  in Public
  
  /*
   * populate_vma_page_range() - fault in the pages of [start, end) within @vma.
   *
   * @vma:    target VMA; [start, end) must lie inside it.
   * @start:  page-aligned start address.
   * @end:    page-aligned end address.
   * @locked: lock-state flag forwarded to __get_user_pages(); may be NULL, in
   *          which case a local flag initialised to 1 is used and
   *          FOLL_UNLOCKABLE is not set.
   *
   * mmap_lock must be held on entry (mmap_assert_locked()).
   *
   * Returns the result of __get_user_pages(), or the page count of the range
   * without touching anything when the VMA has VM_LOCKONFAULT set.
   */
  long populate_vma_page_range(struct vm_area_struct *vma,
                               unsigned long start, unsigned long end, int *locked)
  {
      struct mm_struct *mm = vma->vm_mm;
      unsigned long nr_pages = (end - start) / PAGE_SIZE;
      int local_locked = 1;
      int gup_flags;
      long ret;

      VM_BUG_ON(!PAGE_ALIGNED(start));
      VM_BUG_ON(!PAGE_ALIGNED(end));
      VM_BUG_ON_VMA(start < vma->vm_start, vma);
      VM_BUG_ON_VMA(end   > vma->vm_end, vma);
      mmap_assert_locked(mm);

      /*
       * Rightly or wrongly, the VM_LOCKONFAULT case has never used
       * faultin_page() to break COW, so it has no work to do here.
       */
      if (vma->vm_flags & VM_LOCKONFAULT)
          return nr_pages;

      gup_flags = FOLL_TOUCH;
      /*
       * We want to touch writable mappings with a write fault in order
       * to break COW, except for shared mappings because these don't COW
       * and we would not want to dirty them for nothing.
       */
      if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
          gup_flags |= FOLL_WRITE;

      /*
       * We want mlock to succeed for regions that have any permissions
       * other than PROT_NONE.
       */
      if (vma_is_accessible(vma))
          gup_flags |= FOLL_FORCE;

      if (locked)
          gup_flags |= FOLL_UNLOCKABLE;

      /*
       * We made sure addr is within a VMA, so the following will
       * not result in a stack expansion that recurses back here.
       */
      ret = __get_user_pages(mm, start, nr_pages, gup_flags,
                             NULL, locked ? locked : &local_locked);
      lru_add_drain();
      return ret;
  }
  
  policy use code.
  
  #ldos-flag #ldos-function #ldos-low-confidence
15. Rajitb 10 Nov 2024
  
  in Public
  
  /*
   * get_user_pages_remote() - look up @nr_pages user pages of @mm from @start.
   *
   * FOLL_TOUCH | FOLL_REMOTE is handed to is_valid_gup_args() together with
   * @gup_flags.  @locked may be NULL; a local flag initialised to 1 is then
   * used in its place.
   *
   * Returns -EINVAL when is_valid_gup_args() rejects the arguments, otherwise
   * whatever __get_user_pages_locked() returns.
   */
  long get_user_pages_remote(struct mm_struct *mm,
                             unsigned long start, unsigned long nr_pages,
                             unsigned int gup_flags, struct page **pages,
                             int *locked)
  {
      int fallback_locked = 1;
      int *lock_state;

      if (!is_valid_gup_args(pages, locked, &gup_flags,
                             FOLL_TOUCH | FOLL_REMOTE))
          return -EINVAL;

      if (locked)
          lock_state = locked;
      else
          lock_state = &fallback_locked;

      return __get_user_pages_locked(mm, start, nr_pages, pages,
                                     lock_state, gup_flags);
  }
  
  policy logic
  
  #ldos-high-confidence #ldos-function #ldos-flag
16. Rajitb 10 Nov 2024
  
  in Public
  
  /*
   * __get_user_pages_locked() - __get_user_pages() plus mmap_lock handling and
   * retry after the lock was dropped.
   *
   * @locked: in/out lock state.  If *locked is 0 on entry the lock is taken
   *          here (killable) and released again before returning.  If it is
   *          non-zero the caller must already hold the lock.  On return
   *          *locked is 0 whenever this function released the lock.
   *
   * Without FOLL_UNLOCKABLE the result of the single __get_user_pages() call
   * is returned directly.  With it, the loop continues page by page from the
   * address at which the lock was dropped.
   *
   * Returns the number of pages processed.  An error code is returned only if
   * no page was processed before the error: -EAGAIN when the initial lock
   * attempt fails, -EINTR when a signal is pending before a retry, or the
   * error from __get_user_pages() / mmap_read_lock_killable().
   */
  static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
                                                      unsigned long start,
                                                      unsigned long nr_pages,
                                                      struct page **pages,
                                                      int *locked,
                                                      unsigned int flags)
  {
      long ret, pages_done;
      bool must_unlock = false;

      /*
       * The internal caller expects GUP to manage the lock internally and the
       * lock must be released when this returns.
       */
      if (!*locked) {
          if (mmap_read_lock_killable(mm))
              return -EAGAIN;
          must_unlock = true;
          *locked = 1;
      }
      else
          mmap_assert_locked(mm);

      if (flags & FOLL_PIN)
          mm_set_has_pinned_flag(&mm->flags);

      /*
       * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
       * is to set FOLL_GET if the caller wants pages[] filled in (but has
       * carelessly failed to specify FOLL_GET), so keep doing that, but only
       * for FOLL_GET, not for the newer FOLL_PIN.
       *
       * FOLL_PIN always expects pages to be non-null, but no need to assert
       * that here, as any failures will be obvious enough.
       */
      if (pages && !(flags & FOLL_PIN))
          flags |= FOLL_GET;

      pages_done = 0;
      for (;;) {
          ret = __get_user_pages(mm, start, nr_pages, flags, pages,
                                 locked);
          if (!(flags & FOLL_UNLOCKABLE)) {
              /* VM_FAULT_RETRY couldn't trigger, bypass */
              pages_done = ret;
              break;
          }

          /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
          if (!*locked) {
              BUG_ON(ret < 0);
              BUG_ON(ret >= nr_pages);
          }

          if (ret > 0) {
              nr_pages -= ret;
              pages_done += ret;
              if (!nr_pages)
                  break;
          }
          if (*locked) {
              /*
               * VM_FAULT_RETRY didn't trigger or it was a
               * FOLL_NOWAIT.
               */
              if (!pages_done)
                  pages_done = ret;
              break;
          }
          /*
           * VM_FAULT_RETRY triggered, so seek to the faulting offset.
           * For the prefault case (!pages) we only update counts.
           */
          if (likely(pages))
              pages += ret;
          start += ret << PAGE_SHIFT;

          /* The lock was temporarily dropped, so we must unlock later */
          must_unlock = true;

  retry:
          /*
           * Repeat on the address that fired VM_FAULT_RETRY
           * with both FAULT_FLAG_ALLOW_RETRY and
           * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
           * by fatal signals or even common signals, depending on
           * the caller's request. So we need to check it before we
           * start trying again otherwise it can loop forever.
           */
          if (gup_signal_pending(flags)) {
              if (!pages_done)
                  pages_done = -EINTR;
              break;
          }

          ret = mmap_read_lock_killable(mm);
          if (ret) {
              BUG_ON(ret > 0);
              if (!pages_done)
                  pages_done = ret;
              break;
          }

          *locked = 1;
          /* Retry exactly one page, flagged as already tried. */
          ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
                                 pages, locked);
          if (!*locked) {
              /* Continue to retry until we succeeded */
              BUG_ON(ret != 0);
              goto retry;
          }
          if (ret != 1) {
              BUG_ON(ret > 1);
              if (!pages_done)
                  pages_done = ret;
              break;
          }
          nr_pages--;
          pages_done++;
          if (!nr_pages)
              break;
          if (likely(pages))
              pages++;
          start += PAGE_SIZE;
      }
      if (must_unlock && *locked) {
          /*
           * We either temporarily dropped the lock, or the caller
           * requested that we both acquire and drop the lock. Either way,
           * we must now unlock, and notify the caller of that state.
           */
          mmap_read_unlock(mm);
          *locked = 0;
      }
      return pages_done;
  }
  
  Same as __get_user_pages(), but also acquires and releases mmap_lock as needed.
  
  #ldos-flag #ldos-formula #ldos-high-confidence #ldos-ifdef-code #ldos-function
17. Rajitb 10 Nov 2024
  
  in Public
  
  if (!(flags & FOLL_INTERRUPTIBLE)) return false;
  
  fatal fault signal handler
  
  #ldos-flag #ldos-low-confidence
18. Rajitb 10 Nov 2024
  
  in Public
  
  /*
   * fixup_user_fault() - resolve a fault on a user address by calling
   * handle_mm_fault() on the VMA that contains it.
   *
   * @mm:          address space to operate on.
   * @address:     user address; its tag is stripped before the lookup.
   * @fault_flags: FAULT_FLAG_* bits passed to the fault handler.
   * @unlocked:    optional.  When non-NULL, retrying is enabled
   *               (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE are added) and
   *               *unlocked is set to true if mmap_lock was re-taken here.
   *
   * Returns 0 on success, -EFAULT if no VMA is found or the VMA does not
   * permit the fault, -EINTR if a fatal signal is pending on a killable
   * fault, or the errno that vm_fault_to_errno() derives from the result.
   *
   * NOTE(review): the VM_FAULT_COMPLETED and VM_FAULT_RETRY branches write
   * through @unlocked unconditionally; this presumably relies on those results
   * only occurring when retrying was enabled - confirm for callers that pass
   * @unlocked == NULL.
   */
  int fixup_user_fault(struct mm_struct *mm,
                       unsigned long address, unsigned int fault_flags,
                       bool *unlocked)
  {
      struct vm_area_struct *vma;
      vm_fault_t ret;

      address = untagged_addr_remote(mm, address);

      if (unlocked)
          fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

  retry:
      vma = gup_vma_lookup(mm, address);
      if (!vma)
          return -EFAULT;

      if (!vma_permits_fault(vma, fault_flags))
          return -EFAULT;

      if ((fault_flags & FAULT_FLAG_KILLABLE) &&
          fatal_signal_pending(current))
          return -EINTR;

      ret = handle_mm_fault(vma, address, fault_flags, NULL);

      if (ret & VM_FAULT_COMPLETED) {
          /*
           * NOTE: it's a pity that we need to retake the lock here
           * to pair with the unlock() in the callers. Ideally we
           * could tell the callers so they do not need to unlock.
           */
          mmap_read_lock(mm);
          *unlocked = true;
          return 0;
      }

      if (ret & VM_FAULT_ERROR) {
          int err = vm_fault_to_errno(ret, 0);

          if (err)
              return err;
          /* An error bit that maps to no errno is treated as a bug. */
          BUG();
      }

      if (ret & VM_FAULT_RETRY) {
          /* Re-take the lock, then redo the lookup and the fault. */
          mmap_read_lock(mm);
          *unlocked = true;
          fault_flags |= FAULT_FLAG_TRIED;
          goto retry;
      }

      return 0;
  }
  
  resolves user page fault. policy logic
  
  #ldos-flag #ldos-high-confidence #ldos-function
19. Rajitb 10 Nov 2024
  
  in Public
  
  static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; start = untagged_addr_remote(mm, start); VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); do { struct page *page; unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { /* * MADV_POPULATE_(READ|WRITE) wants to handle VMA * lookups+error reporting differently. */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &page : NULL); if (ret) goto out; ctx.page_mask = 0; goto next_page; } if (!vma) { ret = -EFAULT; goto out; } ret = check_vma_flags(vma, gup_flags); if (ret) goto out; } retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, &foll_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: goto retry; case -EBUSY: case -EAGAIN: ret = 0; fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: goto out; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding * struct page. If the caller expects **pages to be * filled in, bail out now, because that can't be done * for this page. 
*/ if (pages) { ret = PTR_ERR(page); goto out; } } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } next_page: page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; if (pages) { struct page *subpage; unsigned int j; /* * This must be a large folio (and doesn't need to * be the whole folio; it can be part of it), do * the refcount work for all the subpages too. * * NOTE: here the page may not be the head page * e.g. when start addr is not thp-size aligned. * try_grab_folio() should have taken care of tail * pages. */ if (page_increm > 1) { struct folio *folio; /* * Since we already hold refcount on the * large folio, this should never fail. */ folio = try_grab_folio(page, page_increm - 1, foll_flags); if (WARN_ON_ONCE(!folio)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ gup_put_folio(page_folio(page), 1, foll_flags); ret = -EFAULT; goto out; } } for (j = 0; j < page_increm; j++) { subpage = nth_page(page, j); pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); } } i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return i ? i : ret; }
  
  Literally the actual policy logic of gup. Most important piece of code right here for gup
  
  #ldos-high-confidence #ldos-formula #ldos-algorithm #ldos-function #ldos-value #ldos-threshold
20. Rajitb 10 Nov 2024
  
  in Public
  
  #ifdef CONFIG_STACK_GROWSUP return vma_lookup(mm, addr); #else static volatile unsigned long next_warn; struct vm_area_struct *vma; unsigned long now, next; vma = find_vma(mm, addr); if (!vma || (addr >= vma->vm_start)) return vma; /* Only warn for half-way relevant accesses */ if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; if (vma->vm_start - addr > 65536) return NULL; /* Let's not warn more than once an hour.. */ now = jiffies; next = next_warn; if (next && time_before(now, next)) return NULL; next_warn = now + 60*60*HZ; /* Let people know things may have changed. */ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n", current->comm, task_pid_nr(current), vma->vm_start, vma->vm_end, addr); dump_stack(); return NULL;
  
  Helper function to look up a VMA (virtual memory area). It warns at most once per hour about half-way relevant accesses just below a stack VMA, noting that GUP no longer grows the stack.
  
  #ldos-low-confidence #ldos-threshold #ldos-policy-use #ldos-value #ldos-function
21. Rajitb 10 Nov 2024
  
  in Public
  
  static bool writable_file_mapping_allowed(struct vm_area_struct *vma, unsigned long gup_flags) { /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the case we disallow. */ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true; /* * If the VMA does not require dirty tracking then no problematic write * can occur either. */ return !vma_needs_dirty_tracking(vma); }
  
  Def policy code. checks if we can write to a map
  
  #ldos-flag #ldos-function #ldos-high-confidence
22. Rajitb 10 Nov 2024
  
  in Public
  
  if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; if (*flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ if (*flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist */ fault_flags |= FAULT_FLAG_TRIED; } if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * With FAULT_FLAG_RETRY_NOWAIT we'll never release the * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); *locked = 0; /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of * what has happened - we've just fully completed a page * fault, with the mmap lock released. Use -EAGAIN to show * that we want to take the mmap lock _again_. */ return -EAGAIN; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; }
  
  Seems it's just setting flags for page faults based on flags param
  
  #ldos-low-confidence #ldos-flag #ldos-function
23. Rajitb 10 Nov 2024
  
  in Public
  
  /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; if (address > TASK_SIZE) pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return -EFAULT; pud = pud_offset(p4d, address); if (pud_none(*pud)) return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; entry = ptep_get(pte); if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; *page = vm_normal_page(*vma, address, entry); if (!*page) { if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) goto unmap;
  
  Most of these seem like sanity checks right up until line 897, i.e. 'if (!page)', after which we seem to unmap the page.
  
  #ldos-high-confidence #ldos-function #ldos-threshold #ldos-ifdef-code
24. Rajitb 10 Nov 2024
  
  in Public
  
  static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; /* * Call hugetlb_follow_page_mask for hugetlb vmas as it will use * special hugetlb page table walking code. This eliminates the * need to check for hugetlb entries in the general walking code. */ if (is_vm_hugetlb_page(vma)) return hugetlb_follow_page_mask(vma, address, flags, &ctx->page_mask); pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); return follow_p4d_mask(vma, address, pgd, flags, ctx); }
  
  places mask after following page into pte
  
  #ldos-high-confidence #ldos-flag #ldos-function #ldos-ifdef-code
25. Rajitb 10 Nov 2024
  
  in Public
  
  struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { struct follow_page_context ctx = { NULL }; struct page *page; if (vma_is_secretmem(vma)) return NULL; if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) return NULL; /* * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect * to fail on PROT_NONE-mapped pages. */ page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return page; }
  
  finds page
  
  #ldos-flag #ldos-function #ldos-high-confidence
26. Rajitb 10 Nov 2024
  
  in Public
  
  if (flags & FOLL_SPLIT_PMD) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); ctx->page_mask = HPAGE_PMD_NR - 1; return page;
  
  we're finding the page again but storing page mask in ctx
  
  #ldos-flag #ldos-function #ldos-high-confidence
27. Rajitb 10 Nov 2024
  
  in Public
  
  if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); }
  
  Uses branch-prediction hints (likely/unlikely) to check whether the PMD is present and whether it is a huge (transparent huge page) entry.
  
  #ldos-function #ldos-low-confidence #ldos-ifdef-code #
28. Rajitb 10 Nov 2024
  
  in Public
  
  if (pmd_none(pmdval)) return no_page_table(vma, flags); if (!pmd_present(pmdval)) return no_page_table(vma, flags); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; }
  
  Checks whether the PMD is present. I'm assuming PMD stands for page middle directory.
  
  # #ldos-low-confidence #ldos-function
29. Rajitb 10 Nov 2024
  
  in Public
  
  /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); /* * We only care about anon pages in can_follow_write_pte() and don't * have to worry about pte_devmap() because they are never anon. */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { page = NULL; goto out; } if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap * reference. */ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); if (*pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ ret = try_grab_page(page, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. 
*/ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); }
  
  finds page in pte. Judging by the complexity of the logic this is most likely policy code because we're literally getting user page
  
  #ldos-high-confidence #ldos-flag #ldos-function #ldos-ifdef-code
30. Rajitb 10 Nov 2024
  
  in Public
  
  if (pte_write(pte)) return true; /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ if (!page || !PageAnon(page) || !PageAnonExclusive(page)) return false; /* ... and a write-fault isn't required for other reasons. */ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) return false; return !userfaultfd_pte_wp(vma, pte);
  
  Flag checks to see whether you can write to a PTE.
  
  #ldos-flag
31. Rajitb 10 Nov 2024
  
  in Public
  
  if (flags & FOLL_TOUCH) { pte_t orig_entry = ptep_get(pte); pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); }
  
  uses pte to mark dirty pages and finds pfn in pte
  
  #ldos-flag #ldos-function #ldos-high-confidence
32. Rajitb 10 Nov 2024
  
  in Public
  
  void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; if (!make_dirty) { unpin_user_pages(pages, npages); return; } sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key * cases: * * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called * page_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes * on to call TestClearPageDirty(), and write the page * back. * * 2) This code sees the page as clean, so it calls * set_page_dirty(). The page stays dirty, despite being * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ if (!folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } }
  
  unpins and dirties page
  
  #ldos-function #ldos-low-confidence #ldos-policy-use
33. Rajitb 10 Nov 2024
  
  in Public
  
  static inline struct folio *gup_folio_next(struct page **list, unsigned long npages, unsigned long i, unsigned int *ntails) { struct folio *folio = page_folio(list[i]); unsigned int nr; for (nr = i + 1; nr < npages; nr++) { if (page_folio(list[nr]) != folio) break; } *ntails = nr - i; return folio; }
  
  gets folio of next page along with reference to end of folio
  
  #ldos-function #ldos-low-confidence #ldos-policy-use
34. Rajitb 10 Nov 2024
  
  in Public
  
  static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { struct page *next = nth_page(start, i); struct folio *folio = page_folio(next); unsigned int nr = 1; if (folio_test_large(folio)) nr = min_t(unsigned int, npages - i, folio_nr_pages(folio) - folio_page_idx(folio, next)); *ntails = nr; return folio; }
  
  gets the folio of the next page from start to 'i' range. also gets the tail folio/reference
  
  #ldos-policy-use #ldos-low-confidence #ldos-function
35. Rajitb 10 Nov 2024
  
  in Public
  
  folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
  
  function for adding reference
  
  #ldos-low-confidence #ldos-function #ldos-value
36. Rajitb 10 Nov 2024
  
  in Public
  
  void unpin_user_page(struct page *page) { sanity_check_pinned_pages(&page, 1); gup_put_folio(page_folio(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page);
  
  actual policy use logic
  
  #ldos-policy-use #ldos-low-confidence #ldos-function
37. Rajitb 10 Nov 2024
  
  in Public
  
  struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_inc(folio);
  
  checks for code that is involved in policy but is not the actual logic
  
  #ldos-ifdef-code #ldos-flag #ldos-function #ldos-threshold #ldos-low-confidence
38. Rajitb 10 Nov 2024
  
  in Public
  
  else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return 0; /* * Similar to try_grab_folio(): be sure to *also* * increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); }
  
  Logic that actually tries to grab the folio. Also policy use code and not actual policy
  
  #ldos-flag #ldos-function #ldos-low-confidence
39. Rajitb 10 Nov 2024
  
  in Public
  
  if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs);
  
  Definitely a vital and straightforward policy-use section of GUP that simply drops references on the folio.
  
  #ldos-function #ldos-policy-use #ldos-low-confidence
40. Rajitb 10 Nov 2024
  
  in Public
  
  if (flags & FOLL_PIN) { if (is_zero_folio(folio)) return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; }
  
  Checks if the folio is zero/large
  
  #ldos-flag #ldos-function #ldos-low-confidence #ldos-policy-use
41. Rajitb 10 Nov 2024
  
  in Public
  
  if (folio_test_large(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1))
  
  maintaining reference counts. Part of policy logic most likely
  
  #ldos-function #ldos-low-confidence #ldos-policy-use
42. Rajitb 10 Nov 2024
  
  in Public
  
  if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); return NULL;
  
  checks for longterm folio pins.
  
  #ldos-flag #ldos-function #ldos-low-confidence #ldos-ifdef-code
43. Rajitb 10 Nov 2024
  
  in Public
  
  if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) return NULL; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL;
  
  Time-saving branch-prediction hints (unlikely) and a one-time warning macro (WARN_ON_ONCE) for flags. Not actual policy logic, so low confidence.
  
  #ldos-formula #ldos-flag #ldos-low-confidence #ldos-ifdef-code
44. Rajitb 10 Nov 2024
  
  in Public
  
  if (unlikely(page_folio(page) != folio)) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); goto retry;
  
  Uses prediction to check if a folio still points to the page. This is part of the function that tries to retrieve the folio to confirm that it is associated with a page.
  
  #ldos-function #ldos-high-confidence #ldos-ifdef-code
45. Rajitb 10 Nov 2024
  
  in Public
  
  folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; if (unlikely(!folio_ref_try_add(folio, refs))) return NULL;
  
  These increment the reference count for the folio since you're returning a reference of the folio. Important function so important internal logic subsequently
  
  #ldos-function #ldos-high-confidence #ldos-ifdef-code
46. Rajitb 10 Nov 2024
  
  in Public
  
  if (is_zero_page(page) || !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio))
  
  Sanity checks for pinned pages. I wouldn't classify these as policy logic, but rather as common-sense pre-checks for the actual policy. Still, I think it's worth tagging this to gain a sense of what is not policy code.
  
  #ldos-function #ldos-low-confidence
47. Rajitb 10 Nov 2024
  
  in Public
  
  if (is_zero_page(page)) return page_folio(page); folio = try_get_folio(page, refs); if (!folio) return NULL;
  
  Just trying to check for zero pages and trying to retrieve folios. Unlikely policy logic
  
  #ldos-formula #ldos-low-confidence
48. Rajitb 10 Nov 2024
  
  in Public
  
  if (flags & FOLL_GET) return try_get_folio(page, refs);
  
  Policy logic that determines and tries to retrieve folios based on given flags.
  
  #ldos-flag #ldos-high-confidence
49. Rajitb 03 Nov 2024
  
  in Public
  
  if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT;
  
  Likely checking for overflow of the requested address range (start + len) and that it stays within the user address space.
  
  #ldos-flag #ldos-formula
50. Rajitb 03 Nov 2024
  
  in Public
  
  if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } }
  
  part of policy code
  
  #ldos-flag #ldos-high-confidence
51. Rajitb 03 Nov 2024
  
  in Public
  
  if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags);
  
  policy decision to get locked page!
  
  #ldos-high-confidence #ldos-flag
52. Rajitb 03 Nov 2024
  
  in Public
  
  if (folio_is_device_coherent(folio)) { /* * Migration will fail if the page is pinned, so convert * the pin on the source page to a normal reference. */ pages[i] = NULL; folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); if (migrate_device_coherent_page(&folio->page)) { ret = -EBUSY; goto err; } continue; }
  
  algorithm/fnc decision
53. Rajitb 03 Nov 2024
  
  in Public
  
  vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
  
  vm_flags
  
  #ldos-flag
54. Rajitb 03 Nov 2024
  
  in Public
  
  if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; }
  
  flag
  
  #ldos-flag
55. Rajitb 03 Nov 2024
  
  in Public
  
  if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; }
  
  a lot of VM fault flags that are checked when calling a function that handles page faults
  
  #ldos-flag
56. Rajitb 03 Nov 2024
  
  in Public
  
  if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current))
  
  flag fault killable
57. Rajitb 02 Nov 2024
  
  in Public
  
  if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags);
  
  #ldos-flag
58. Rajitb 02 Nov 2024
  
  in Public
  
  if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
  
  Flag policy to check whether the map may be unlocked (possibly for a remap?).
  
  #ldos-flag
59. Rajitb 02 Nov 2024
  
  in Public
  
  if (page_increm > nr_pages) page_increm = nr_pages;
  
  next page logic
  
  #ldos-flag #ldos-high-confidence
60. Rajitb 02 Nov 2024
  
  in Public
  
  */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; }
  
  page populate flag for sure
  
  #ldos-high-confidence#ldos-flag#ldos-formula
61. Rajitb 02 Nov 2024
  
  in Public
  
  if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ if (is_vm_hugetlb_page(vma)) return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could * set a breakpoint in a read-only mapping of an * executable, without corrupting the file (yet only * when that file had been opened for writing!). * Anon pages in shared mappings are surprising: now * just reject it. */ if (!is_cow_mapping(vm_flags)) return -EFAULT; } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * Is there actually any vma we can reach here which does not * have VM_MAYREAD set? */ if (!(vm_flags & VM_MAYREAD)) return -EFAULT; }
  
  COW mapping and shadow stack seem like important policies that don't look like regular flags.
  
  #ldos-flag
Visit annotations in context

Tags

#

#ldos-algorithm

#ldos-threshold

#ldos-low-confidence

#ldos-high-confidence

#ldos-policy-use

#ldos-flag

#ldos-value

#ldos-high-confidence#ldos-flag#ldos-formula

#ldos-formula

#ldos-function

#ldos-ifdef-code

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/gup.c
Oct 2024
elixir.bootlin.com elixir.bootlin.com

gup.c - mm/gup.c - Linux source code v6.6.42 - Bootlin

3
1. Rajitb 06 Oct 2024
  
  in Public
  
  if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true;
  
  most likely part of a policy code that includes flag based decision making
  
  #ldos-flag
2. Rajitb 06 Oct 2024
  
  in Public
  
  if (ctx.pgmap) put_dev_pagemap(ctx.pgmap);
  
  Low confidence policy
3. Rajitb 06 Oct 2024
  
  in Public
  
  if (flags & FOLL_TOUCH)
  
  value policy!!
Visit annotations in context

Tags

#ldos-flag

Annotators

Rajitb

URL

elixir.bootlin.com/linux/v6.6.42/source/mm/gup.c

Rajit

Annotations: 121

Joined: September 6, 2024

Tags

Annotators

URL

Tags

Annotators

URL

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL

Tags

Annotators

URL