- Oct 2024
-
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
    int cpu;
    int i;

    memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

    for_each_online_cpu(cpu) {
        struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

        for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
            ret[i] += this->event[i];
    }
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
    cpus_read_lock();
    sum_vm_events(ret);
    cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
    struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
    int i;

    for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
        count_vm_events(i, fold_state->event[i]);
        fold_state->event[i] = 0;
    }
}
#endif /* CONFIG_VM_EVENT_COUNTERS */
When CONFIG_VM_EVENT_COUNTERS is enabled, the kernel compiles the code that collects and manages virtual memory (VM) event counters. These counters track events such as page faults, page allocations, and swap operations. The functions sum_vm_events, all_vm_events, and vm_events_fold_cpu are responsible for accumulating these statistics across all CPUs. If CONFIG_VM_EVENT_COUNTERS is not enabled, this code is excluded from the build, so the kernel does not collect these detailed VM statistics, which reduces memory usage and avoids the overhead of tracking them.
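As a rough userspace illustration of the accumulation pattern used by sum_vm_events (not kernel code; NR_CPUS_EXAMPLE, NR_EVENTS_EXAMPLE and event_states are invented names for this sketch):

#include <string.h>

#define NR_CPUS_EXAMPLE   4     /* hypothetical CPU count */
#define NR_EVENTS_EXAMPLE 3     /* hypothetical number of event types */

/* one counter array per CPU, standing in for the per-CPU vm_event_states */
static unsigned long event_states[NR_CPUS_EXAMPLE][NR_EVENTS_EXAMPLE];

/* analogous to sum_vm_events(): fold every CPU's counters into ret[] */
static void sum_events(unsigned long *ret)
{
    int cpu, i;

    memset(ret, 0, NR_EVENTS_EXAMPLE * sizeof(unsigned long));
    for (cpu = 0; cpu < NR_CPUS_EXAMPLE; cpu++)
        for (i = 0; i < NR_EVENTS_EXAMPLE; i++)
            ret[i] += event_states[cpu][i];
}

The real all_vm_events additionally takes cpus_read_lock() around the sum so the set of online CPUs cannot change underneath it, which is why the result is described as approximate.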
-
#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
    int item, cpu;

    for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
        atomic_long_set(&zone->vm_numa_event[item], 0);
        for_each_online_cpu(cpu) {
            per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item] = 0;
        }
    }
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
    struct zone *zone;

    for_each_populated_zone(zone)
        zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
    int item;

    for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
        atomic_long_set(&vm_numa_event[item], 0);
}

static void invalid_numa_statistics(void)
{
    zero_zones_numa_counters();
    zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
        void *buffer, size_t *length, loff_t *ppos)
{
    int ret, oldval;

    mutex_lock(&vm_numa_stat_lock);
    if (write)
        oldval = sysctl_vm_numa_stat;
    ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
    if (ret || !write)
        goto out;

    if (oldval == sysctl_vm_numa_stat)
        goto out;
    else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
        static_branch_enable(&vm_numa_stat_key);
        pr_info("enable numa statistics\n");
    } else {
        static_branch_disable(&vm_numa_stat_key);
        invalid_numa_statistics();
        pr_info("disable numa statistics, and clear numa counters\n");
    }

out:
    mutex_unlock(&vm_numa_stat_lock);
    return ret;
}
#endif
This conditionally includes the NUMA-specific code based on whether the CONFIG_NUMA option is enabled during kernel compilation. Within this conditional block, the sysctl_vm_numa_stat_handler function provides a runtime configuration mechanism through the sysctl_vm_numa_stat variable, allowing system administrators to enable or disable the collection of NUMA statistics at runtime. When the value of sysctl_vm_numa_stat changes, the handler flips the vm_numa_stat_key static branch to start or stop statistics collection and, when disabling, also clears the NUMA counters.
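Stripped of the sysctl plumbing, the compare-old-value-then-act pattern the handler uses can be sketched in plain C; numa_stat_enabled, set_numa_stat and clear_numa_counters are invented names for this illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t numa_stat_lock = PTHREAD_MUTEX_INITIALIZER;
static int numa_stat_enabled = 1;   /* stand-in for sysctl_vm_numa_stat */

static void clear_numa_counters(void)
{
    /* would zero the per-zone and global NUMA counters */
}

/* stand-in for the handler: only act when the value actually changes */
static void set_numa_stat(int new_val)
{
    pthread_mutex_lock(&numa_stat_lock);
    int old_val = numa_stat_enabled;

    numa_stat_enabled = new_val;
    if (old_val != new_val) {
        if (new_val) {
            printf("enable numa statistics\n");
        } else {
            clear_numa_counters();
            printf("disable numa statistics, and clear numa counters\n");
        }
    }
    pthread_mutex_unlock(&numa_stat_lock);
}

At runtime this knob corresponds to writing 0 or 1 to /proc/sys/vm/numa_stat.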
-
if (should_proactive_compact_node(pgdat)) {
    unsigned int prev_score, score;

    prev_score = fragmentation_score_node(pgdat);
    proactive_compact_node(pgdat);
    score = fragmentation_score_node(pgdat);
    /*
     * Defer proactive compaction if the fragmentation
     * score did not go down i.e. no progress made.
     */
    if (unlikely(score >= prev_score))
        timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT;
}
This part of the code checks whether proactive compaction should be triggered using the should_proactive_compact_node(pgdat) predicate. If proactive compaction is warranted, it records the node's fragmentation score (prev_score), performs proactive compaction via proactive_compact_node(pgdat), and then reads the new fragmentation score (score). If the score did not decrease after compaction (indicating no progress), the system defers further proactive compaction by increasing the timeout (timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT). This helps avoid repeated compaction attempts that make no progress.
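As a purely numeric illustration of the backoff (the 500 ms interval and the shift of 6 are assumed example values, not read from this snippet):

#include <stdio.h>

int main(void)
{
    /* illustration only: assumed values, not taken from the kernel source */
    unsigned int default_timeout = 500; /* hypothetical 500 ms proactive check interval */
    unsigned int defer_shift = 6;       /* hypothetical COMPACT_MAX_DEFER_SHIFT */

    /* no progress: back off by shifting the timeout, 500 ms << 6 = 32000 ms */
    printf("deferred timeout: %u ms\n", default_timeout << defer_shift);
    return 0;
}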
-
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t compact_store(struct device *dev,
            struct device_attribute *attr,
            const char *buf, size_t count)
{
    int nid = dev->id;

    if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
        /* Flush pending updates to the LRU lists */
        lru_add_drain_all();

        compact_node(nid);
    }

    return count;
}
static DEVICE_ATTR_WO(compact);

int compaction_register_node(struct node *node)
{
    return device_create_file(&node->dev, &dev_attr_compact);
}

void compaction_unregister_node(struct node *node)
{
    device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
When both CONFIG_SYSFS and CONFIG_NUMA are enabled, the functions compact_store, compaction_register_node, and compaction_unregister_node are included in the build. The compact_store function allows users to trigger memory compaction on a specific NUMA node by writing to its sysfs interface. It checks that the node ID is valid and online, flushes pending updates to the LRU lists with lru_add_drain_all(), and then calls compact_node(nid) to compact that node.
Including these functions provides fine-grained control over memory management on NUMA systems. If either option is disabled, these functions are excluded from the build, and the per-node compaction interface is unavailable.
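As a usage sketch, node 0 could be compacted from user space (with sufficient privileges) by writing any value to its compact attribute; node0 is assumed to exist on the machine:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* writing any value to this attribute invokes compact_store() for node 0 */
    int fd = open("/sys/devices/system/node/node0/compact", O_WRONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, "1", 1) != 1)
        perror("write");
    close(fd);
    return 0;
}

This is equivalent to running echo 1 > /sys/devices/system/node/node0/compact as root.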
-
if (prio > MIN_COMPACT_PRIORITY && compaction_deferred(zone, order)) {
    rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
    continue;
}

status = compact_zone_order(zone, order, gfp_mask, prio,
        alloc_flags, ac->highest_zoneidx, capture);
rc = max(status, rc);

/* The allocation should succeed, stop compacting */
if (status == COMPACT_SUCCESS) {
    /*
     * We think the allocation will succeed in this zone,
     * but it is not certain, hence the false. The caller
     * will repeat this with true if allocation indeed
     * succeeds in this zone.
     */
    compaction_defer_reset(zone, order, false);

    break;
}

if (prio != COMPACT_PRIO_ASYNC &&
        (status == COMPACT_COMPLETE || status == COMPACT_PARTIAL_SKIPPED))
    /*
     * We think that allocation won't succeed in this zone
     * so we defer compaction there. If it ends up
     * succeeding after all, it will be reset.
     */
    defer_compaction(zone, order);

/*
 * We might have stopped compacting due to need_resched() in
 * async compaction, or due to a fatal signal detected. In that
 * case do not try further zones
 */
if ((prio == COMPACT_PRIO_ASYNC && need_resched()) ||
        fatal_signal_pending(current))
    break;
The kernel attempts to compact memory across the candidate zones, guided by several key predicates. First, if compaction for a zone has been deferred and the priority is above MIN_COMPACT_PRIORITY, the loop skips the zone and records the result as COMPACT_DEFERRED. For each remaining zone, it calls compact_zone_order and folds the returned status into the overall result. If compaction succeeds (COMPACT_SUCCESS), the kernel resets the zone's deferred state and stops further compaction attempts, expecting the allocation to succeed. If compaction completes or is partially skipped in a non-async mode, the zone is marked for deferred compaction, preventing further unnecessary attempts. If compaction is interrupted by the need to reschedule during async compaction or by a fatal signal, the loop exits early rather than trying further zones.
-
if (cc->nr_freepages > 0) {
    unsigned long free_pfn = release_freepages(&cc->freepages);

    cc->nr_freepages = 0;
    VM_BUG_ON(free_pfn == 0);
    /* The cached pfn is always the first in a pageblock */
    free_pfn = pageblock_start_pfn(free_pfn);
    /*
     * Only go back, not forward. The cached pfn might have been
     * already reset to zone end in compact_finished()
     */
    if (free_pfn > cc->zone->compact_cached_free_pfn)
        cc->zone->compact_cached_free_pfn = free_pfn;
}
This part ensures that any remaining isolated free pages are released back to the system efficiently. If there are free pages to release (cc->nr_freepages > 0), release_freepages is called and the returned page frame number (PFN) is rounded down with pageblock_start_pfn(), since the cached free PFN always points at the first page of a pageblock. A VM_BUG_ON check guards against operating on an invalid (zero) PFN. The cached free PFN is then updated conservatively: because the free scanner walks from the end of the zone toward lower PFNs, the hint is only ever moved back (to a higher PFN), never forward, so no potentially free pages are skipped. This helps subsequent compaction attempts start from the correct position.
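The two ideas in this block, rounding a PFN down to its pageblock start and only ever moving the cached hint backward, can be sketched as follows; the pageblock size of 512 pages is just an example value, and the helper names are invented:

/* example value: 512 pages per pageblock, e.g. order-9 pageblocks */
#define PAGEBLOCK_NR_PAGES 512UL

/* round a pfn down to the first pfn of its pageblock, mirroring what
 * pageblock_start_pfn() does; e.g. pageblock_start(1000) == 512 here */
static unsigned long pageblock_start(unsigned long pfn)
{
    return pfn & ~(PAGEBLOCK_NR_PAGES - 1);
}

/* conservative hint update: the free scanner walks from the end of the
 * zone toward lower pfns, so the cached hint is only ever raised */
static void update_cached_free_pfn(unsigned long *cached, unsigned long free_pfn)
{
    unsigned long start = pageblock_start(free_pfn);

    if (start > *cached)
        *cached = start;
}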
-
if (err) {
    putback_movable_pages(&cc->migratepages);
    /*
     * migrate_pages() may return -ENOMEM when scanners meet
     * and we want compact_finished() to detect it
     */
    if (err == -ENOMEM && !compact_scanners_met(cc)) {
        ret = COMPACT_CONTENDED;
        goto out;
    }
    /*
     * If an ASYNC or SYNC_LIGHT fails to migrate a page
     * within the pageblock_order-aligned block and
     * fast_find_migrateblock may be used then scan the
     * remainder of the pageblock. This will mark the
     * pageblock "skip" to avoid rescanning in the near
     * future. This will isolate more pages than necessary
     * for the request but avoid loops due to
     * fast_find_migrateblock revisiting blocks that were
     * recently partially scanned.
     */
    if (!pageblock_aligned(cc->migrate_pfn) &&
        !cc->ignore_skip_hint && !cc->finish_pageblock &&
        (cc->mode < MIGRATE_SYNC)) {
        cc->finish_pageblock = true;

        /*
         * Draining pcplists does not help THP if
         * any page failed to migrate. Even after
         * drain, the pageblock will not be free.
         */
        if (cc->order == COMPACTION_HPAGE_ORDER)
            last_migrated_pfn = 0;

        goto rescan;
    }
}
This block handles errors during page migration. On any migration error the kernel first puts the movable pages back to keep the state consistent. If the error is a memory shortage (-ENOMEM), it checks whether the migration and free page scanners have met; if they have not, compaction is terminated early with COMPACT_CONTENDED. For ASYNC or SYNC_LIGHT modes, if migration fails partway through a pageblock (the migrate PFN is not pageblock-aligned), the kernel sets finish_pageblock and rescans the remainder of the pageblock so that it can be marked "skip", avoiding rescans in the near future. This isolates more pages than the request strictly needs but prevents fast_find_migrateblock from repeatedly revisiting partially scanned blocks. For transparent huge page (THP) sized requests, last_migrated_pfn is cleared, because draining the per-CPU page lists will not make the pageblock free when any page in it failed to migrate.
-
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
    !free_area_empty(area, MIGRATE_CMA))
    return COMPACT_SUCCESS;
#endif
This block allows the kernel's memory compaction algorithm to consider MIGRATE_CMA pages as a fallback for MIGRATE_MOVABLE allocations when the Contiguous Memory Allocator (CMA) is enabled (CONFIG_CMA is defined). With CONFIG_CMA enabled, the compaction check becomes more flexible: free CMA pages count as a usable target when the standard movable free lists are empty, potentially improving allocation success rates. If CONFIG_CMA is disabled, this fallback mechanism is compiled out, and CMA regions remain dedicated to their primary purpose of providing contiguous memory blocks for devices that require them.
https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L895
-
/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 */
bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
{
    enum compact_result compact_result;
    bool suitable;

    suitable = __compaction_suitable(zone, order, highest_zoneidx,
                     zone_page_state(zone, NR_FREE_PAGES));
    /*
     * fragmentation index determines if allocation failures are due to
     * low memory or external fragmentation
     *
     * index of -1000 would imply allocations might succeed depending on
     * watermarks, but we already failed the high-order watermark check
     * index towards 0 implies failure is due to lack of memory
     * index towards 1000 implies failure is due to fragmentation
     *
     * Only compact if a failure would be due to fragmentation. Also
     * ignore fragindex for non-costly orders where the alternative to
     * a successful reclaim/compaction is OOM. Fragindex and the
     * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
     * excessive compaction for costly orders, but it should not be at the
     * expense of system stability.
     */
    if (suitable) {
        compact_result = COMPACT_CONTINUE;
        if (order > PAGE_ALLOC_COSTLY_ORDER) {
            int fragindex = fragmentation_index(zone, order);

            if (fragindex >= 0 &&
                fragindex <= sysctl_extfrag_threshold) {
                suitable = false;
                compact_result = COMPACT_NOT_SUITABLE_ZONE;
            }
        }
    } else {
        compact_result = COMPACT_SKIPPED;
    }

    trace_mm_compaction_suitable(zone, order, compact_result);

    return suitable;
}
This function decides whether to initiate memory compaction on a particular zone based on conditions such as the available free pages, the fragmentation index, and the external fragmentation threshold (sysctl_extfrag_threshold). It determines whether compaction would be effective or should be skipped. For costly, high-order allocations, if the fragmentation index is non-negative and does not exceed the threshold, the failure is attributed to a lack of memory rather than to fragmentation, so the function skips compaction, conserving system resources and avoiding pointless work. This improves Linux's memory management efficiency by preventing futile compaction attempts, such as in low-memory situations: compaction is initiated only when fragmentation is the primary issue, which keeps CPU utilization in check and should improve memory allocation success rates.
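To make the threshold check concrete, here is a simplified sketch of just the fragindex decision; 500 is the usual default of vm.extfrag_threshold, and worth_compacting is an invented helper name:

#include <stdbool.h>

/* simplified version of the fragindex decision in compaction_suitable() */
static bool worth_compacting(int fragindex, int extfrag_threshold)
{
    /* index toward 0: failure is due to lack of memory, compaction won't help;
     * index toward 1000: failure is due to fragmentation, compaction can help */
    return fragindex < 0 || fragindex > extfrag_threshold;
}

/* e.g. worth_compacting(300, 500) == false -> skip compaction (low memory)
 *      worth_compacting(800, 500) == true  -> compact (fragmentation)     */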
-
static unsigned long fast_find_migrateblock(struct compact_control *cc)
{
    unsigned int limit = freelist_scan_limit(cc);
    unsigned int nr_scanned = 0;
    unsigned long distance;
    unsigned long pfn = cc->migrate_pfn;
    unsigned long high_pfn;
    int order;
    bool found_block = false;

    /* Skip hints are relied on to avoid repeats on the fast search */
    if (cc->ignore_skip_hint)
        return pfn;

    /*
     * If the pageblock should be finished then do not select a different
     * pageblock.
     */
    if (cc->finish_pageblock)
        return pfn;

    /*
     * If the migrate_pfn is not at the start of a zone or the start
     * of a pageblock then assume this is a continuation of a previous
     * scan restarted due to COMPACT_CLUSTER_MAX.
     */
    if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
        return pfn;

    /*
     * For smaller orders, just linearly scan as the number of pages
     * to migrate should be relatively small and does not necessarily
     * justify freeing up a large block for a small allocation.
     */
    if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
        return pfn;

    /*
     * Only allow kcompactd and direct requests for movable pages to
     * quickly clear out a MOVABLE pageblock for allocation. This
     * reduces the risk that a large movable pageblock is freed for
     * an unmovable/reclaimable small allocation.
     */
    if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
        return pfn;

    /*
     * When starting the migration scanner, pick any pageblock within the
     * first half of the search space. Otherwise try and pick a pageblock
     * within the first eighth to reduce the chances that a migration
     * target later becomes a source.
     */
    distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
    if (cc->migrate_pfn != cc->zone->zone_start_pfn)
        distance >>= 2;
    high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);

    for (order = cc->order - 1;
         order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
         order--) {
        struct free_area *area = &cc->zone->free_area[order];
        struct list_head *freelist;
        unsigned long flags;
        struct page *freepage;

        if (!area->nr_free)
            continue;

        spin_lock_irqsave(&cc->zone->lock, flags);
        freelist = &area->free_list[MIGRATE_MOVABLE];
        list_for_each_entry(freepage, freelist, buddy_list) {
            unsigned long free_pfn;

            if (nr_scanned++ >= limit) {
                move_freelist_tail(freelist, freepage);
                break;
            }

            free_pfn = page_to_pfn(freepage);
            if (free_pfn < high_pfn) {
                /*
                 * Avoid if skipped recently. Ideally it would
                 * move to the tail but even safe iteration of
                 * the list assumes an entry is deleted, not
                 * reordered.
                 */
                if (get_pageblock_skip(freepage))
                    continue;

                /* Reorder to so a future search skips recent pages */
                move_freelist_tail(freelist, freepage);

                update_fast_start_pfn(cc, free_pfn);
                pfn = pageblock_start_pfn(free_pfn);
                if (pfn < cc->zone->zone_start_pfn)
                    pfn = cc->zone->zone_start_pfn;
                cc->fast_search_fail = 0;
                found_block = true;
                break;
            }
        }
        spin_unlock_irqrestore(&cc->zone->lock, flags);
    }

    cc->total_migrate_scanned += nr_scanned;

    /*
     * If fast scanning failed then use a cached entry for a page block
     * that had free pages as the basis for starting a linear scan.
     */
    if (!found_block) {
        cc->fast_search_fail++;
        pfn = reinit_migrate_pfn(cc);
    }
    return pfn;
}
This function determines when and how to perform a fast search for a suitable migration block during memory compaction. It bails out early and returns the current migrate PFN if skip hints are being ignored (if (cc->ignore_skip_hint) return pfn;), if the current pageblock should be finished first (if (cc->finish_pageblock) return pfn;), if the migrate PFN is not at the start of the zone or of a pageblock, meaning this is the continuation of a previous scan (if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) return pfn;), or if the allocation order is small enough that a linear scan is cheap (if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) return pfn;). It also restricts the fast search to movable allocations during direct compaction (if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) return pfn;) so that a large movable pageblock is not freed for a small unmovable or reclaimable allocation. When the fast search does run, it scans the movable free lists of progressively lower orders, limits the number of entries examined to bound CPU cost (if (nr_scanned++ >= limit) { move_freelist_tail(freelist, freepage); break; }), reorders visited entries to the tail of the free list so future searches skip recently seen pages, and records the chosen pageblock as the new migration start. If the fast search fails, it adapts by incrementing cc->fast_search_fail and falling back to a cached starting point for a linear scan (if (!found_block) { cc->fast_search_fail++; pfn = reinit_migrate_pfn(cc); }).
-
#ifdef CONFIG_SPARSEMEM
#ifdef CONFIG_SPARSEMEM conditionally includes or excludes the implementations of the skip_offline_sections and skip_offline_sections_reverse functions based on whether the CONFIG_SPARSEMEM option is enabled in the kernel configuration. CONFIG_SPARSEMEM enables kernel support for sparse memory systems, where memory sections can be brought online or taken offline dynamically. These functions provide the logic to skip over offline memory sections during operations such as memory compaction, ensuring that the kernel does not attempt to access or manipulate memory that is not currently available. If CONFIG_SPARSEMEM is not defined, stub versions that simply return 0 are used, effectively disabling the handling of offline memory sections.
https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L461
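The surrounding code follows the common config-stub pattern, roughly of this shape (a simplified rendering, not the verbatim kernel source; skip_offline_sections_reverse is handled the same way):

#ifdef CONFIG_SPARSEMEM
/* real kernel: walk memory sections until an online one is found */
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
    /* ...section-walking logic elided... */
    return start_pfn;
}
#else
/* stub: without CONFIG_SPARSEMEM there are no offline sections to skip,
 * so returning 0 tells the caller there is nothing to jump over */
static unsigned long skip_offline_sections(unsigned long start_pfn)
{
    return 0;
}
#endif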
-
#ifdef CONFIG_COMPACTION
This option acts as a policy control mechanism that determines whether the memory compaction feature is included in the kernel build. By placing the compaction-related code between #ifdef CONFIG_COMPACTION and the matching #endif [line 522], these sections are compiled only when the option is set, which affects the kernel's behavior regarding memory management and fragmentation handling. CONFIG_COMPACTION is defined in https://elixir.bootlin.com/linux/v6.6.42/source/mm/Kconfig#L637 and defaults to y (enabled).
-