- Oct 2024
-
elixir.bootlin.com
-
static void wb_position_ratio(struct dirty_throttle_control *dtc)
Dynamically calculates pos_ratio based on the state of the global and per-writeback (wb) dirty limits, the write bandwidth, and the thresholds.
-
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
This is a configuration policy to force strictlimit.
-
/* throttle according to the chosen dtc */
dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                        RATELIMIT_CALC_SHIFT;
max_pause = wb_max_pause(wb, sdtc->wb_dirty);
min_pause = wb_min_pause(wb, max_pause,
                         task_ratelimit, dirty_ratelimit,
                         &nr_dirtied_pause);

if (unlikely(task_ratelimit == 0)) {
        period = max_pause;
        pause = max_pause;
        goto pause;
}
period = HZ * pages_dirtied / task_ratelimit;
pause = period;
if (current->dirty_paused_when)
        pause -= now - current->dirty_paused_when;
/*
 * For less than 1s think time (ext3/4 may block the dirtier
 * for up to 800ms from time to time on 1-HDD; so does xfs,
 * however at much less frequency), try to compensate it in
 * future periods by updating the virtual time; otherwise just
 * do a reset, as it may be a light dirtier.
 */
if (pause < min_pause) {
        trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh,
                                  sdtc->dirty, sdtc->wb_thresh,
                                  sdtc->wb_dirty, dirty_ratelimit,
                                  task_ratelimit, pages_dirtied,
                                  period, min(pause, 0L), start_time);
        if (pause < -HZ) {
                current->dirty_paused_when = now;
                current->nr_dirtied = 0;
        } else if (period) {
                current->dirty_paused_when += period;
                current->nr_dirtied = 0;
        } else if (current->nr_dirtied_pause <= pages_dirtied)
                current->nr_dirtied_pause += pages_dirtied;
        break;
}
if (unlikely(pause > max_pause)) {
        /* for occasional dropped task_ratelimit */
        now += min(pause - max_pause, max_pause);
        pause = max_pause;
}

pause:
trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh,
                          sdtc->dirty, sdtc->wb_thresh,
                          sdtc->wb_dirty, dirty_ratelimit,
                          task_ratelimit, pages_dirtied,
                          period, pause, start_time);
if (flags & BDP_ASYNC) {
        ret = -EAGAIN;
        break;
}
__set_current_state(TASK_KILLABLE);
bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);

current->dirty_paused_when = now + pause;
current->nr_dirtied = 0;
current->nr_dirtied_pause = nr_dirtied_pause;
This part of the code makes an algorithmic decision on how long a task should be throttled, based on the rate at which it dirties memory pages. A standalone sketch of the pause arithmetic follows.
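The heart of that decision is a couple of lines of arithmetic: the task's allowed rate is the wb's dirty_ratelimit scaled by pos_ratio, and the pause is how long the dirtied batch "should have taken" at that rate. A standalone C sketch (HZ = 1000 and RATELIMIT_CALC_SHIFT = 10 are assumptions mirroring common kernel configs):

#include <stdio.h>
#include <stdint.h>

#define HZ 1000                  /* assumed tick rate */
#define RATELIMIT_CALC_SHIFT 10  /* fixed-point shift used by the throttle */

/* Sketch: how long should a task pause after dirtying pages_dirtied
 * pages, given the wb ratelimit and the position ratio? */
static long compute_pause(unsigned long dirty_ratelimit, /* pages/s */
                          unsigned long pos_ratio,       /* fixed point */
                          unsigned long pages_dirtied)
{
        uint64_t task_ratelimit =
                ((uint64_t)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT;

        if (task_ratelimit == 0)
                return -1;      /* caller would fall back to max_pause */
        /* period: jiffies the dirtied pages "cost" at the allowed rate */
        return (long)(HZ * pages_dirtied / task_ratelimit);
}

int main(void)
{
        /* 25600 pages/s (100 MB/s of 4 KiB pages) at pos_ratio = 0.5:
         * dirtying 32 pages earns a 2-jiffy pause */
        printf("pause = %ld jiffies\n",
               compute_pause(25600, 1UL << (RATELIMIT_CALC_SHIFT - 1), 32));
        return 0;
}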
-
if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
This is a configuration policy that controls whether to update the limit in the control group. The config enables support for controlling the writeback of dirty pages on a per-cgroup basis in the Linux kernel. This allows for better resource management and improved performance.
-
-
elixir.bootlin.com
-
static bool __meminit defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
        static unsigned long prev_end_pfn, nr_initialised;

        if (early_page_ext_enabled())
                return false;
        /*
         * prev_end_pfn static that contains the end of previous zone
         * No need to protect because called very early in boot before smp_init.
         */
        if (prev_end_pfn != end_pfn) {
                prev_end_pfn = end_pfn;
                nr_initialised = 0;
        }

        /* Always populate low zones for address-constrained allocations */
        if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
                return false;

        if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
                return true;
        /*
         * We start only with one section of pages, more pages are added as
         * needed until the rest of deferred pages are initialized.
         */
        nr_initialised++;
        if ((nr_initialised > PAGES_PER_SECTION) &&
            (pfn & (PAGES_PER_SECTION - 1)) == 0) {
                NODE_DATA(nid)->first_deferred_pfn = pfn;
                return true;
        }
        return false;
}
This policy decides whether to defer the remaining initialization; it's used on line 885 of this page. Note that this function definition relies on CONFIG_DEFERRED_STRUCT_PAGE_INIT being defined.
-
if (mminit_loglevel < MMINIT_VERIFY)
This policy controls whether the function will print information about the zonelist. The decision is made by comparing the global mminit_loglevel against the values of the mminit_level enum in mm/internal.h.
-
if (overcommit_policy == OVERCOMMIT_NEVER)
This policy controls the memory batch size based on the overcommit policy, choosing a smaller batch size when the policy is OVERCOMMIT_NEVER.
-
-
elixir.bootlin.com
-
if (!should_skip_kasan_unpoison(gfp_flags) &&
    kasan_unpoison_pages(page, order, init)) {
        /* Take note that memory was initialized by KASAN. */
        if (kasan_has_integrated_init())
                init = false;
} else {
        /*
         * If memory tags have not been set by KASAN, reset the page
         * tags to ensure page_address() dereferencing does not fault.
         */
        for (i = 0; i != 1 << order; ++i)
                page_kasan_tag_reset(page + i);
}
This decision is made by configuration (KASAN's mode).
-
if (can_direct_reclaim && can_compact && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask))
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
The value of PAGE_ALLOC_COSTLY_ORDER is defined as 3; it heuristically determines whether to try direct compaction or not. However, can_direct_reclaim is determined by the caller, so it is also a configuration policy.
-
if (!can_direct_reclaim)
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
gfp_mask is a parameter passed by the caller, and it specifies whether the caller is willing to reclaim pages if the allocation fails.
-
if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
PAGE_ALLOC_COSTLY_ORDER is a heuristic threshold determining whether the compaction should be retried or not.
-
if (!node_isset(node, *used_node_mask)) {
        node_set(node, *used_node_mask);
        return node;
}

for_each_node_state(n, N_MEMORY) {

        /* Don't want a node to appear more than once */
        if (node_isset(n, *used_node_mask))
                continue;

        /* Use the distance array to find the distance */
        val = node_distance(node, n);

        /* Penalize nodes under us ("prefer the next node") */
        val += (n < node);

        /* Give preference to headless and unused nodes */
        if (!cpumask_empty(cpumask_of_node(n)))
                val += PENALTY_FOR_NODE_WITH_CPUS;

        /* Slight preference for less loaded node */
        val *= MAX_NUMNODES;
        val += node_load[n];

        if (val < min_val) {
                min_val = val;
                best_node = n;
        }
}

if (best_node >= 0)
        node_set(best_node, *used_node_mask);
Selects the best node based on a heuristic that takes into account node distance, CPU availability, and load. The code prefers unused nodes, slightly penalizes nodes numbered below the current one ("prefer the next node") and nodes with CPUs, and gives preference to less-loaded nodes. It then updates the used node mask to prevent reuse of the same node.
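A standalone sketch of the scoring, to make the weighting explicit; MAX_NUMNODES and PENALTY_FOR_NODE_WITH_CPUS are illustrative stand-ins for the kernel's values, and lower scores win:

#include <stdio.h>

#define MAX_NUMNODES 64                 /* illustrative */
#define PENALTY_FOR_NODE_WITH_CPUS 1    /* illustrative */

static int node_score(int node, int n, int distance, int has_cpus, int load)
{
        int val = distance;

        val += (n < node);              /* "prefer the next node" */
        if (has_cpus)
                val += PENALTY_FOR_NODE_WITH_CPUS;
        val *= MAX_NUMNODES;            /* distance/penalties dominate... */
        val += load;                    /* ...node_load only breaks ties */
        return val;
}

int main(void)
{
        /* at equal distance, a headless idle node (score 640) beats a
         * loaded node with CPUs (score 1204) */
        printf("%d vs %d\n", node_score(0, 1, 10, 0, 0),
               node_score(0, 2, 10, 1, 500));
        return 0;
}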
-
/*
 * !costly requests are much more important than
 * __GFP_RETRY_MAYFAIL costly ones because they are de
 * facto nofail and invoke OOM killer to move on while
 * costly can fail and users are ready to cope with
 * that. 1/4 retries is rather arbitrary but we would
 * need much more detailed feedback from compaction to
 * make a better decision.
 */
if (order > PAGE_ALLOC_COSTLY_ORDER)
        max_retries /= 4;
if (++(*compaction_retries) <= max_retries) {
        ret = true;
        goto out;
}
Adjusts the maximum number of retries for compaction based on allocation order. When handling high-order allocations, the retries are reduced to 1/4 to avoid overcommitting resources. This is a heuristic decision.
-
if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
        *no_progress_loops = 0;
else
        (*no_progress_loops)++;
Decision on whether to reset or increment no_progress_loops based on memory reclaim progress and allocation order. The progress is determined at runtime, with PAGE_ALLOC_COSTLY_ORDER serving as a threshold, representing a heuristic decision.
-
if (gfp_mask & __GFP_NORETRY)
gfp_mask is provided by the caller. This configuration determines whether the function will retry the allocation or not.
-
/*
 * Do not steal pages from freelists belonging to other pageblocks
 * i.e. orders < pageblock_order. If there are no local zones free,
 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
 */
if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
        min_order = pageblock_order;
This prevents fragmentation by keeping small allocations from splitting pageblocks and causing excessive fragmentation. When the ALLOC_NOFRAGMENT flag is set, the kernel will only allocate from larger pageblocks.
-
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++)
        clear_highpage_kasan_tagged(page + i);
kasan_enable_current();
The function clears memory pages while avoiding the KASAN redzones; these redzones are used to detect memory corruption. After the clearing, KASAN is re-enabled.
-
/*
 * Check tail pages before head page information is cleared to
 * avoid checking PageCompound for order-0 pages.
 */
if (unlikely(order)) {
        bool compound = PageCompound(page);
        int i;

        VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

        if (compound)
                page[1].flags &= ~PAGE_FLAGS_SECOND;
This policy handles compound (large) pages. As the comment mentions, it ensures that the order matches and that each tail page is handled accordingly. This algorithmic policy prevents possible issues when freeing large memory blocks.
-
if (order > PAGE_ALLOC_COSTLY_ORDER) {
        VM_BUG_ON(order != pageblock_order);
        movable = migratetype == MIGRATE_MOVABLE;
        return NR_LOWORDER_PCP_LISTS + movable;
}
Manages memory pages efficiently in the buddy allocator by organizing the per-CPU page lists by migratetype and order. Organizing by type and size lets pages be found quickly during allocation and deallocation.
-
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
        if (page_outside_zone_boundaries(zone, page))
                return 1;
        if (zone != page_zone(page))
                return 1;

        return 0;
}
Policy that ensures pages are assigned to the correct zones within the kernel's memory management system. It checks for potential corruption or mismanagement of pages and enforces zone constraints.
-
int ratio = sysctl_lowmem_reserve_ratio[i];
bool clear = !ratio || !zone_managed_pages(zone);
unsigned long managed_pages = 0;

for (j = i + 1; j < MAX_NR_ZONES; j++) {
        struct zone *upper_zone = &pgdat->node_zones[j];

        managed_pages += zone_managed_pages(upper_zone);

        if (clear)
                zone->lowmem_reserve[j] = 0;
        else
                zone->lowmem_reserve[j] = managed_pages / ratio;
}
This is a configuration policy since it revolves around sysctl_lowmem_reserve_ratio, a configurable setting that can be changed dynamically at runtime. The for loop applies the logic, but the parameter controls how much memory to reserve; the calculation of the reserves from this ratio is the configuration policy. A worked example follows.
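A worked example with illustrative numbers: with the default ratio of 256, a low zone sitting under 16 GiB of upper-zone memory holds back 64 MiB.

#include <stdio.h>

int main(void)
{
        unsigned long upper_zone_pages = 4UL * 1024 * 1024; /* 16 GiB / 4 KiB */
        int ratio = 256;                /* sysctl_lowmem_reserve_ratio entry */
        unsigned long reserve = upper_zone_pages / ratio;

        /* 16384 pages, i.e. 64 MiB withheld from upper-zone allocations */
        printf("low zone holds back %lu pages (%lu MiB)\n",
               reserve, reserve * 4 / 1024);
        return 0;
}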
-
if (likely(!is_migrate_isolate(migratetype)))
        __mod_zone_freepage_state(zone, 1 << order, migratetype);
The decision policy checks whether the free page count is updated to reflect the allocation or deallocation of memory blocks of a given migratetype (whether the page is movable, unmovable, or reclaimable).
-
static inline bool free_page_is_bad(struct page *page)
{
        if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
                return false;

        /* Something has gone sideways, find it */
        free_page_is_bad_report(page);
        return true;
}
This policy checks the state of a page to verify it is in the expected state; if so, it returns false and the page can be freed normally. The policy decides whether to report and handle a bad page based on its state.
-
if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost &&
             ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
        mark = z->_watermark[WMARK_MIN];
        return __zone_watermark_ok(z, order, mark, highest_zoneidx,
                                   alloc_flags, free_pages);
}
This is a watermark policy that determines when to ignore the watermark boost for certain types of memory allocations. Boosting is used to temporarily raise the free memory threshold, and this policy decides when the boost should be ignored.
-
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int highest_zoneidx, unsigned int alloc_flags,
                         long free_pages)
{
        long min = mark;
        int o;

        /* free_pages may go negative - that's OK */
        free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);

        if (unlikely(alloc_flags & ALLOC_RESERVES)) {
                /*
                 * __GFP_HIGH allows access to 50% of the min reserve as well
                 * as OOM.
                 */
                if (alloc_flags & ALLOC_MIN_RESERVE) {
                        min -= min / 2;

                        /*
                         * Non-blocking allocations (e.g. GFP_ATOMIC) can
                         * access more reserves than just __GFP_HIGH. Other
                         * non-blocking allocations requests such as GFP_NOWAIT
                         * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
                         * access to the min reserve.
                         */
                        if (alloc_flags & ALLOC_NON_BLOCK)
                                min -= min / 4;
                }

                /*
                 * OOM victims can try even harder than the normal reserve
                 * users on the grounds that it's definitely going to be in
                 * the exit path shortly and free memory. Any allocation it
                 * makes during the free path will be small and short-lived.
                 */
                if (alloc_flags & ALLOC_OOM)
                        min -= min / 2;
        }

        /*
         * Check watermarks for an order-0 allocation request. If these
         * are not met, then a high-order request also cannot go ahead
         * even if a suitable page happened to be free.
         */
        if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
                return false;

        /* If this is an order-0 request then the watermark is fine */
        if (!order)
                return true;

        /* For a high-order request, check at least one suitable page is free */
        for (o = order; o < NR_PAGE_ORDERS; o++) {
                struct free_area *area = &z->free_area[o];
                int mt;

                if (!area->nr_free)
                        continue;

                for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
                        if (!free_area_empty(area, mt))
                                return true;
                }

#ifdef CONFIG_CMA
                if ((alloc_flags & ALLOC_CMA) &&
                    !free_area_empty(area, MIGRATE_CMA)) {
                        return true;
                }
#endif
                if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
                    !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
                        return true;
                }
        }
        return false;
}
Algorithmic policy for memory allocation. It checks whether there are enough free pages for an allocation request against watermark-based criteria, comparing the number of free pages in the zone to a threshold "mark". A sketch of the reserve arithmetic follows.
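The reserve relaxation at the top of the function is the subtle part; a standalone sketch with the ALLOC_* bitmask modeled as booleans:

/* Sketch of the reserve relaxation in the ALLOC_RESERVES branch above.
 * With min = 1000: __GFP_HIGH drops it to 500, a non-blocking
 * allocation further to 375, and an OOM victim further to 188. */
static long effective_min(long min, int min_reserve, int non_block, int oom)
{
        if (min_reserve) {
                min -= min / 2;         /* __GFP_HIGH: half the reserve */
                if (non_block)
                        min -= min / 4; /* GFP_ATOMIC-like: another quarter */
        }
        if (oom)
                min -= min / 2;         /* OOM victims dig deepest */
        return min;
}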
-
-
elixir.bootlin.com
-
if (prev_class) {
        if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
                pool->size_class[i] = prev_class;
                continue;
        }
}
This is an algorithmic policy. A zs_pool maintains zspages of different size_classes. However, some size_classes share exactly the same characteristics, namely pages_per_zspage and objs_per_zspage. Recall my other annotation: free zspages are looked up by size_class (zspage = find_get_zspage(class);). Thus, grouping such classes together improves memory utilization; a sketch of the merge test follows.
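A minimal sketch of the merge criterion, assuming (as in can_merge() above) that two classes are interchangeable exactly when a zspage would be carved up identically for both:

/* Two size classes can share one size_class object iff a zspage is
 * split into the same number of pages and objects for both. */
struct class_shape {
        int pages_per_zspage;
        int objs_per_zspage;
};

static int classes_mergeable(const struct class_shape *prev,
                             int pages_per_zspage, int objs_per_zspage)
{
        return prev->pages_per_zspage == pages_per_zspage &&
               prev->objs_per_zspage == objs_per_zspage;
}
-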
zspage = find_get_zspage(class);
if (likely(zspage)) {
        obj = obj_malloc(pool, zspage, handle);
        /* Now move the zspage to another fullness group, if required */
        fix_fullness_group(class, zspage);
        record_obj(handle, obj);
        class_stat_inc(class, ZS_OBJS_INUSE, 1);
        goto out;
}
This is an algorithmic policy. Instead of immediately allocating new zspages for each memory request, the algorithm first attempts to find and reuse existing partially filled zspages from a given size class by invoking the find_get_zspage(class) function. It also updates the corresponding fullness groups.
-
-
elixir.bootlin.com
-
1
This is a configuration policy that sets the timeout between retries if vmap_pages_range() fails. This could be a tunable variable.
-
if (!(flags & VM_ALLOC))
        area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
                                            KASAN_VMALLOC_PROT_NORMAL);
This is an algorithmic policy: an optimization that prevents duplicate marking of accessibility. Only pages allocated without VM_ALLOC (e.g. ioremap) were not already marked accessible (unpoisoned), thus requiring the explicit call here.
-
100U
This is a configuration policy that makes 100 pages the upper limit for the bulk allocator. However, alloc_pages_bulk_array_mempolicy does not explicitly enforce this limit in its implementation, so I believe it is an algorithmic policy related to some sort of optimization.
-
if (!order) {
This is an algorithmic policy that uses the bulk allocator only for order-0 pages (i.e. not super pages). Maybe the bulk allocator could also be applied to super pages to speed up allocation; so far I have not seen a reason why it cannot.
-
if (likely(count <= VMAP_MAX_ALLOC)) {
        mem = vb_alloc(size, GFP_KERNEL);
        if (IS_ERR(mem))
                return NULL;
        addr = (unsigned long)mem;
} else {
        struct vmap_area *va;
        va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
                             node, GFP_KERNEL, VMAP_RAM);
        if (IS_ERR(va))
                return NULL;
        addr = va->va_start;
        mem = (void *)addr;
}
This is an algorithmic policy that determines whether to use the more efficient vb_alloc, which does not involve complex virtual-to-physical mappings. Unlike the latter alloc_vmap_area, vb_alloc does not need to traverse the rb-tree of free vmap areas; it simply finds a large enough block from vmap_block_queue.
-
VMAP_PURGE_THRESHOLD
The threshold VMAP_PURGE_THRESHOLD is a configuration policy that could be tuned by machine learning. Setting this threshold lower reduces purging activity, while setting it higher reduces fragmentation.
-
if (!(force_purge
This is an algorithmic policy that prevents purging blocks with considerable amount of "usable memory" unless requested with force_purge.
-
resched_threshold = lazy_max_pages() << 1;
The assignment of resched_threshold and lines 1776-1777 are configuration policies to determine fewer than which number of lazily-freed pages it should yield CPU temporarily to higher-priority tasks.
-
log = fls(num_online_cpus());
This heuristic scales lazy_max_pages logarithmically, which is a configuration policy. Alternatively, machine learning could determine the optimal scaling function—whether linear, logarithmic, square-root, or another approach.
-
32UL * 1024 * 1024
This is a configuration policy that always returns a multiple of 32 MB worth of pages. This could be a configurable variable rather than a fixed magic number; a worked example follows.
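Putting this together with the fls()-based CPU scaling noted above, a userspace mirror of the computation (PAGE_SIZE assumed 4 KiB):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* position of the highest set bit, 1-indexed, like the kernel's fls() */
static unsigned int fls_u(unsigned int x)
{
        unsigned int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

/* userspace mirror of lazy_max_pages() */
static unsigned long lazy_max_pages(unsigned int ncpus)
{
        return fls_u(ncpus) * (32UL * 1024 * 1024 / PAGE_SIZE);
}

int main(void)
{
        /* 8 CPUs: fls(8) = 4, so 4 * 8192 = 32768 pages (128 MiB) */
        printf("%lu pages\n", lazy_max_pages(8));
        return 0;
}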
-
-
elixir.bootlin.com
-
/*
 * If the user wants hardware cache aligned objects then follow that
 * suggestion if the object is sufficiently large.
 *
 * The hardware cache alignment cannot override the specified
 * alignment though. If that is greater then use it.
 */
if (flags & SLAB_HWCACHE_ALIGN) {
        unsigned int ralign;

        ralign = cache_line_size();
        while (size <= ralign / 2)
                ralign /= 2;
        align = max(align, ralign);
}
SLAB_HWCACHE_ALIGN is supplied by the user, so the user decides whether objects should be aligned to the hardware cache line or not.
-
if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
        flags |= SLAB_NO_MERGE;
This is a configuration policy. If CONFIG_MEMCG_KMEM is enabled, disable cache merging for KMALLOC_NORMAL caches.
-
-
elixir.bootlin.com
-
size = count_history_pages(mapping, index, max);
This uses the size of contiguously cached pages in the page cache to get a "conservative" estimate of the "length of a sequential read sequence" or the "thrashing threshold in memory tight systems". LDOS could use different policies to predict the optimal readahead size.
-
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE
This chunks the read-ahead into 2 megabyte units to avoid pinning too much memory at once. LDOS could replace this with a dynamically-sized chunk to better optimize for other use cases.
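A userspace sketch of the chunking loop; submit_ra() is a hypothetical stand-in for the real I/O submission, and only the chunk arithmetic mirrors the kernel:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* hypothetical stand-in for the actual readahead submission */
static void submit_ra(unsigned long index, unsigned long nr)
{
        printf("readahead [%lu, %lu)\n", index, index + nr);
}

/* with 4 KiB pages each batch is (2 * 1024 * 1024) / 4096 = 512 pages */
static void force_ra_chunked(unsigned long index, unsigned long nr_to_read)
{
        while (nr_to_read) {
                unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
                submit_ra(index, this_chunk);
                index += this_chunk;
                nr_to_read -= this_chunk;
        }
}

int main(void)
{
        force_ra_chunked(0, 1280);      /* issued as 512 + 512 + 256 pages */
        return 0;
}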
-
-
elixir.bootlin.com
-
static bool should_skip_region(struct memblock_type *type,
                               struct memblock_region *m,
                               int nid, int flags)
{
        int m_nid = memblock_get_region_node(m);

        /* we never skip regions when iterating memblock.reserved or physmem */
        if (type != memblock_memory)
                return false;

        /* only memory regions are associated with nodes, check it */
        if (nid != NUMA_NO_NODE && nid != m_nid)
                return true;

        /* skip hotpluggable memory regions if needed */
        if (movable_node_is_enabled() && memblock_is_hotpluggable(m) &&
            !(flags & MEMBLOCK_HOTPLUG))
                return true;

        /* if we want mirror memory skip non-mirror memory regions */
        if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
                return true;

        /* skip nomap memory unless we were asked for it explicitly */
        if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
                return true;

        /* skip driver-managed memory unless we were asked for it explicitly */
        if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
                return true;

        return false;
}
This policy determines whether a memblock region should be skipped, based on several checks that incorporate various flags. You can see this policy being used in other functions on lines 1080 and 1184 in this file; these other functions appear to be sub-functions for iterators on the memblock regions.
-
if (memblock_bottom_up())
This policy controls whether the memblock allocator should allocate memory from the bottom up or from the top down.
-
- Sep 2024
-
elixir.bootlin.com
-
if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
        lruvec->file_cost /= 2;
        lruvec->anon_cost /= 2;
}
It is a configuration policy. The policy here is to adjust the cost of the current page. The cost means the overhead for the kernel to operate on the page if it is swapped out. The kernel adopts a decay policy: if the current total cost is greater than lrusize/4, the costs are divided by 2. Without this policy, after long-term running the cost of each page would be very high, and the kernel might mistakenly keep pages that were frequently visited long ago but are inactive now, causing performance degradation. The value (lrusize/4) is a trade-off between performance and sensitivity. If the value is too small, the kernel will frequently adjust the priority of each page, degrading process performance; if it is too large, the kernel might be misled by historical data, wrongly swapping out currently popular pages and again degrading performance.
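A minimal userspace sketch of the decay rule (the accounting around it is simplified here):

/* Once the recorded costs exceed a quarter of the LRU size, both are
 * halved, so old activity decays geometrically instead of accumulating
 * forever. */
static void note_cost(unsigned long *file_cost, unsigned long *anon_cost,
                      unsigned long lrusize, unsigned long nr_pages, int file)
{
        if (file)
                *file_cost += nr_pages;
        else
                *anon_cost += nr_pages;

        if (*file_cost + *anon_cost > lrusize / 4) {
                *file_cost /= 2;
                *anon_cost /= 2;
        }
}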
-
if (megs < 16)
        page_cluster = 2;
else
        page_cluster = 3;
It is a configuration policy. The policy here is to determine the size of the page cluster; "2" and "3" are determined externally to the kernel. The page cluster is the actual execution unit when swapping: on a small-memory machine the kernel swaps 4 (2^2) pages at a time, otherwise 8 (2^3). The rationale is to avoid memory pressure on small-memory systems and to improve performance on large-memory systems.
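The effect in numbers, as a userspace rendering of the same branch:

#include <stdio.h>

int main(void)
{
        unsigned long megs_small = 8, megs_large = 512;
        int pc_small = megs_small < 16 ? 2 : 3;
        int pc_large = megs_large < 16 ? 2 : 3;

        /* the swap I/O unit is 2^page_cluster pages */
        printf("small machine: %d pages per swap op\n", 1 << pc_small); /* 4 */
        printf("large machine: %d pages per swap op\n", 1 << pc_large); /* 8 */
        return 0;
}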
-
-
elixir.bootlin.com
-
mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
It is a configuration policy. It is used to count physical pages for the memory control group. The policy here is that one physical page corresponds to one count in the memory control group; if huge pages were used, the value here might not be 1.
-
schedule_timeout_uninterruptible(1);
It is a configuration policy. If the kernel cannot allocate a large enough virtual space with the required alignment while nofail disallows failure, the kernel puts the process to sleep for 1 time slot, during which the process cannot be interrupted and leaves the CPU's schedule queue. The "1" here is a configuration parameter made externally to the kernel: a compromise that is not short enough for latency-sensitive processes yet not long enough to stop frequent retries.
-
if (array_size > PAGE_SIZE) {
        area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
                                     area->caller);
} else {
        area->pages = kmalloc_node(array_size, nested_gfp, node);
}
It is an algorithmic policy and also a configuration policy. The algorithmic policy is to maximize the data locality of the virtual memory address by keeping the space in contiguous virtual pages: if the demanded size is larger than one page, the kernel allocates multiple contiguous virtual pages for it with __vmalloc_node(); otherwise it calls kmalloc_node() to place it within a single page. On the other hand, to allocate those contiguous pages, __vmalloc_node() is invoked with align = 1, which is a configuration policy.
-
if (!counters)
        return;

if (v->flags & VM_UNINITIALIZED)
        return;
It is an algorithmic policy. show_numa_info() is used for printing how the virtual memory of v (vm_struct) distributes its pages across NUMA nodes. The two if conditions filter out invalid v: the first means the virtual memory has not been associated with physical pages, and the second means the virtual memory has not been initialized yet.
-
if (page)
        copied = copy_page_to_iter_nofault(page, offset, length, iter);
else
        copied = zero_iter(iter, length);
It is an algorithmic policy. The policy here is to avoid lock/unlock overhead by using copy_page_to_iter_nofault() plus an if-else branch. Because copy_page_to_iter_nofault() relies on not taking page faults, we need to check that the physical page is still alive, so the kernel uses the if condition to guarantee the copy proceeds without a page fault. On the other hand, if the page is not valid, the kernel fills the iter with zeroes instead of returning an error. I think this makes invocation more convenient: the caller does not need to verify the return value or take further actions to handle it.
-
if (base + last_end < vmalloc_start + last_end)
        goto overflow;

/*
 * Fitting base has not been found.
 */
if (va == NULL)
        goto overflow;
It is an algorithmic policy. The two if conditions check the feasibility of the allocation. The first means the address of the allocation is beyond the allowable range, causing overflow; in this case, since the base address is searched from higher to lower, further reducing the base address cannot work (note that the base address depends on the va). The second means we did not find a valid va with an appropriate base address. So the policy here is to jump to the overflow branch to re-allocate the va list.
-
if (base + start < va->va_start) {
        va = node_to_va(rb_prev(&va->rb_node));
        base = pvm_determine_end_from_reverse(&va, align) - end;
        term_area = area;
        continue;
}
It is an algorithmic policy. It checks whether the start address of the current va is valid. The policy here is to switch to a new va, unlike the solution for the "end address check" (explained in my other annotation). The reason is that the base address is searched from high to low: if we kept reducing the base address, the condition would never become true. Moving to a va with a lower start address is the only solution.
-
if (base + end > va->va_end) {
        base = pvm_determine_end_from_reverse(&va, align) - end;
        term_area = area;
        continue;
}
It is an algorithmic policy. The outer loop is an endless loop that runs until a valid virtual memory space satisfying the requirements is found. The if condition here guarantees the space in va is large enough. If not, we continue scanning the memory space from high addresses to low addresses while satisfying the alignment requirement, specifically tuning the base address to a lower address, and then retry the verification.
-
va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
if (WARN_ON_ONCE(!va))
        continue;
It is an algorithmic policy. The for loop transfers virtual memory management from vm_struct (a linked list) to vmap_area (a tree?). Inside the loop, if we cannot allocate a new vmap_area from vmap_area_cachep (va == NULL), the program simply prints a warning and skips the current entry, instead of retrying or doing anything to guarantee integrity.
-
if (!spin_trylock(&vmap_area_lock))
        return false;
It is an algorithmic policy. vmalloc_dump_obj() is used for printing the information of a specified virtual memory object. Due to concurrency, the lock must be taken before touching the critical state. The policy here is spin_trylock: it only tries once, and if that fails the function simply returns false. I think the policy avoids long waits to improve performance.
-
-
elixir.bootlin.com
-
static inline unsigned int calc_slab_order(unsigned int size,
                                           unsigned int min_order,
                                           unsigned int max_order,
                                           unsigned int fract_leftover)
{
        unsigned int order;

        for (order = min_order; order <= max_order; order++) {
                unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
                unsigned int rem;

                rem = slab_size % size;

                if (rem <= slab_size / fract_leftover)
                        break;
        }

        return order;
}
Code that chooses how many pages to allocate for a new slab so as to minimize the space wasted by the remainder; a worked example follows.
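A userspace rendering with a worked example: for 700-byte objects and fract_leftover = 16, order 0 wastes 596 of 4096 bytes (more than 1/16), while order 1 wastes 492 of 8192 bytes (under 1/16), so order 1 is chosen.

#include <stdio.h>

#define PAGE_SIZE 4096U

static unsigned int calc_order(unsigned int size, unsigned int min_order,
                               unsigned int max_order,
                               unsigned int fract_leftover)
{
        unsigned int order;

        /* smallest order whose leftover fits within 1/fract_leftover */
        for (order = min_order; order <= max_order; order++) {
                unsigned int slab_size = PAGE_SIZE << order;

                if (slab_size % size <= slab_size / fract_leftover)
                        break;
        }
        return order;
}

int main(void)
{
        printf("order = %u\n", calc_order(700, 0, 3, 16));  /* prints 1 */
        return 0;
}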
-
-
elixir.bootlin.com
-
if (folio) {
        pgoff_t start;

        rcu_read_lock();
        start = page_cache_next_miss(ractl->mapping, index + 1, max_pages);
        rcu_read_unlock();

        if (!start || start - index > max_pages)
                return;

        ra->start = start;
        ra->size = start - index;       /* old async_size */
        ra->size += req_size;
        ra->size = get_next_ra_size(ra, max_pages);
        ra->async_size = ra->size;
        goto readit;
Read ahead policy for marked folios
-
- Aug 2024
-
elixir.bootlin.com
-
if (hugetlb_cgroup_disabled())
This is probably not an interesting policy decision for LDOS; it is a feature flag for the running OS. But if cgroups were decided by policy, then this flag would be controlled by the cgroup decision.
-
-
git.doit.wisc.edu
-
int vm_swappiness = 60;
The main parameter that controls how aggressively the system will swap anon pages vs file pages.
-
(sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2))
constants
-
MIN_NR_GENS
magic number?
-
static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
figure out how many pages to scan.
-
static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) {
Sets up the struct scan_control. Most of the values come from elsewhere, but this function seems to bring it all together.
-
-
git.doit.wisc.edu
-
Flag indicating whether KSM should run.
-
flag indicating if KSM should merge same-pages across NUMA nodes
-
KSM_ATTR(advisor_target_scan_time);
target scan time -- used by the EWA?
-
KSM_ATTR(advisor_max_pages_to_scan);
max number of pages to scan per iteration
-
KSM_ATTR(advisor_min_pages_to_scan);
min pages to scan per iteration
-
KSM_ATTR(advisor_max_cpu);
max amount of cpu per iteration of the ksmd?
-
KSM_ATTR(advisor_mode);
the mode that ksm runs in -- only two for now.
-
KSM_ATTR(smart_scan);
smart scan -- not sure what this does
-
KSM_ATTR(stable_node_chains_prune_millisecs);
millis before a page is removed from the stable tree??
-
KSM_ATTR(max_page_sharing);
what is the max number of page sharings -- I think this is how many times a single page can be shared (to limit the length of the reverse map?)
-
KSM_ATTR(use_zero_pages);
should KSM use special zero-page handling
-
KSM_ATTR(merge_across_nodes);
should ksm merge across NUMA nodes
-
KSM_ATTR(run);
flag indicating if ksm should run (0 or 1)
-
KSM_ATTR(pages_to_scan);
number of pages to scan per loop iteration
-
KSM_ATTR(sleep_millisecs);
sleep time between ksmd loop iterations
-
ksm_thread_pages_to_scan
how many pages to scan. This is what many of the other config values are trying to get at.
-
#define DEFAULT_PAGES_TO_SCAN 100
constant for the number of pages to scan
-
Constant default value for pages to scan
-
configuration for targeted scan time
-
max pages to scan during a single pass of the KSM loop
-
min number of pages to scan during a KSM loop
-
config for how much cpu to consume.
-
choose the mode to run -- either using EWA or just some fixed values.
-
function to compute an EWA for the number of pages to scan. Uses many of the config parameters from sysfs
-
configuration for how long before a page is pruned from the stable tree
-
configuration for max page sharing -- not quite sure what this is doing
-
Configuration to decide if KSM should use special handling for pages filled with zeros.
-
Configuration for the number of pages to scan with each run
-
Configuration for how often the thread should run
-
- Jul 2024
-
git.doit.wisc.edu
-
static void scan_time_advisor(void)
This function calculates the pages to scan based on other metrics such as CPU consumed. Seems like a good place for a learned algorithm.
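The smoothing at its core is a plain exponentially weighted moving average; a sketch (the 30% weight is an assumption, check EWMA_WEIGHT in mm/ksm.c for the real constant):

/* Exponentially weighted moving average: new observations get
 * EWMA_WEIGHT percent of the weight, history keeps the rest. */
#define EWMA_WEIGHT 30  /* assumed; see EWMA_WEIGHT in mm/ksm.c */

static unsigned long ewma(unsigned long prev, unsigned long curr)
{
        return (EWMA_WEIGHT * curr + (100 - EWMA_WEIGHT) * prev) / 100;
}

As I read it, the advisor feeds smoothed scan times through this and then scales pages_to_scan toward the target scan time, clamped by the min/max and CPU sysfs knobs above.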
-
-
-
numa_migrate_preferred(p);
Thread migration from auto-NUMA Balancing; Potential source of conflicts against CPU Load Balancer's decisions
-
- Jun 2024
-
git.doit.wisc.edu
-
/*
 * The larger the object size is, the more slabs we want on the partial
 * list to avoid pounding the page allocator excessively.
 */
s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
A policy decision about how often we may have to go to the page allocator.
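Concretely, min_partial is ilog2(size)/2 clamped to [MIN_PARTIAL, MAX_PARTIAL]; a userspace check (the bounds 5 and 10 are assumed from mm/slub.c):

#include <stdio.h>

#define MIN_PARTIAL 5   /* assumed from mm/slub.c */
#define MAX_PARTIAL 10  /* assumed from mm/slub.c */

static unsigned int ilog2_u(unsigned long x)
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned long sizes[] = { 32, 256, 4096, 65536 };

        /* prints 5, 5, 6, 8: only large-object caches earn longer lists */
        for (int i = 0; i < 4; i++) {
                unsigned long mp = ilog2_u(sizes[i]) / 2;

                if (mp > MAX_PARTIAL)
                        mp = MAX_PARTIAL;
                if (mp < MIN_PARTIAL)
                        mp = MIN_PARTIAL;
                printf("object size %5lu -> min_partial %lu\n", sizes[i], mp);
        }
        return 0;
}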
-
/*
 * calculate_sizes() determines the order and the distribution of data within
 * a slab object.
 */
static int calculate_sizes(struct kmem_cache *s)
{
computes several values for the allocator based on the size and flags of the cache being created.
-
#ifndef CONFIG_SLUB_TINY
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
Depending on CONFIG_SLUB_TINY: should there be an active slab for each CPU?
-
static inline int calculate_order(unsigned int size)
{
        unsigned int order;
        unsigned int min_objects;
        unsigned int max_objects;
        unsigned int min_order;

        min_objects = slub_min_objects;
        if (!min_objects) {
calculate the order (power of two number of pages) that each slab in this allocator should have.
-
max_objects = max(order_objects(slub_max_order, size), 1U);
min_objects = min(min_objects, max_objects);

min_order = max_t(unsigned int, slub_min_order,
                  get_order(min_objects * size));
if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
        return get_order(size * MAX_OBJS_PER_PAGE) - 1;
Policy calculation
-
s->flags & __OBJECT_POISON
apply policy
-
s->flags & SLAB_RED_ZONE)
debug policy
-
slab, slab->objects, slab->inuse, slab->freelist, folio_flags(folio, 0)
possible features
-
if (s->flags & __CMPXCHG_DOUBLE) {
Config fastpath?
-
if (s->flags & __CMPXCHG_DOUBLE) {
Fast path config
-
#ifdef CONFIG_SLUB_CPU_PARTIAL
Conditional code
-
#ifndef CONFIG_SLUB_TINY
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
        prefetchw(object + s->offset);
}
#endif
conditional code
-
ALLOC_FASTPATH,         /* Allocation from cpu slab */
ALLOC_SLOWPATH,         /* Allocation by getting a new cpu slab */
FREE_FASTPATH,          /* Free to cpu slab */
FREE_SLOWPATH,          /* Freeing not to cpu slab */
FREE_FROZEN,            /* Freeing to frozen slab */
FREE_ADD_PARTIAL,       /* Freeing moves slab to partial list */
FREE_REMOVE_PARTIAL,    /* Freeing removes last object */
ALLOC_FROM_PARTIAL,     /* Cpu slab acquired from node partial list */
ALLOC_SLAB,             /* Cpu slab acquired from page allocator */
ALLOC_REFILL,           /* Refill cpu slab from slab freelist */
ALLOC_NODE_MISMATCH,    /* Switching cpu slab */
FREE_SLAB,              /* Slab freed to the page allocator */
CPUSLAB_FLUSH,          /* Abandoning of the cpu slab */
DEACTIVATE_FULL,        /* Cpu slab was full when deactivated */
DEACTIVATE_EMPTY,       /* Cpu slab was empty when deactivated */
DEACTIVATE_TO_HEAD,     /* Cpu slab was moved to the head of partials */
DEACTIVATE_TO_TAIL,     /* Cpu slab was moved to the tail of partials */
DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
DEACTIVATE_BYPASS,      /* Implicit deactivation */
ORDER_FALLBACK,         /* Number of times fallback was necessary */
CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
CMPXCHG_DOUBLE_FAIL,    /* Failures of slab freelist update */
CPU_PARTIAL_ALLOC,      /* Used cpu partial on alloc */
CPU_PARTIAL_FREE,       /* Refill cpu partial on free */
CPU_PARTIAL_NODE,       /* Refill cpu partial from node partial */
CPU_PARTIAL_DRAIN,      /* Drain cpu partial to node partial */
NR_SLUB_STAT_ITEMS
Possible features
-
Should the allocator do consistency checks?
-
why slow path vs fast path? policy?
-
is there a policy decision here? Why would we choose to use lockless vs. not?
-
set the number of slabs per cpu
-
-
git.doit.wisc.edu
-
int calculate_normal_threshold(struct zone *zone)
{
        int threshold;
        int mem;        /* memory in 128 MB units */

        /*
         * The threshold scales with the number of processors and the amount
         * of memory per zone. More memory means that we can defer updates for
         * longer, more processors could lead to more contention.
         * fls() is used to have a cheap way of logarithmic scaling.
         *
         * Some sample thresholds:
         *
         * Threshold    Processors      (fls)   Zonesize        fls(mem)+1
         * ------------------------------------------------------------------
         * 8            1               1       0.9-1 GB        4
         * 16           2               2       0.9-1 GB        4
         * 20           2               2       1-2 GB          5
         * 24           2               2       2-4 GB          6
         * 28           2               2       4-8 GB          7
         * 32           2               2       8-16 GB         8
         * 4            2               2       <128M           1
         * 30           4               3       2-4 GB          5
         * 48           4               3       8-16 GB         8
         * 32           8               4       1-2 GB          4
         * 32           8               4       0.9-1GB         4
         * 10           16              5       <128M           1
         * 40           16              5       900M            4
         * 70           64              7       2-4 GB          5
         * 84           64              7       4-8 GB          6
         * 108          512             9       4-8 GB          6
         * 125          1024            10      8-16 GB         8
         * 125          1024            10      16-32 GB        9
         */

        mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);

        threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

        /*
         * Maximum threshold is 125
         */
        threshold = min(125, threshold);

        return threshold;
}
A "magic" formula for computing the per-zone vmstat update threshold from the number of CPUs and the amount of memory in the zone.
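The formula is easy to sanity-check against the sample table embedded in the comment:

#include <stdio.h>

static unsigned int fls_u(unsigned long x)
{
        unsigned int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

int main(void)
{
        /* 2 CPUs, ~1.5 GB zone = 12 units of 128 MB:
         * 2 * fls(2) * (1 + fls(12)) = 2 * 2 * 5 = 20, matching the table */
        unsigned long cpus = 2, mem_128mb_units = 12;
        unsigned int threshold = 2 * fls_u(cpus) * (1 + fls_u(mem_128mb_units));

        if (threshold > 125)
                threshold = 125;
        printf("threshold = %u\n", threshold);
        return 0;
}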
-
-
git.doit.wisc.edu
-
khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
A bunch of "magic" numbers for khugepaged; concrete values below.
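For scale, on a typical x86-64 configuration these work out as follows (HPAGE_PMD_NR = 512 is an assumption; it varies by architecture and page size):

#include <stdio.h>

#define HPAGE_PMD_NR 512  /* assumed: 2 MiB huge page / 4 KiB base page */

int main(void)
{
        printf("pages_to_scan   = %d\n", HPAGE_PMD_NR * 8); /* 4096 */
        printf("max_ptes_none   = %d\n", HPAGE_PMD_NR - 1); /* 511 */
        printf("max_ptes_swap   = %d\n", HPAGE_PMD_NR / 8); /* 64 */
        printf("max_ptes_shared = %d\n", HPAGE_PMD_NR / 2); /* 256 */
        return 0;
}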
-
- May 2024
-
git.doit.wisc.edu
-
// SPDX-License-Identifier: GPL-2.0-only
Debug only
-
-
git.doit.wisc.edu
-
// SPDX-License-Identifier: GPL-2.0
Debug only
-
-
git.doit.wisc.edu
-
// SPDX-License-Identifier: GPL-2.0
Debug only
-
-
git.doit.wisc.edu
-
mm/debug.c
Debug only - no policy
-
-
git.doit.wisc.edu
-
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
debugfs_create_file("base_pfn", 0444, tmp, &cma->base_pfn, &cma_debugfs_fops);
debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops);
debugfs_create_file("order_per_bit", 0444, tmp, &cma->order_per_bit, &cma_debugfs_fops);
debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops);
debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops);

cma->dfs_bitmap.array = (u32 *)cma->bitmap;
cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma),
                                          BITS_PER_BYTE * sizeof(u32));
feature data
-
-
git.doit.wisc.edu
-
extern struct cma cma_areas[MAX_CMA_AREAS];
policy use?
-
-
git.doit.wisc.edu
-
count_vm_event(CMA_ALLOC_FAIL);
Feature data
-
cma_sysfs_account_success_pages(cma, count)
Feature data
-
ARRAY_SIZE(cma_areas)
Policy use - cma_areas size
-
ARRAY_SIZE(cma_areas)
Policy use - cma_areas size
-
cma_bitmap_maxno(cma)
policy application -- max number of bitmap bits
-
-
git.doit.wisc.edu
-
mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
Policy application: choosing map size for page usage?
-
-
git.doit.wisc.edu
-
if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) && PageIsolated(page)) {
        /* raced with isolation */
        unlock_page(page);
        continue;
}
Config enabled code
-
-
git.doit.wisc.edu
-
bdi->dev = NULL;

kref_init(&bdi->refcnt);
bdi->min_ratio = 0;
bdi->max_ratio = 100 * BDI_RATIO_SCALE;
bdi->max_prop_frac = FPROP_FRAC_BASE;
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
bdi->last_bdp_sleep = jiffies;
Initial config settings
-
static struct attribute *bdi_dev_attrs[] = {
        &dev_attr_read_ahead_kb.attr,
        &dev_attr_min_ratio.attr,
        &dev_attr_min_ratio_fine.attr,
        &dev_attr_max_ratio.attr,
        &dev_attr_max_ratio_fine.attr,
        &dev_attr_min_bytes.attr,
        &dev_attr_max_bytes.attr,
        &dev_attr_stable_pages_required.attr,
        &dev_attr_strict_limit.attr,
        NULL,
};
Policy parameter data structure
-
static ssize_t strict_limit_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int strict_limit;
        ssize_t ret;

        ret = kstrtouint(buf, 10, &strict_limit);
        if (ret < 0)
                return ret;

        ret = bdi_set_strict_limit(bdi, strict_limit);
        if (!ret)
                ret = count;

        return ret;
}
Policy function to set parameters
-
static ssize_t max_bytes_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 bytes;
        ssize_t ret;

        ret = kstrtoull(buf, 10, &bytes);
        if (ret < 0)
                return ret;

        ret = bdi_set_max_bytes(bdi, bytes);
        if (!ret)
                ret = count;

        return ret;
}
Policy function to set parameters
-
static ssize_t min_bytes_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 bytes;
        ssize_t ret;

        ret = kstrtoull(buf, 10, &bytes);
        if (ret < 0)
                return ret;

        ret = bdi_set_min_bytes(bdi, bytes);
        if (!ret)
                ret = count;

        return ret;
}
Policy function to set parameters
-
static ssize_t max_ratio_fine_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int ratio;
        ssize_t ret;

        ret = kstrtouint(buf, 10, &ratio);
        if (ret < 0)
                return ret;

        ret = bdi_set_max_ratio_no_scale(bdi, ratio);
        if (!ret)
                ret = count;

        return ret;
}
Policy function to set parameters
-
static ssize_t max_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int ratio;
        ssize_t ret;

        ret = kstrtouint(buf, 10, &ratio);
        if (ret < 0)
                return ret;

        ret = bdi_set_max_ratio(bdi, ratio);
        if (!ret)
                ret = count;

        return ret;
}
Policy function to set parameters
-
static ssize_t min_ratio_fine_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int ratio;
        ssize_t ret;

        ret = kstrtouint(buf, 10, &ratio);
        if (ret < 0)
                return ret;

        ret = bdi_set_min_ratio_no_scale(bdi, ratio);
        if (!ret)
                ret = count;

        return ret;
}
Policy function to set parameters
-
unsigned long nr_dirty;
unsigned long nr_io;
unsigned long nr_more_io;
unsigned long nr_dirty_time;
unsigned long nr_writeback;
unsigned long nr_reclaimable;
unsigned long nr_dirtied;
unsigned long nr_written;
unsigned long dirty_thresh;
unsigned long wb_thresh;
Possible features
-