1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/slab.h>
3 #include <linux/lockdep.h>
4 #include <linux/sysfs.h>
5 #include <linux/kobject.h>
6 #include <linux/memory.h>
7 #include <linux/memory-tiers.h>
8 
9 #include "internal.h"
10 
/*
 * One memory tier: the set of memory types whose abstract distance,
 * rounded down to a MEMTIER_CHUNK_SIZE boundary, equals adistance_start.
 */
struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance  range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	/*
	 * Device registered on the memory tier bus. Its release callback
	 * frees the whole memory_tier.
	 */
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};
26 
/* Per-node demotion state; one entry per node id in node_demotion[]. */
struct demotion_nodes {
	/* Candidate demotion targets; one of them is picked at random. */
	nodemask_t preferred;
};
30 
/* Per-node record of the memory type assigned to that node. */
struct node_memory_type_map {
	/* Memory type currently assigned to the node, NULL when none is set. */
	struct memory_dev_type *memtype;
	/* Number of outstanding registrations of @memtype for this node. */
	int map_count;
};
35 
/* Serializes updates to the tier list and to node_memory_types[]. */
static DEFINE_MUTEX(memory_tier_lock);
/* All memory tiers, kept in ascending adistance_start order. */
static LIST_HEAD(memory_tiers);
/* Memory type assigned to each node, indexed by node id. */
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
/* Type assigned to nodes that have no memory type when they are set up. */
static struct memory_dev_type *default_dram_type;

/* Bus on which every memory tier device is registered. */
static struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};
45 
46 #ifdef CONFIG_MIGRATION
/*
 * Tiers whose adistance_start is at or below this value are reported as
 * top tier by node_is_toptier(). Recomputed by establish_demotion_targets().
 */
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
106 #endif /* CONFIG_MIGRATION */
107 
to_memory_tier(struct device * device)108 static inline struct memory_tier *to_memory_tier(struct device *device)
109 {
110 	return container_of(device, struct memory_tier, dev);
111 }
112 
get_memtier_nodemask(struct memory_tier * memtier)113 static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
114 {
115 	nodemask_t nodes = NODE_MASK_NONE;
116 	struct memory_dev_type *memtype;
117 
118 	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
119 		nodes_or(nodes, nodes, memtype->nodes);
120 
121 	return nodes;
122 }
123 
/* Device release callback: free the memory tier that embeds @dev. */
static void memory_tier_device_release(struct device *dev)
{
	/*
	 * clear_node_memory_tier() calls synchronize_rcu() before a tier
	 * is destroyed, so no RCU reader can still be using this tier and
	 * a plain kfree() is sufficient.
	 */
	kfree(to_memory_tier(dev));
}
133 
nodelist_show(struct device * dev,struct device_attribute * attr,char * buf)134 static ssize_t nodelist_show(struct device *dev,
135 			     struct device_attribute *attr, char *buf)
136 {
137 	int ret;
138 	nodemask_t nmask;
139 
140 	mutex_lock(&memory_tier_lock);
141 	nmask = get_memtier_nodemask(to_memory_tier(dev));
142 	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
143 	mutex_unlock(&memory_tier_lock);
144 	return ret;
145 }
/* Read-only "nodelist" device attribute backed by nodelist_show(). */
static DEVICE_ATTR_RO(nodelist);

/* NULL-terminated list of attributes exposed by each memory tier device. */
static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

/* NULL-terminated group list installed as dev.groups on new tier devices. */
static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};
161 
find_create_memory_tier(struct memory_dev_type * memtype)162 static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
163 {
164 	int ret;
165 	bool found_slot = false;
166 	struct memory_tier *memtier, *new_memtier;
167 	int adistance = memtype->adistance;
168 	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
169 
170 	lockdep_assert_held_once(&memory_tier_lock);
171 
172 	adistance = round_down(adistance, memtier_adistance_chunk_size);
173 	/*
174 	 * If the memtype is already part of a memory tier,
175 	 * just return that.
176 	 */
177 	if (!list_empty(&memtype->tier_sibiling)) {
178 		list_for_each_entry(memtier, &memory_tiers, list) {
179 			if (adistance == memtier->adistance_start)
180 				return memtier;
181 		}
182 		WARN_ON(1);
183 		return ERR_PTR(-EINVAL);
184 	}
185 
186 	list_for_each_entry(memtier, &memory_tiers, list) {
187 		if (adistance == memtier->adistance_start) {
188 			goto link_memtype;
189 		} else if (adistance < memtier->adistance_start) {
190 			found_slot = true;
191 			break;
192 		}
193 	}
194 
195 	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
196 	if (!new_memtier)
197 		return ERR_PTR(-ENOMEM);
198 
199 	new_memtier->adistance_start = adistance;
200 	INIT_LIST_HEAD(&new_memtier->list);
201 	INIT_LIST_HEAD(&new_memtier->memory_types);
202 	if (found_slot)
203 		list_add_tail(&new_memtier->list, &memtier->list);
204 	else
205 		list_add_tail(&new_memtier->list, &memory_tiers);
206 
207 	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
208 	new_memtier->dev.bus = &memory_tier_subsys;
209 	new_memtier->dev.release = memory_tier_device_release;
210 	new_memtier->dev.groups = memtier_dev_groups;
211 
212 	ret = device_register(&new_memtier->dev);
213 	if (ret) {
214 		list_del(&memtier->list);
215 		put_device(&memtier->dev);
216 		return ERR_PTR(ret);
217 	}
218 	memtier = new_memtier;
219 
220 link_memtype:
221 	list_add(&memtype->tier_sibiling, &memtier->memory_types);
222 	return memtier;
223 }
224 
__node_get_memory_tier(int node)225 static struct memory_tier *__node_get_memory_tier(int node)
226 {
227 	pg_data_t *pgdat;
228 
229 	pgdat = NODE_DATA(node);
230 	if (!pgdat)
231 		return NULL;
232 	/*
233 	 * Since we hold memory_tier_lock, we can avoid
234 	 * RCU read locks when accessing the details. No
235 	 * parallel updates are possible here.
236 	 */
237 	return rcu_dereference_check(pgdat->memtier,
238 				     lockdep_is_held(&memory_tier_lock));
239 }
240 
241 #ifdef CONFIG_MIGRATION
node_is_toptier(int node)242 bool node_is_toptier(int node)
243 {
244 	bool toptier;
245 	pg_data_t *pgdat;
246 	struct memory_tier *memtier;
247 
248 	pgdat = NODE_DATA(node);
249 	if (!pgdat)
250 		return false;
251 
252 	rcu_read_lock();
253 	memtier = rcu_dereference(pgdat->memtier);
254 	if (!memtier) {
255 		toptier = true;
256 		goto out;
257 	}
258 	if (memtier->adistance_start <= top_tier_adistance)
259 		toptier = true;
260 	else
261 		toptier = false;
262 out:
263 	rcu_read_unlock();
264 	return toptier;
265 }
266 
/*
 * Store in @targets the nodes of all tiers below the tier of @pgdat, or
 * an empty mask when @pgdat has no memory tier.
 */
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * Updates of pg_data_t.memtier include a synchronize_rcu(), so
	 * inside an RCU read-side section we see either NULL or a tier
	 * that is still valid.
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier)
		*targets = NODE_MASK_NONE;
	else
		*targets = memtier->lower_tier_mask;
	rcu_read_unlock();
}
284 
285 /**
286  * next_demotion_node() - Get the next node in the demotion path
287  * @node: The starting node to lookup the next node
288  *
289  * Return: node id for next memory node in the demotion path hierarchy
290  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
291  * @node online or guarantee that it *continues* to be the next demotion
292  * target.
293  */
next_demotion_node(int node)294 int next_demotion_node(int node)
295 {
296 	struct demotion_nodes *nd;
297 	int target;
298 
299 	if (!node_demotion)
300 		return NUMA_NO_NODE;
301 
302 	nd = &node_demotion[node];
303 
304 	/*
305 	 * node_demotion[] is updated without excluding this
306 	 * function from running.
307 	 *
308 	 * Make sure to use RCU over entire code blocks if
309 	 * node_demotion[] reads need to be consistent.
310 	 */
311 	rcu_read_lock();
312 	/*
313 	 * If there are multiple target nodes, just select one
314 	 * target node randomly.
315 	 *
316 	 * In addition, we can also use round-robin to select
317 	 * target node, but we should introduce another variable
318 	 * for node_demotion[] to record last selected target node,
319 	 * that may cause cache ping-pong due to the changing of
320 	 * last target node. Or introducing per-cpu data to avoid
321 	 * caching issue, which seems more complicated. So selecting
322 	 * target node randomly seems better until now.
323 	 */
324 	target = node_random(&nd->preferred);
325 	rcu_read_unlock();
326 
327 	return target;
328 }
329 
disable_all_demotion_targets(void)330 static void disable_all_demotion_targets(void)
331 {
332 	struct memory_tier *memtier;
333 	int node;
334 
335 	for_each_node_state(node, N_MEMORY) {
336 		node_demotion[node].preferred = NODE_MASK_NONE;
337 		/*
338 		 * We are holding memory_tier_lock, it is safe
339 		 * to access pgda->memtier.
340 		 */
341 		memtier = __node_get_memory_tier(node);
342 		if (memtier)
343 			memtier->lower_tier_mask = NODE_MASK_NONE;
344 	}
345 	/*
346 	 * Ensure that the "disable" is visible across the system.
347 	 * Readers will see either a combination of before+disable
348 	 * state or disable+after.  They will never see before and
349 	 * after state together.
350 	 */
351 	synchronize_rcu();
352 }
353 
354 /*
355  * Find an automatic demotion target for all memory
356  * nodes. Failing here is OK.  It might just indicate
357  * being at the end of a chain.
358  */
establish_demotion_targets(void)359 static void establish_demotion_targets(void)
360 {
361 	struct memory_tier *memtier;
362 	struct demotion_nodes *nd;
363 	int target = NUMA_NO_NODE, node;
364 	int distance, best_distance;
365 	nodemask_t tier_nodes, lower_tier;
366 
367 	lockdep_assert_held_once(&memory_tier_lock);
368 
369 	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
370 		return;
371 
372 	disable_all_demotion_targets();
373 
374 	for_each_node_state(node, N_MEMORY) {
375 		best_distance = -1;
376 		nd = &node_demotion[node];
377 
378 		memtier = __node_get_memory_tier(node);
379 		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
380 			continue;
381 		/*
382 		 * Get the lower memtier to find the  demotion node list.
383 		 */
384 		memtier = list_next_entry(memtier, list);
385 		tier_nodes = get_memtier_nodemask(memtier);
386 		/*
387 		 * find_next_best_node, use 'used' nodemask as a skip list.
388 		 * Add all memory nodes except the selected memory tier
389 		 * nodelist to skip list so that we find the best node from the
390 		 * memtier nodelist.
391 		 */
392 		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
393 
394 		/*
395 		 * Find all the nodes in the memory tier node list of same best distance.
396 		 * add them to the preferred mask. We randomly select between nodes
397 		 * in the preferred mask when allocating pages during demotion.
398 		 */
399 		do {
400 			target = find_next_best_node(node, &tier_nodes);
401 			if (target == NUMA_NO_NODE)
402 				break;
403 
404 			distance = node_distance(node, target);
405 			if (distance == best_distance || best_distance == -1) {
406 				best_distance = distance;
407 				node_set(target, nd->preferred);
408 			} else {
409 				break;
410 			}
411 		} while (1);
412 	}
413 	/*
414 	 * Promotion is allowed from a memory tier to higher
415 	 * memory tier only if the memory tier doesn't include
416 	 * compute. We want to skip promotion from a memory tier,
417 	 * if any node that is part of the memory tier have CPUs.
418 	 * Once we detect such a memory tier, we consider that tier
419 	 * as top tiper from which promotion is not allowed.
420 	 */
421 	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
422 		tier_nodes = get_memtier_nodemask(memtier);
423 		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
424 		if (!nodes_empty(tier_nodes)) {
425 			/*
426 			 * abstract distance below the max value of this memtier
427 			 * is considered toptier.
428 			 */
429 			top_tier_adistance = memtier->adistance_start +
430 						MEMTIER_CHUNK_SIZE - 1;
431 			break;
432 		}
433 	}
434 	/*
435 	 * Now build the lower_tier mask for each node collecting node mask from
436 	 * all memory tier below it. This allows us to fallback demotion page
437 	 * allocation to a set of nodes that is closer the above selected
438 	 * perferred node.
439 	 */
440 	lower_tier = node_states[N_MEMORY];
441 	list_for_each_entry(memtier, &memory_tiers, list) {
442 		/*
443 		 * Keep removing current tier from lower_tier nodes,
444 		 * This will remove all nodes in current and above
445 		 * memory tier from the lower_tier mask.
446 		 */
447 		tier_nodes = get_memtier_nodemask(memtier);
448 		nodes_andnot(lower_tier, lower_tier, tier_nodes);
449 		memtier->lower_tier_mask = lower_tier;
450 	}
451 }
452 
453 #else
/* Without CONFIG_MIGRATION there are no demotion targets to clear. */
static inline void disable_all_demotion_targets(void)
{
}
/* Without CONFIG_MIGRATION there are no demotion targets to compute. */
static inline void establish_demotion_targets(void)
{
}
456 #endif /* CONFIG_MIGRATION */
457 
__init_node_memory_type(int node,struct memory_dev_type * memtype)458 static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
459 {
460 	if (!node_memory_types[node].memtype)
461 		node_memory_types[node].memtype = memtype;
462 	/*
463 	 * for each device getting added in the same NUMA node
464 	 * with this specific memtype, bump the map count. We
465 	 * Only take memtype device reference once, so that
466 	 * changing a node memtype can be done by droping the
467 	 * only reference count taken here.
468 	 */
469 
470 	if (node_memory_types[node].memtype == memtype) {
471 		if (!node_memory_types[node].map_count++)
472 			kref_get(&memtype->kref);
473 	}
474 }
475 
set_node_memory_tier(int node)476 static struct memory_tier *set_node_memory_tier(int node)
477 {
478 	struct memory_tier *memtier;
479 	struct memory_dev_type *memtype;
480 	pg_data_t *pgdat = NODE_DATA(node);
481 
482 
483 	lockdep_assert_held_once(&memory_tier_lock);
484 
485 	if (!node_state(node, N_MEMORY))
486 		return ERR_PTR(-EINVAL);
487 
488 	__init_node_memory_type(node, default_dram_type);
489 
490 	memtype = node_memory_types[node].memtype;
491 	node_set(node, memtype->nodes);
492 	memtier = find_create_memory_tier(memtype);
493 	if (!IS_ERR(memtier))
494 		rcu_assign_pointer(pgdat->memtier, memtier);
495 	return memtier;
496 }
497 
destroy_memory_tier(struct memory_tier * memtier)498 static void destroy_memory_tier(struct memory_tier *memtier)
499 {
500 	list_del(&memtier->list);
501 	device_unregister(&memtier->dev);
502 }
503 
clear_node_memory_tier(int node)504 static bool clear_node_memory_tier(int node)
505 {
506 	bool cleared = false;
507 	pg_data_t *pgdat;
508 	struct memory_tier *memtier;
509 
510 	pgdat = NODE_DATA(node);
511 	if (!pgdat)
512 		return false;
513 
514 	/*
515 	 * Make sure that anybody looking at NODE_DATA who finds
516 	 * a valid memtier finds memory_dev_types with nodes still
517 	 * linked to the memtier. We achieve this by waiting for
518 	 * rcu read section to finish using synchronize_rcu.
519 	 * This also enables us to free the destroyed memory tier
520 	 * with kfree instead of kfree_rcu
521 	 */
522 	memtier = __node_get_memory_tier(node);
523 	if (memtier) {
524 		struct memory_dev_type *memtype;
525 
526 		rcu_assign_pointer(pgdat->memtier, NULL);
527 		synchronize_rcu();
528 		memtype = node_memory_types[node].memtype;
529 		node_clear(node, memtype->nodes);
530 		if (nodes_empty(memtype->nodes)) {
531 			list_del_init(&memtype->tier_sibiling);
532 			if (list_empty(&memtier->memory_types))
533 				destroy_memory_tier(memtier);
534 		}
535 		cleared = true;
536 	}
537 	return cleared;
538 }
539 
release_memtype(struct kref * kref)540 static void release_memtype(struct kref *kref)
541 {
542 	struct memory_dev_type *memtype;
543 
544 	memtype = container_of(kref, struct memory_dev_type, kref);
545 	kfree(memtype);
546 }
547 
alloc_memory_type(int adistance)548 struct memory_dev_type *alloc_memory_type(int adistance)
549 {
550 	struct memory_dev_type *memtype;
551 
552 	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
553 	if (!memtype)
554 		return ERR_PTR(-ENOMEM);
555 
556 	memtype->adistance = adistance;
557 	INIT_LIST_HEAD(&memtype->tier_sibiling);
558 	memtype->nodes  = NODE_MASK_NONE;
559 	kref_init(&memtype->kref);
560 	return memtype;
561 }
562 EXPORT_SYMBOL_GPL(alloc_memory_type);
563 
destroy_memory_type(struct memory_dev_type * memtype)564 void destroy_memory_type(struct memory_dev_type *memtype)
565 {
566 	kref_put(&memtype->kref, release_memtype);
567 }
568 EXPORT_SYMBOL_GPL(destroy_memory_type);
569 
init_node_memory_type(int node,struct memory_dev_type * memtype)570 void init_node_memory_type(int node, struct memory_dev_type *memtype)
571 {
572 
573 	mutex_lock(&memory_tier_lock);
574 	__init_node_memory_type(node, memtype);
575 	mutex_unlock(&memory_tier_lock);
576 }
577 EXPORT_SYMBOL_GPL(init_node_memory_type);
578 
clear_node_memory_type(int node,struct memory_dev_type * memtype)579 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
580 {
581 	mutex_lock(&memory_tier_lock);
582 	if (node_memory_types[node].memtype == memtype)
583 		node_memory_types[node].map_count--;
584 	/*
585 	 * If we umapped all the attached devices to this node,
586 	 * clear the node memory type.
587 	 */
588 	if (!node_memory_types[node].map_count) {
589 		node_memory_types[node].memtype = NULL;
590 		kref_put(&memtype->kref, release_memtype);
591 	}
592 	mutex_unlock(&memory_tier_lock);
593 }
594 EXPORT_SYMBOL_GPL(clear_node_memory_type);
595 
memtier_hotplug_callback(struct notifier_block * self,unsigned long action,void * _arg)596 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
597 					      unsigned long action, void *_arg)
598 {
599 	struct memory_tier *memtier;
600 	struct memory_notify *arg = _arg;
601 
602 	/*
603 	 * Only update the node migration order when a node is
604 	 * changing status, like online->offline.
605 	 */
606 	if (arg->status_change_nid < 0)
607 		return notifier_from_errno(0);
608 
609 	switch (action) {
610 	case MEM_OFFLINE:
611 		mutex_lock(&memory_tier_lock);
612 		if (clear_node_memory_tier(arg->status_change_nid))
613 			establish_demotion_targets();
614 		mutex_unlock(&memory_tier_lock);
615 		break;
616 	case MEM_ONLINE:
617 		mutex_lock(&memory_tier_lock);
618 		memtier = set_node_memory_tier(arg->status_change_nid);
619 		if (!IS_ERR(memtier))
620 			establish_demotion_targets();
621 		mutex_unlock(&memory_tier_lock);
622 		break;
623 	}
624 
625 	return notifier_from_errno(0);
626 }
627 
memory_tier_init(void)628 static int __init memory_tier_init(void)
629 {
630 	int ret, node;
631 	struct memory_tier *memtier;
632 
633 	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
634 	if (ret)
635 		panic("%s() failed to register memory tier subsystem\n", __func__);
636 
637 #ifdef CONFIG_MIGRATION
638 	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
639 				GFP_KERNEL);
640 	WARN_ON(!node_demotion);
641 #endif
642 	mutex_lock(&memory_tier_lock);
643 	/*
644 	 * For now we can have 4 faster memory tiers with smaller adistance
645 	 * than default DRAM tier.
646 	 */
647 	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
648 	if (!default_dram_type)
649 		panic("%s() failed to allocate default DRAM tier\n", __func__);
650 
651 	/*
652 	 * Look at all the existing N_MEMORY nodes and add them to
653 	 * default memory tier or to a tier if we already have memory
654 	 * types assigned.
655 	 */
656 	for_each_node_state(node, N_MEMORY) {
657 		memtier = set_node_memory_tier(node);
658 		if (IS_ERR(memtier))
659 			/*
660 			 * Continue with memtiers we are able to setup
661 			 */
662 			break;
663 	}
664 	establish_demotion_targets();
665 	mutex_unlock(&memory_tier_lock);
666 
667 	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
668 	return 0;
669 }
670 subsys_initcall(memory_tier_init);
671 
/* Demotion on/off switch; read and written through the sysfs handlers below. */
bool numa_demotion_enabled = false;
673 
674 #ifdef CONFIG_MIGRATION
675 #ifdef CONFIG_SYSFS
numa_demotion_enabled_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)676 static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
677 					  struct kobj_attribute *attr, char *buf)
678 {
679 	return sysfs_emit(buf, "%s\n",
680 			  numa_demotion_enabled ? "true" : "false");
681 }
682 
numa_demotion_enabled_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)683 static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
684 					   struct kobj_attribute *attr,
685 					   const char *buf, size_t count)
686 {
687 	ssize_t ret;
688 
689 	ret = kstrtobool(buf, &numa_demotion_enabled);
690 	if (ret)
691 		return ret;
692 
693 	return count;
694 }
695 
/* "demotion_enabled" attribute, mode 0644, backed by the handlers above. */
static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
	       numa_demotion_enabled_store);

/* NULL-terminated attribute list for the "numa" kobject. */
static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};
708 
numa_init_sysfs(void)709 static int __init numa_init_sysfs(void)
710 {
711 	int err;
712 	struct kobject *numa_kobj;
713 
714 	numa_kobj = kobject_create_and_add("numa", mm_kobj);
715 	if (!numa_kobj) {
716 		pr_err("failed to create numa kobject\n");
717 		return -ENOMEM;
718 	}
719 	err = sysfs_create_group(numa_kobj, &numa_attr_group);
720 	if (err) {
721 		pr_err("failed to register numa group\n");
722 		goto delete_obj;
723 	}
724 	return 0;
725 
726 delete_obj:
727 	kobject_put(numa_kobj);
728 	return err;
729 }
730 subsys_initcall(numa_init_sysfs);
731 #endif /* CONFIG_SYSFS */
732 #endif
733