1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/slab.h>
3 #include <linux/lockdep.h>
4 #include <linux/sysfs.h>
5 #include <linux/kobject.h>
6 #include <linux/memory.h>
7 #include <linux/memory-tiers.h>
8 
9 #include "internal.h"
10 
11 struct memory_tier {
12 	/* hierarchy of memory tiers */
13 	struct list_head list;
14 	/* list of all memory types part of this tier */
15 	struct list_head memory_types;
16 	/*
17 	 * start value of abstract distance. memory tier maps
18 	 * an abstract distance  range,
19 	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
20 	 */
21 	int adistance_start;
22 	struct device dev;
23 	/* All the nodes that are part of all the lower memory tiers. */
24 	nodemask_t lower_tier_mask;
25 };
26 
27 struct demotion_nodes {
28 	nodemask_t preferred;
29 };
30 
/* Memory type currently recorded for one NUMA node. */
struct node_memory_type_map {
	/* Type recorded for the node; NULL while none is registered. */
	struct memory_dev_type *memtype;
	/*
	 * Number of registrations of @memtype on this node, bumped by
	 * __init_node_memory_type() and dropped by clear_node_memory_type().
	 */
	int map_count;
};
35 
36 static DEFINE_MUTEX(memory_tier_lock);
37 static LIST_HEAD(memory_tiers);
38 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
39 static struct memory_dev_type *default_dram_type;
40 
41 static struct bus_type memory_tier_subsys = {
42 	.name = "memory_tiering",
43 	.dev_name = "memory_tier",
44 };
45 
46 #ifdef CONFIG_MIGRATION
/*
 * Largest abstract distance still treated as top tier by
 * node_is_toptier(); refreshed by establish_demotion_targets().
 */
static int top_tier_adistance;
48 /*
49  * node_demotion[] examples:
50  *
51  * Example 1:
52  *
53  * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
54  *
55  * node distances:
56  * node   0    1    2    3
57  *    0  10   20   30   40
58  *    1  20   10   40   30
59  *    2  30   40   10   40
60  *    3  40   30   40   10
61  *
62  * memory_tiers0 = 0-1
63  * memory_tiers1 = 2-3
64  *
65  * node_demotion[0].preferred = 2
66  * node_demotion[1].preferred = 3
67  * node_demotion[2].preferred = <empty>
68  * node_demotion[3].preferred = <empty>
69  *
70  * Example 2:
71  *
72  * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
73  *
74  * node distances:
75  * node   0    1    2
76  *    0  10   20   30
77  *    1  20   10   30
78  *    2  30   30   10
79  *
80  * memory_tiers0 = 0-2
81  *
82  * node_demotion[0].preferred = <empty>
83  * node_demotion[1].preferred = <empty>
84  * node_demotion[2].preferred = <empty>
85  *
86  * Example 3:
87  *
88  * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
89  *
90  * node distances:
91  * node   0    1    2
92  *    0  10   20   30
93  *    1  20   10   40
94  *    2  30   40   10
95  *
96  * memory_tiers0 = 1
97  * memory_tiers1 = 0
98  * memory_tiers2 = 2
99  *
100  * node_demotion[0].preferred = 2
101  * node_demotion[1].preferred = 0
102  * node_demotion[2].preferred = <empty>
103  *
104  */
105 static struct demotion_nodes *node_demotion __read_mostly;
106 #endif /* CONFIG_MIGRATION */
107 
to_memory_tier(struct device * device)108 static inline struct memory_tier *to_memory_tier(struct device *device)
109 {
110 	return container_of(device, struct memory_tier, dev);
111 }
112 
get_memtier_nodemask(struct memory_tier * memtier)113 static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
114 {
115 	nodemask_t nodes = NODE_MASK_NONE;
116 	struct memory_dev_type *memtype;
117 
118 	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
119 		nodes_or(nodes, nodes, memtype->nodes);
120 
121 	return nodes;
122 }
123 
/*
 * Device release callback: free the memory tier that embeds @dev.
 *
 * A plain kfree() is sufficient because the synchronize_rcu() in
 * clear_node_memory_tier() guarantees no RCU reader can still be
 * looking at this tier.
 */
static void memory_tier_device_release(struct device *dev)
{
	kfree(to_memory_tier(dev));
}
133 
nodelist_show(struct device * dev,struct device_attribute * attr,char * buf)134 static ssize_t nodelist_show(struct device *dev,
135 			     struct device_attribute *attr, char *buf)
136 {
137 	int ret;
138 	nodemask_t nmask;
139 
140 	mutex_lock(&memory_tier_lock);
141 	nmask = get_memtier_nodemask(to_memory_tier(dev));
142 	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
143 	mutex_unlock(&memory_tier_lock);
144 	return ret;
145 }
146 static DEVICE_ATTR_RO(nodelist);
147 
148 static struct attribute *memtier_dev_attrs[] = {
149 	&dev_attr_nodelist.attr,
150 	NULL
151 };
152 
153 static const struct attribute_group memtier_dev_group = {
154 	.attrs = memtier_dev_attrs,
155 };
156 
157 static const struct attribute_group *memtier_dev_groups[] = {
158 	&memtier_dev_group,
159 	NULL
160 };
161 
find_create_memory_tier(struct memory_dev_type * memtype)162 static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
163 {
164 	int ret;
165 	bool found_slot = false;
166 	struct memory_tier *memtier, *new_memtier;
167 	int adistance = memtype->adistance;
168 	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
169 
170 	lockdep_assert_held_once(&memory_tier_lock);
171 
172 	adistance = round_down(adistance, memtier_adistance_chunk_size);
173 	/*
174 	 * If the memtype is already part of a memory tier,
175 	 * just return that.
176 	 */
177 	if (!list_empty(&memtype->tier_sibiling)) {
178 		list_for_each_entry(memtier, &memory_tiers, list) {
179 			if (adistance == memtier->adistance_start)
180 				return memtier;
181 		}
182 		WARN_ON(1);
183 		return ERR_PTR(-EINVAL);
184 	}
185 
186 	list_for_each_entry(memtier, &memory_tiers, list) {
187 		if (adistance == memtier->adistance_start) {
188 			goto link_memtype;
189 		} else if (adistance < memtier->adistance_start) {
190 			found_slot = true;
191 			break;
192 		}
193 	}
194 
195 	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
196 	if (!new_memtier)
197 		return ERR_PTR(-ENOMEM);
198 
199 	new_memtier->adistance_start = adistance;
200 	INIT_LIST_HEAD(&new_memtier->list);
201 	INIT_LIST_HEAD(&new_memtier->memory_types);
202 	if (found_slot)
203 		list_add_tail(&new_memtier->list, &memtier->list);
204 	else
205 		list_add_tail(&new_memtier->list, &memory_tiers);
206 
207 	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
208 	new_memtier->dev.bus = &memory_tier_subsys;
209 	new_memtier->dev.release = memory_tier_device_release;
210 	new_memtier->dev.groups = memtier_dev_groups;
211 
212 	ret = device_register(&new_memtier->dev);
213 	if (ret) {
214 		list_del(&new_memtier->list);
215 		put_device(&new_memtier->dev);
216 		return ERR_PTR(ret);
217 	}
218 	memtier = new_memtier;
219 
220 link_memtype:
221 	list_add(&memtype->tier_sibiling, &memtier->memory_types);
222 	return memtier;
223 }
224 
__node_get_memory_tier(int node)225 static struct memory_tier *__node_get_memory_tier(int node)
226 {
227 	pg_data_t *pgdat;
228 
229 	pgdat = NODE_DATA(node);
230 	if (!pgdat)
231 		return NULL;
232 	/*
233 	 * Since we hold memory_tier_lock, we can avoid
234 	 * RCU read locks when accessing the details. No
235 	 * parallel updates are possible here.
236 	 */
237 	return rcu_dereference_check(pgdat->memtier,
238 				     lockdep_is_held(&memory_tier_lock));
239 }
240 
241 #ifdef CONFIG_MIGRATION
node_is_toptier(int node)242 bool node_is_toptier(int node)
243 {
244 	bool toptier;
245 	pg_data_t *pgdat;
246 	struct memory_tier *memtier;
247 
248 	pgdat = NODE_DATA(node);
249 	if (!pgdat)
250 		return false;
251 
252 	rcu_read_lock();
253 	memtier = rcu_dereference(pgdat->memtier);
254 	if (!memtier) {
255 		toptier = true;
256 		goto out;
257 	}
258 	if (memtier->adistance_start <= top_tier_adistance)
259 		toptier = true;
260 	else
261 		toptier = false;
262 out:
263 	rcu_read_unlock();
264 	return toptier;
265 }
266 
/*
 * Store in @targets the nodes of all tiers below the tier of @pgdat,
 * or an empty mask when @pgdat has no memory tier.
 */
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * Updates of pg_data_t.memtier include a synchronize_rcu(), so
	 * under rcu_read_lock() we see either NULL or a valid memtier.
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier)
		*targets = NODE_MASK_NONE;
	else
		*targets = memtier->lower_tier_mask;
	rcu_read_unlock();
}
284 
285 /**
286  * next_demotion_node() - Get the next node in the demotion path
287  * @node: The starting node to lookup the next node
288  *
289  * Return: node id for next memory node in the demotion path hierarchy
290  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
291  * @node online or guarantee that it *continues* to be the next demotion
292  * target.
293  */
next_demotion_node(int node)294 int next_demotion_node(int node)
295 {
296 	struct demotion_nodes *nd;
297 	int target;
298 
299 	if (!node_demotion)
300 		return NUMA_NO_NODE;
301 
302 	nd = &node_demotion[node];
303 
304 	/*
305 	 * node_demotion[] is updated without excluding this
306 	 * function from running.
307 	 *
308 	 * Make sure to use RCU over entire code blocks if
309 	 * node_demotion[] reads need to be consistent.
310 	 */
311 	rcu_read_lock();
312 	/*
313 	 * If there are multiple target nodes, just select one
314 	 * target node randomly.
315 	 *
316 	 * In addition, we can also use round-robin to select
317 	 * target node, but we should introduce another variable
318 	 * for node_demotion[] to record last selected target node,
319 	 * that may cause cache ping-pong due to the changing of
320 	 * last target node. Or introducing per-cpu data to avoid
321 	 * caching issue, which seems more complicated. So selecting
322 	 * target node randomly seems better until now.
323 	 */
324 	target = node_random(&nd->preferred);
325 	rcu_read_unlock();
326 
327 	return target;
328 }
329 
disable_all_demotion_targets(void)330 static void disable_all_demotion_targets(void)
331 {
332 	struct memory_tier *memtier;
333 	int node;
334 
335 	for_each_node_state(node, N_MEMORY) {
336 		node_demotion[node].preferred = NODE_MASK_NONE;
337 		/*
338 		 * We are holding memory_tier_lock, it is safe
339 		 * to access pgda->memtier.
340 		 */
341 		memtier = __node_get_memory_tier(node);
342 		if (memtier)
343 			memtier->lower_tier_mask = NODE_MASK_NONE;
344 	}
345 	/*
346 	 * Ensure that the "disable" is visible across the system.
347 	 * Readers will see either a combination of before+disable
348 	 * state or disable+after.  They will never see before and
349 	 * after state together.
350 	 */
351 	synchronize_rcu();
352 }
353 
354 /*
355  * Find an automatic demotion target for all memory
356  * nodes. Failing here is OK.  It might just indicate
357  * being at the end of a chain.
358  */
establish_demotion_targets(void)359 static void establish_demotion_targets(void)
360 {
361 	struct memory_tier *memtier;
362 	struct demotion_nodes *nd;
363 	int target = NUMA_NO_NODE, node;
364 	int distance, best_distance;
365 	nodemask_t tier_nodes, lower_tier;
366 
367 	lockdep_assert_held_once(&memory_tier_lock);
368 
369 	if (!node_demotion)
370 		return;
371 
372 	disable_all_demotion_targets();
373 
374 	for_each_node_state(node, N_MEMORY) {
375 		best_distance = -1;
376 		nd = &node_demotion[node];
377 
378 		memtier = __node_get_memory_tier(node);
379 		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
380 			continue;
381 		/*
382 		 * Get the lower memtier to find the  demotion node list.
383 		 */
384 		memtier = list_next_entry(memtier, list);
385 		tier_nodes = get_memtier_nodemask(memtier);
386 		/*
387 		 * find_next_best_node, use 'used' nodemask as a skip list.
388 		 * Add all memory nodes except the selected memory tier
389 		 * nodelist to skip list so that we find the best node from the
390 		 * memtier nodelist.
391 		 */
392 		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
393 
394 		/*
395 		 * Find all the nodes in the memory tier node list of same best distance.
396 		 * add them to the preferred mask. We randomly select between nodes
397 		 * in the preferred mask when allocating pages during demotion.
398 		 */
399 		do {
400 			target = find_next_best_node(node, &tier_nodes);
401 			if (target == NUMA_NO_NODE)
402 				break;
403 
404 			distance = node_distance(node, target);
405 			if (distance == best_distance || best_distance == -1) {
406 				best_distance = distance;
407 				node_set(target, nd->preferred);
408 			} else {
409 				break;
410 			}
411 		} while (1);
412 	}
413 	/*
414 	 * Promotion is allowed from a memory tier to higher
415 	 * memory tier only if the memory tier doesn't include
416 	 * compute. We want to skip promotion from a memory tier,
417 	 * if any node that is part of the memory tier have CPUs.
418 	 * Once we detect such a memory tier, we consider that tier
419 	 * as top tiper from which promotion is not allowed.
420 	 */
421 	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
422 		tier_nodes = get_memtier_nodemask(memtier);
423 		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
424 		if (!nodes_empty(tier_nodes)) {
425 			/*
426 			 * abstract distance below the max value of this memtier
427 			 * is considered toptier.
428 			 */
429 			top_tier_adistance = memtier->adistance_start +
430 						MEMTIER_CHUNK_SIZE - 1;
431 			break;
432 		}
433 	}
434 	/*
435 	 * Now build the lower_tier mask for each node collecting node mask from
436 	 * all memory tier below it. This allows us to fallback demotion page
437 	 * allocation to a set of nodes that is closer the above selected
438 	 * perferred node.
439 	 */
440 	lower_tier = node_states[N_MEMORY];
441 	list_for_each_entry(memtier, &memory_tiers, list) {
442 		/*
443 		 * Keep removing current tier from lower_tier nodes,
444 		 * This will remove all nodes in current and above
445 		 * memory tier from the lower_tier mask.
446 		 */
447 		tier_nodes = get_memtier_nodemask(memtier);
448 		nodes_andnot(lower_tier, lower_tier, tier_nodes);
449 		memtier->lower_tier_mask = lower_tier;
450 	}
451 }
452 
453 #else
/* Without CONFIG_MIGRATION there are no demotion targets to compute. */
static inline void establish_demotion_targets(void)
{
}
455 #endif /* CONFIG_MIGRATION */
456 
__init_node_memory_type(int node,struct memory_dev_type * memtype)457 static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
458 {
459 	if (!node_memory_types[node].memtype)
460 		node_memory_types[node].memtype = memtype;
461 	/*
462 	 * for each device getting added in the same NUMA node
463 	 * with this specific memtype, bump the map count. We
464 	 * Only take memtype device reference once, so that
465 	 * changing a node memtype can be done by droping the
466 	 * only reference count taken here.
467 	 */
468 
469 	if (node_memory_types[node].memtype == memtype) {
470 		if (!node_memory_types[node].map_count++)
471 			kref_get(&memtype->kref);
472 	}
473 }
474 
set_node_memory_tier(int node)475 static struct memory_tier *set_node_memory_tier(int node)
476 {
477 	struct memory_tier *memtier;
478 	struct memory_dev_type *memtype;
479 	pg_data_t *pgdat = NODE_DATA(node);
480 
481 
482 	lockdep_assert_held_once(&memory_tier_lock);
483 
484 	if (!node_state(node, N_MEMORY))
485 		return ERR_PTR(-EINVAL);
486 
487 	__init_node_memory_type(node, default_dram_type);
488 
489 	memtype = node_memory_types[node].memtype;
490 	node_set(node, memtype->nodes);
491 	memtier = find_create_memory_tier(memtype);
492 	if (!IS_ERR(memtier))
493 		rcu_assign_pointer(pgdat->memtier, memtier);
494 	return memtier;
495 }
496 
destroy_memory_tier(struct memory_tier * memtier)497 static void destroy_memory_tier(struct memory_tier *memtier)
498 {
499 	list_del(&memtier->list);
500 	device_unregister(&memtier->dev);
501 }
502 
clear_node_memory_tier(int node)503 static bool clear_node_memory_tier(int node)
504 {
505 	bool cleared = false;
506 	pg_data_t *pgdat;
507 	struct memory_tier *memtier;
508 
509 	pgdat = NODE_DATA(node);
510 	if (!pgdat)
511 		return false;
512 
513 	/*
514 	 * Make sure that anybody looking at NODE_DATA who finds
515 	 * a valid memtier finds memory_dev_types with nodes still
516 	 * linked to the memtier. We achieve this by waiting for
517 	 * rcu read section to finish using synchronize_rcu.
518 	 * This also enables us to free the destroyed memory tier
519 	 * with kfree instead of kfree_rcu
520 	 */
521 	memtier = __node_get_memory_tier(node);
522 	if (memtier) {
523 		struct memory_dev_type *memtype;
524 
525 		rcu_assign_pointer(pgdat->memtier, NULL);
526 		synchronize_rcu();
527 		memtype = node_memory_types[node].memtype;
528 		node_clear(node, memtype->nodes);
529 		if (nodes_empty(memtype->nodes)) {
530 			list_del_init(&memtype->tier_sibiling);
531 			if (list_empty(&memtier->memory_types))
532 				destroy_memory_tier(memtier);
533 		}
534 		cleared = true;
535 	}
536 	return cleared;
537 }
538 
release_memtype(struct kref * kref)539 static void release_memtype(struct kref *kref)
540 {
541 	struct memory_dev_type *memtype;
542 
543 	memtype = container_of(kref, struct memory_dev_type, kref);
544 	kfree(memtype);
545 }
546 
alloc_memory_type(int adistance)547 struct memory_dev_type *alloc_memory_type(int adistance)
548 {
549 	struct memory_dev_type *memtype;
550 
551 	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
552 	if (!memtype)
553 		return ERR_PTR(-ENOMEM);
554 
555 	memtype->adistance = adistance;
556 	INIT_LIST_HEAD(&memtype->tier_sibiling);
557 	memtype->nodes  = NODE_MASK_NONE;
558 	kref_init(&memtype->kref);
559 	return memtype;
560 }
561 EXPORT_SYMBOL_GPL(alloc_memory_type);
562 
put_memory_type(struct memory_dev_type * memtype)563 void put_memory_type(struct memory_dev_type *memtype)
564 {
565 	kref_put(&memtype->kref, release_memtype);
566 }
567 EXPORT_SYMBOL_GPL(put_memory_type);
568 
init_node_memory_type(int node,struct memory_dev_type * memtype)569 void init_node_memory_type(int node, struct memory_dev_type *memtype)
570 {
571 
572 	mutex_lock(&memory_tier_lock);
573 	__init_node_memory_type(node, memtype);
574 	mutex_unlock(&memory_tier_lock);
575 }
576 EXPORT_SYMBOL_GPL(init_node_memory_type);
577 
clear_node_memory_type(int node,struct memory_dev_type * memtype)578 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
579 {
580 	mutex_lock(&memory_tier_lock);
581 	if (node_memory_types[node].memtype == memtype)
582 		node_memory_types[node].map_count--;
583 	/*
584 	 * If we umapped all the attached devices to this node,
585 	 * clear the node memory type.
586 	 */
587 	if (!node_memory_types[node].map_count) {
588 		node_memory_types[node].memtype = NULL;
589 		put_memory_type(memtype);
590 	}
591 	mutex_unlock(&memory_tier_lock);
592 }
593 EXPORT_SYMBOL_GPL(clear_node_memory_type);
594 
memtier_hotplug_callback(struct notifier_block * self,unsigned long action,void * _arg)595 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
596 					      unsigned long action, void *_arg)
597 {
598 	struct memory_tier *memtier;
599 	struct memory_notify *arg = _arg;
600 
601 	/*
602 	 * Only update the node migration order when a node is
603 	 * changing status, like online->offline.
604 	 */
605 	if (arg->status_change_nid < 0)
606 		return notifier_from_errno(0);
607 
608 	switch (action) {
609 	case MEM_OFFLINE:
610 		mutex_lock(&memory_tier_lock);
611 		if (clear_node_memory_tier(arg->status_change_nid))
612 			establish_demotion_targets();
613 		mutex_unlock(&memory_tier_lock);
614 		break;
615 	case MEM_ONLINE:
616 		mutex_lock(&memory_tier_lock);
617 		memtier = set_node_memory_tier(arg->status_change_nid);
618 		if (!IS_ERR(memtier))
619 			establish_demotion_targets();
620 		mutex_unlock(&memory_tier_lock);
621 		break;
622 	}
623 
624 	return notifier_from_errno(0);
625 }
626 
memory_tier_init(void)627 static int __init memory_tier_init(void)
628 {
629 	int ret, node;
630 	struct memory_tier *memtier;
631 
632 	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
633 	if (ret)
634 		panic("%s() failed to register memory tier subsystem\n", __func__);
635 
636 #ifdef CONFIG_MIGRATION
637 	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
638 				GFP_KERNEL);
639 	WARN_ON(!node_demotion);
640 #endif
641 	mutex_lock(&memory_tier_lock);
642 	/*
643 	 * For now we can have 4 faster memory tiers with smaller adistance
644 	 * than default DRAM tier.
645 	 */
646 	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
647 	if (IS_ERR(default_dram_type))
648 		panic("%s() failed to allocate default DRAM tier\n", __func__);
649 
650 	/*
651 	 * Look at all the existing N_MEMORY nodes and add them to
652 	 * default memory tier or to a tier if we already have memory
653 	 * types assigned.
654 	 */
655 	for_each_node_state(node, N_MEMORY) {
656 		memtier = set_node_memory_tier(node);
657 		if (IS_ERR(memtier))
658 			/*
659 			 * Continue with memtiers we are able to setup
660 			 */
661 			break;
662 	}
663 	establish_demotion_targets();
664 	mutex_unlock(&memory_tier_lock);
665 
666 	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
667 	return 0;
668 }
669 subsys_initcall(memory_tier_init);
670 
671 bool numa_demotion_enabled = false;
672 
673 #ifdef CONFIG_MIGRATION
674 #ifdef CONFIG_SYSFS
demotion_enabled_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)675 static ssize_t demotion_enabled_show(struct kobject *kobj,
676 				     struct kobj_attribute *attr, char *buf)
677 {
678 	return sysfs_emit(buf, "%s\n",
679 			  numa_demotion_enabled ? "true" : "false");
680 }
681 
demotion_enabled_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)682 static ssize_t demotion_enabled_store(struct kobject *kobj,
683 				      struct kobj_attribute *attr,
684 				      const char *buf, size_t count)
685 {
686 	ssize_t ret;
687 
688 	ret = kstrtobool(buf, &numa_demotion_enabled);
689 	if (ret)
690 		return ret;
691 
692 	return count;
693 }
694 
695 static struct kobj_attribute numa_demotion_enabled_attr =
696 	__ATTR_RW(demotion_enabled);
697 
698 static struct attribute *numa_attrs[] = {
699 	&numa_demotion_enabled_attr.attr,
700 	NULL,
701 };
702 
703 static const struct attribute_group numa_attr_group = {
704 	.attrs = numa_attrs,
705 };
706 
numa_init_sysfs(void)707 static int __init numa_init_sysfs(void)
708 {
709 	int err;
710 	struct kobject *numa_kobj;
711 
712 	numa_kobj = kobject_create_and_add("numa", mm_kobj);
713 	if (!numa_kobj) {
714 		pr_err("failed to create numa kobject\n");
715 		return -ENOMEM;
716 	}
717 	err = sysfs_create_group(numa_kobj, &numa_attr_group);
718 	if (err) {
719 		pr_err("failed to register numa group\n");
720 		goto delete_obj;
721 	}
722 	return 0;
723 
724 delete_obj:
725 	kobject_put(numa_kobj);
726 	return err;
727 }
728 subsys_initcall(numa_init_sysfs);
729 #endif /* CONFIG_SYSFS */
730 #endif
731