1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <fcntl.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <sys/stat.h>
10 #include <sys/types.h>
11 #include <unistd.h>
12 #include <sys/wait.h>
13 #include <errno.h>
14 #include <sys/sysinfo.h>
15 #include <pthread.h>
16 
17 #include "../kselftest.h"
18 #include "cgroup_util.h"
19 
20 
/*
 * Memory cgroup charging is performed using percpu batches 64 pages
 * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So
 * the maximum discrepancy between charge and vmstat entries is number
 * of cpus multiplied by 64 pages.
 *
 * The page size is queried at run time instead of being hard-coded to
 * 4096, so the bound also holds on systems with larger base pages.
 * The expansion has type long (the return type of sysconf()).
 */
#define MAX_VMSTAT_ERROR (sysconf(_SC_PAGESIZE) * 64 * get_nprocs())
28 
29 
/*
 * Call stat() on (unsigned long)arg distinct paths that are not expected
 * to exist. Each path embeds the loop counter, padded to 64 characters,
 * and the pid of the calling process. The tests in this file use these
 * lookups to create negative dentries.
 *
 * The result of stat() is deliberately ignored and the cgroup argument
 * is unused. Always returns 0.
 */
static int alloc_dcache(const char *cgroup, void *arg)
{
	const unsigned long nr_lookups = (unsigned long)arg;
	unsigned long n = 0;
	char path[128];
	struct stat sb;

	while (n < nr_lookups) {
		snprintf(path, sizeof(path),
			 "/something-non-existent-with-a-long-name-%64lu-%d",
			 n, getpid());
		stat(path, &sb);
		n++;
	}

	return 0;
}
45 
46 /*
47  * This test allocates 100000 of negative dentries with long names.
48  * Then it checks that "slab" in memory.stat is larger than 1M.
49  * Then it sets memory.high to 1M and checks that at least 1/2
50  * of slab memory has been reclaimed.
51  */
test_kmem_basic(const char * root)52 static int test_kmem_basic(const char *root)
53 {
54 	int ret = KSFT_FAIL;
55 	char *cg = NULL;
56 	long slab0, slab1, current;
57 
58 	cg = cg_name(root, "kmem_basic_test");
59 	if (!cg)
60 		goto cleanup;
61 
62 	if (cg_create(cg))
63 		goto cleanup;
64 
65 	if (cg_run(cg, alloc_dcache, (void *)100000))
66 		goto cleanup;
67 
68 	slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
69 	if (slab0 < (1 << 20))
70 		goto cleanup;
71 
72 	cg_write(cg, "memory.high", "1M");
73 	slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
74 	if (slab1 <= 0)
75 		goto cleanup;
76 
77 	current = cg_read_long(cg, "memory.current");
78 	if (current <= 0)
79 		goto cleanup;
80 
81 	if (slab1 < slab0 / 2 && current < slab0 / 2)
82 		ret = KSFT_PASS;
83 cleanup:
84 	cg_destroy(cg);
85 	free(cg);
86 
87 	return ret;
88 }
89 
/*
 * Thread entry point: runs alloc_dcache() with a count of 100.
 * The thread argument is ignored and the thread result is always NULL.
 */
static void *alloc_kmem_fn(void *arg)
{
	void *const nr_lookups = (void *)100;

	(void)alloc_dcache(NULL, nr_lookups);

	return NULL;
}
95 
alloc_kmem_smp(const char * cgroup,void * arg)96 static int alloc_kmem_smp(const char *cgroup, void *arg)
97 {
98 	int nr_threads = 2 * get_nprocs();
99 	pthread_t *tinfo;
100 	unsigned long i;
101 	int ret = -1;
102 
103 	tinfo = calloc(nr_threads, sizeof(pthread_t));
104 	if (tinfo == NULL)
105 		return -1;
106 
107 	for (i = 0; i < nr_threads; i++) {
108 		if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
109 				   (void *)i)) {
110 			free(tinfo);
111 			return -1;
112 		}
113 	}
114 
115 	for (i = 0; i < nr_threads; i++) {
116 		ret = pthread_join(tinfo[i], NULL);
117 		if (ret)
118 			break;
119 	}
120 
121 	free(tinfo);
122 	return ret;
123 }
124 
/*
 * Sequentially create `times` child cgroups of `parent` (named via
 * cg_name_indexed(parent, "child", i)), run fn(child, arg) in each of
 * them through cg_run(), and destroy the child again.
 *
 * @parent: path of the parent cgroup
 * @fn:     function to run inside every child cgroup
 * @arg:    opaque argument forwarded to @fn
 * @times:  number of child cgroups to go through
 *
 * Returns 0 if every iteration succeeded, -1 on the first failure.
 */
static int cg_run_in_subcgroups(const char *parent,
				int (*fn)(const char *cgroup, void *arg),
				void *arg, int times)
{
	char *child;
	int failed;
	int i;

	for (i = 0; i < times; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			return -1;

		/*
		 * fn must receive the caller's arg (e.g. the lookup count
		 * that alloc_dcache() expects); passing NULL would turn it
		 * into a no-op.
		 */
		failed = cg_create(child) || cg_run(child, fn, arg);

		/* The child is torn down whether or not the run succeeded. */
		cg_destroy(child);
		free(child);

		if (failed)
			return -1;
	}

	return 0;
}
155 
156 /*
157  * The test creates and destroys a large number of cgroups. In each cgroup it
158  * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
159  * threads. Then it checks the sanity of numbers on the parent level:
160  * the total size of the cgroups should be roughly equal to
161  * anon + file + slab + kernel_stack.
162  */
test_kmem_memcg_deletion(const char * root)163 static int test_kmem_memcg_deletion(const char *root)
164 {
165 	long current, slab, anon, file, kernel_stack, pagetables, percpu, sock, sum;
166 	int ret = KSFT_FAIL;
167 	char *parent;
168 
169 	parent = cg_name(root, "kmem_memcg_deletion_test");
170 	if (!parent)
171 		goto cleanup;
172 
173 	if (cg_create(parent))
174 		goto cleanup;
175 
176 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
177 		goto cleanup;
178 
179 	if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
180 		goto cleanup;
181 
182 	current = cg_read_long(parent, "memory.current");
183 	slab = cg_read_key_long(parent, "memory.stat", "slab ");
184 	anon = cg_read_key_long(parent, "memory.stat", "anon ");
185 	file = cg_read_key_long(parent, "memory.stat", "file ");
186 	kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack ");
187 	pagetables = cg_read_key_long(parent, "memory.stat", "pagetables ");
188 	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
189 	sock = cg_read_key_long(parent, "memory.stat", "sock ");
190 	if (current < 0 || slab < 0 || anon < 0 || file < 0 ||
191 	    kernel_stack < 0 || pagetables < 0 || percpu < 0 || sock < 0)
192 		goto cleanup;
193 
194 	sum = slab + anon + file + kernel_stack + pagetables + percpu + sock;
195 	if (abs(sum - current) < MAX_VMSTAT_ERROR) {
196 		ret = KSFT_PASS;
197 	} else {
198 		printf("memory.current = %ld\n", current);
199 		printf("slab + anon + file + kernel_stack = %ld\n", sum);
200 		printf("slab = %ld\n", slab);
201 		printf("anon = %ld\n", anon);
202 		printf("file = %ld\n", file);
203 		printf("kernel_stack = %ld\n", kernel_stack);
204 		printf("pagetables = %ld\n", pagetables);
205 		printf("percpu = %ld\n", percpu);
206 		printf("sock = %ld\n", sock);
207 	}
208 
209 cleanup:
210 	cg_destroy(parent);
211 	free(parent);
212 
213 	return ret;
214 }
215 
216 /*
217  * The test reads the entire /proc/kpagecgroup. If the operation went
218  * successfully (and the kernel didn't panic), the test is treated as passed.
219  */
test_kmem_proc_kpagecgroup(const char * root)220 static int test_kmem_proc_kpagecgroup(const char *root)
221 {
222 	unsigned long buf[128];
223 	int ret = KSFT_FAIL;
224 	ssize_t len;
225 	int fd;
226 
227 	fd = open("/proc/kpagecgroup", O_RDONLY);
228 	if (fd < 0)
229 		return ret;
230 
231 	do {
232 		len = read(fd, buf, sizeof(buf));
233 	} while (len > 0);
234 
235 	if (len == 0)
236 		ret = KSFT_PASS;
237 
238 	close(fd);
239 	return ret;
240 }
241 
/*
 * Thread entry point that does nothing except sleep for 100 seconds.
 * The thread argument is ignored and the thread result is always NULL.
 */
static void *pthread_wait_fn(void *arg)
{
	void *const result = NULL;

	sleep(100);

	return result;
}
247 
spawn_1000_threads(const char * cgroup,void * arg)248 static int spawn_1000_threads(const char *cgroup, void *arg)
249 {
250 	int nr_threads = 1000;
251 	pthread_t *tinfo;
252 	unsigned long i;
253 	long stack;
254 	int ret = -1;
255 
256 	tinfo = calloc(nr_threads, sizeof(pthread_t));
257 	if (tinfo == NULL)
258 		return -1;
259 
260 	for (i = 0; i < nr_threads; i++) {
261 		if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
262 				   (void *)i)) {
263 			free(tinfo);
264 			return(-1);
265 		}
266 	}
267 
268 	stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
269 	if (stack >= 4096 * 1000)
270 		ret = 0;
271 
272 	free(tinfo);
273 	return ret;
274 }
275 
276 /*
277  * The test spawns a process, which spawns 1000 threads. Then it checks
278  * that memory.stat's kernel_stack is at least 1000 pages large.
279  */
test_kmem_kernel_stacks(const char * root)280 static int test_kmem_kernel_stacks(const char *root)
281 {
282 	int ret = KSFT_FAIL;
283 	char *cg = NULL;
284 
285 	cg = cg_name(root, "kmem_kernel_stacks_test");
286 	if (!cg)
287 		goto cleanup;
288 
289 	if (cg_create(cg))
290 		goto cleanup;
291 
292 	if (cg_run(cg, spawn_1000_threads, NULL))
293 		goto cleanup;
294 
295 	ret = KSFT_PASS;
296 cleanup:
297 	cg_destroy(cg);
298 	free(cg);
299 
300 	return ret;
301 }
302 
303 /*
304  * This test sequentionally creates 30 child cgroups, allocates some
305  * kernel memory in each of them, and deletes them. Then it checks
306  * that the number of dying cgroups on the parent level is 0.
307  */
test_kmem_dead_cgroups(const char * root)308 static int test_kmem_dead_cgroups(const char *root)
309 {
310 	int ret = KSFT_FAIL;
311 	char *parent;
312 	long dead;
313 	int i;
314 
315 	parent = cg_name(root, "kmem_dead_cgroups_test");
316 	if (!parent)
317 		goto cleanup;
318 
319 	if (cg_create(parent))
320 		goto cleanup;
321 
322 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
323 		goto cleanup;
324 
325 	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
326 		goto cleanup;
327 
328 	for (i = 0; i < 5; i++) {
329 		dead = cg_read_key_long(parent, "cgroup.stat",
330 					"nr_dying_descendants ");
331 		if (dead == 0) {
332 			ret = KSFT_PASS;
333 			break;
334 		}
335 		/*
336 		 * Reclaiming cgroups might take some time,
337 		 * let's wait a bit and repeat.
338 		 */
339 		sleep(1);
340 	}
341 
342 cleanup:
343 	cg_destroy(parent);
344 	free(parent);
345 
346 	return ret;
347 }
348 
349 /*
350  * This test creates a sub-tree with 1000 memory cgroups.
351  * Then it checks that the memory.current on the parent level
352  * is greater than 0 and approximates matches the percpu value
353  * from memory.stat.
354  */
test_percpu_basic(const char * root)355 static int test_percpu_basic(const char *root)
356 {
357 	int ret = KSFT_FAIL;
358 	char *parent, *child;
359 	long current, percpu;
360 	int i;
361 
362 	parent = cg_name(root, "percpu_basic_test");
363 	if (!parent)
364 		goto cleanup;
365 
366 	if (cg_create(parent))
367 		goto cleanup;
368 
369 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
370 		goto cleanup;
371 
372 	for (i = 0; i < 1000; i++) {
373 		child = cg_name_indexed(parent, "child", i);
374 		if (!child)
375 			return -1;
376 
377 		if (cg_create(child))
378 			goto cleanup_children;
379 
380 		free(child);
381 	}
382 
383 	current = cg_read_long(parent, "memory.current");
384 	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
385 
386 	if (current > 0 && percpu > 0 && abs(current - percpu) <
387 	    MAX_VMSTAT_ERROR)
388 		ret = KSFT_PASS;
389 	else
390 		printf("memory.current %ld\npercpu %ld\n",
391 		       current, percpu);
392 
393 cleanup_children:
394 	for (i = 0; i < 1000; i++) {
395 		child = cg_name_indexed(parent, "child", i);
396 		cg_destroy(child);
397 		free(child);
398 	}
399 
400 cleanup:
401 	cg_destroy(parent);
402 	free(parent);
403 
404 	return ret;
405 }
406 
407 #define T(x) { x, #x }
408 struct kmem_test {
409 	int (*fn)(const char *root);
410 	const char *name;
411 } tests[] = {
412 	T(test_kmem_basic),
413 	T(test_kmem_memcg_deletion),
414 	T(test_kmem_proc_kpagecgroup),
415 	T(test_kmem_kernel_stacks),
416 	T(test_kmem_dead_cgroups),
417 	T(test_percpu_basic),
418 };
419 #undef T
420 
main(int argc,char ** argv)421 int main(int argc, char **argv)
422 {
423 	char root[PATH_MAX];
424 	int i, ret = EXIT_SUCCESS;
425 
426 	if (cg_find_unified_root(root, sizeof(root)))
427 		ksft_exit_skip("cgroup v2 isn't mounted\n");
428 
429 	/*
430 	 * Check that memory controller is available:
431 	 * memory is listed in cgroup.controllers
432 	 */
433 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
434 		ksft_exit_skip("memory controller isn't available\n");
435 
436 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
437 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
438 			ksft_exit_skip("Failed to set memory controller\n");
439 
440 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
441 		switch (tests[i].fn(root)) {
442 		case KSFT_PASS:
443 			ksft_test_result_pass("%s\n", tests[i].name);
444 			break;
445 		case KSFT_SKIP:
446 			ksft_test_result_skip("%s\n", tests[i].name);
447 			break;
448 		default:
449 			ret = EXIT_FAILURE;
450 			ksft_test_result_fail("%s\n", tests[i].name);
451 			break;
452 		}
453 	}
454 
455 	return ret;
456 }
457