1 #define _GNU_SOURCE
2 #include <ctype.h>
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <limits.h>
6 #include <dirent.h>
7 #include <signal.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdbool.h>
11 #include <string.h>
12 #include <unistd.h>
13 
14 #include <sys/mman.h>
15 #include <sys/wait.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <sys/sysmacros.h>
19 #include <sys/vfs.h>
20 
21 #include "linux/magic.h"
22 
23 #include "vm_util.h"
24 
/*
 * Fallback values for madvise() advice codes, used only when the libc
 * headers included above do not already define them.
 */
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21
#endif
#ifndef MADV_POPULATE_READ
#define MADV_POPULATE_READ 22
#endif
#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25
#endif
34 
/* Fixed virtual address (1 GiB) at which the test mappings are requested. */
#define BASE_ADDR ((void *)(1UL << 30))
/*
 * Size parameters used throughout the tests.  hpage_pmd_size and page_size
 * are used as byte counts, hpage_pmd_nr as a count of base pages; they are
 * assigned outside the code shown here.
 */
static unsigned long hpage_pmd_size;
static unsigned long page_size;
static int hpage_pmd_nr;

/* Prefix of every THP sysfs knob read or written by this test. */
#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
/* Per-process mapping report scanned by check_swap(). */
#define PID_SMAPS "/proc/self/smaps"
/* Name of the file created inside the user-supplied directory. */
#define TEST_FILE "collapse_test_file"

/* Buffer size used for smaps lines and the patterns matched against them. */
#define MAX_LINE_LENGTH 500
45 
/*
 * Kind of memory backing a test mapping.  get_finfo() records VMA_SHMEM
 * when the test directory is on tmpfs and VMA_FILE otherwise.
 */
enum vma_type {
	VMA_ANON,
	VMA_FILE,
	VMA_SHMEM,
};
51 
/*
 * Operations a memory type must provide so the collapse tests can be
 * written once and run against each type.
 */
struct mem_ops {
	/* Create and return a mapping covering nr_hpages huge pages. */
	void *(*setup_area)(int nr_hpages);
	/* Tear down a mapping of @size bytes returned by setup_area(). */
	void (*cleanup_area)(void *p, unsigned long size);
	/* Populate the byte range [start, end) of the mapping at @p. */
	void (*fault)(void *p, unsigned long start, unsigned long end);
	/*
	 * Check the number of huge pages backing @addr; callers pass 0 to
	 * assert that the area is not huge.
	 */
	bool (*check_huge)(void *addr, int nr_hpages);
	/* Short name of the memory type. */
	const char *name;
};

/*
 * Per-type ops pointers.  write_settings() and save_settings() treat a
 * NULL file_ops as "file tests not in use"; assignment happens outside
 * the code shown here.
 */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;
63 
/* Describes one way of triggering a collapse (khugepaged or madvise). */
struct collapse_context {
	/*
	 * Attempt to collapse nr_hpages huge pages at @p and report whether
	 * the outcome matches @expect (true: collapse should succeed).
	 */
	void (*collapse)(const char *msg, char *p, int nr_hpages,
			 struct mem_ops *ops, bool expect);
	/* True when this context honours the khugepaged max_ptes_* limits. */
	bool enforce_pte_scan_limits;
	/* Short name of the context. */
	const char *name;
};

/* Context pointers; assigned outside the code shown here. */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;
73 
74 struct file_info {
75 	const char *dir;
76 	char path[PATH_MAX];
77 	enum vma_type type;
78 	int fd;
79 	char dev_queue_read_ahead_path[PATH_MAX];
80 };
81 
82 static struct file_info finfo;
83 
/* Values of the THP "enabled" sysfs knob. */
enum thp_enabled {
	THP_ALWAYS,
	THP_MADVISE,
	THP_NEVER,
};

/*
 * Text form of each enum thp_enabled value, indexed by the enum and
 * NULL-terminated so read_string() can scan it.  Designated initializers
 * keep the table in step with the enum.
 */
static const char *thp_enabled_strings[] = {
	[THP_ALWAYS]	= "always",
	[THP_MADVISE]	= "madvise",
	[THP_NEVER]	= "never",
	NULL
};
96 
/* Values of the THP "defrag" sysfs knob. */
enum thp_defrag {
	THP_DEFRAG_ALWAYS,
	THP_DEFRAG_DEFER,
	THP_DEFRAG_DEFER_MADVISE,
	THP_DEFRAG_MADVISE,
	THP_DEFRAG_NEVER,
};

/*
 * Text form of each enum thp_defrag value, indexed by the enum and
 * NULL-terminated so read_string() can scan it.
 */
static const char *thp_defrag_strings[] = {
	[THP_DEFRAG_ALWAYS]		= "always",
	[THP_DEFRAG_DEFER]		= "defer",
	[THP_DEFRAG_DEFER_MADVISE]	= "defer+madvise",
	[THP_DEFRAG_MADVISE]		= "madvise",
	[THP_DEFRAG_NEVER]		= "never",
	NULL
};
113 
/* Values of the THP "shmem_enabled" sysfs knob. */
enum shmem_enabled {
	SHMEM_ALWAYS,
	SHMEM_WITHIN_SIZE,
	SHMEM_ADVISE,
	SHMEM_NEVER,
	SHMEM_DENY,
	SHMEM_FORCE,
};

/*
 * Text form of each enum shmem_enabled value, indexed by the enum and
 * NULL-terminated so read_string() can scan it.
 */
static const char *shmem_enabled_strings[] = {
	[SHMEM_ALWAYS]		= "always",
	[SHMEM_WITHIN_SIZE]	= "within_size",
	[SHMEM_ADVISE]		= "advise",
	[SHMEM_NEVER]		= "never",
	[SHMEM_DENY]		= "deny",
	[SHMEM_FORCE]		= "force",
	NULL
};
132 
/*
 * Snapshot of the knobs under THP_SYSFS "khugepaged/".  Each member maps
 * to the sysfs file of the same name (see write_settings()).
 */
struct khugepaged_settings {
	bool defrag;
	unsigned int alloc_sleep_millisecs;
	unsigned int scan_sleep_millisecs;
	unsigned int max_ptes_none;
	unsigned int max_ptes_swap;
	unsigned int max_ptes_shared;
	unsigned long pages_to_scan;
};
142 
143 struct settings {
144 	enum thp_enabled thp_enabled;
145 	enum thp_defrag thp_defrag;
146 	enum shmem_enabled shmem_enabled;
147 	bool use_zero_page;
148 	struct khugepaged_settings khugepaged;
149 	unsigned long read_ahead_kb;
150 };
151 
152 static struct settings saved_settings;
153 static bool skip_settings_restore;
154 
155 static int exit_status;
156 
/* Print @msg in green after a leading space and terminate the line. */
static void success(const char *msg)
{
	fprintf(stdout, " \e[32m%s\e[0m\n", msg);
}
161 
fail(const char * msg)162 static void fail(const char *msg)
163 {
164 	printf(" \e[31m%s\e[0m\n", msg);
165 	exit_status++;
166 }
167 
/* Print @msg in yellow after a leading space and terminate the line. */
static void skip(const char *msg)
{
	fprintf(stdout, " \e[33m%s\e[0m\n", msg);
}
172 
/*
 * Read the start of @path into @buf as a NUL-terminated string.
 *
 * At most @buflen - 1 bytes are read with a single read() call.
 *
 * Returns the number of bytes read (>= 1), or 0 if the file could not be
 * opened, nothing could be read, or @buf/@buflen leave no room for data.
 * On an open() or read() error errno is left as set by the failing call,
 * so callers may use perror().
 */
static int read_file(const char *path, char *buf, size_t buflen)
{
	int fd;
	int saved_errno;
	ssize_t numread;

	/* buflen - 1 below would wrap around for an empty buffer. */
	if (!buf || !buflen)
		return 0;

	fd = open(path, O_RDONLY);
	if (fd == -1)
		return 0;

	numread = read(fd, buf, buflen - 1);
	/* Keep read()'s errno: close() is allowed to overwrite it. */
	saved_errno = errno;
	close(fd);
	errno = saved_errno;

	if (numread < 1)
		return 0;

	buf[numread] = '\0';

	return (unsigned int) numread;
}
193 
/*
 * Write the string in @buf to the existing file @path.
 *
 * @buflen is the size of the string including its terminating NUL; the
 * NUL itself is not written.  The file is opened write-only and is not
 * created if missing.
 *
 * Any failure (empty buffer, open error, write error, or nothing written)
 * prints a diagnostic and terminates the process with EXIT_FAILURE, so
 * this function only returns on success.
 *
 * Returns the number of bytes written (>= 1).
 */
static int write_file(const char *path, const char *buf, size_t buflen)
{
	int fd;
	int saved_errno;
	ssize_t numwritten;

	/* buflen - 1 below would wrap around for an empty buffer. */
	if (!buflen) {
		printf("write(%s): empty buffer\n", path);
		exit(EXIT_FAILURE);
	}

	fd = open(path, O_WRONLY);
	if (fd == -1) {
		printf("open(%s): %s\n", path, strerror(errno));
		exit(EXIT_FAILURE);
	}

	numwritten = write(fd, buf, buflen - 1);
	/* Keep write()'s errno: close() is allowed to overwrite it. */
	saved_errno = errno;
	close(fd);
	if (numwritten < 1) {
		printf("write(%s, \"%.*s\"): %s\n", path, (int)(buflen - 1), buf,
		       numwritten < 0 ? strerror(saved_errno) : "nothing written");
		exit(EXIT_FAILURE);
	}

	return (unsigned int) numwritten;
}
216 
read_string(const char * name,const char * strings[])217 static int read_string(const char *name, const char *strings[])
218 {
219 	char path[PATH_MAX];
220 	char buf[256];
221 	char *c;
222 	int ret;
223 
224 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
225 	if (ret >= PATH_MAX) {
226 		printf("%s: Pathname is too long\n", __func__);
227 		exit(EXIT_FAILURE);
228 	}
229 
230 	if (!read_file(path, buf, sizeof(buf))) {
231 		perror(path);
232 		exit(EXIT_FAILURE);
233 	}
234 
235 	c = strchr(buf, '[');
236 	if (!c) {
237 		printf("%s: Parse failure\n", __func__);
238 		exit(EXIT_FAILURE);
239 	}
240 
241 	c++;
242 	memmove(buf, c, sizeof(buf) - (c - buf));
243 
244 	c = strchr(buf, ']');
245 	if (!c) {
246 		printf("%s: Parse failure\n", __func__);
247 		exit(EXIT_FAILURE);
248 	}
249 	*c = '\0';
250 
251 	ret = 0;
252 	while (strings[ret]) {
253 		if (!strcmp(strings[ret], buf))
254 			return ret;
255 		ret++;
256 	}
257 
258 	printf("Failed to parse %s\n", name);
259 	exit(EXIT_FAILURE);
260 }
261 
write_string(const char * name,const char * val)262 static void write_string(const char *name, const char *val)
263 {
264 	char path[PATH_MAX];
265 	int ret;
266 
267 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
268 	if (ret >= PATH_MAX) {
269 		printf("%s: Pathname is too long\n", __func__);
270 		exit(EXIT_FAILURE);
271 	}
272 
273 	if (!write_file(path, val, strlen(val) + 1)) {
274 		perror(path);
275 		exit(EXIT_FAILURE);
276 	}
277 }
278 
/*
 * Read the file at @path and return its leading decimal number.
 *
 * Exits with EXIT_FAILURE if nothing could be read.  read_file() reports
 * failure by returning 0 (never a negative value), so that is what is
 * tested here; otherwise strtoul() would parse an uninitialized buffer.
 */
static const unsigned long _read_num(const char *path)
{
	char buf[21];

	if (!read_file(path, buf, sizeof(buf))) {
		perror("read_file(read_num)");
		exit(EXIT_FAILURE);
	}

	return strtoul(buf, NULL, 10);
}
290 
read_num(const char * name)291 static const unsigned long read_num(const char *name)
292 {
293 	char path[PATH_MAX];
294 	int ret;
295 
296 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
297 	if (ret >= PATH_MAX) {
298 		printf("%s: Pathname is too long\n", __func__);
299 		exit(EXIT_FAILURE);
300 	}
301 	return _read_num(path);
302 }
303 
/*
 * Write @num in decimal to the file at @path.
 *
 * The value is formatted with %lu to match its unsigned type (values above
 * LONG_MAX previously came out negative), and the formatting is bounded by
 * the buffer size.  Exits with EXIT_FAILURE if the write fails.
 */
static void _write_num(const char *path, unsigned long num)
{
	/* 20 digits cover a 64-bit unsigned long, plus the terminating NUL. */
	char buf[21];

	snprintf(buf, sizeof(buf), "%lu", num);
	if (!write_file(path, buf, strlen(buf) + 1)) {
		perror(path);
		exit(EXIT_FAILURE);
	}
}
314 
write_num(const char * name,unsigned long num)315 static void write_num(const char *name, unsigned long num)
316 {
317 	char path[PATH_MAX];
318 	int ret;
319 
320 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
321 	if (ret >= PATH_MAX) {
322 		printf("%s: Pathname is too long\n", __func__);
323 		exit(EXIT_FAILURE);
324 	}
325 	_write_num(path, num);
326 }
327 
write_settings(struct settings * settings)328 static void write_settings(struct settings *settings)
329 {
330 	struct khugepaged_settings *khugepaged = &settings->khugepaged;
331 
332 	write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
333 	write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
334 	write_string("shmem_enabled",
335 			shmem_enabled_strings[settings->shmem_enabled]);
336 	write_num("use_zero_page", settings->use_zero_page);
337 
338 	write_num("khugepaged/defrag", khugepaged->defrag);
339 	write_num("khugepaged/alloc_sleep_millisecs",
340 			khugepaged->alloc_sleep_millisecs);
341 	write_num("khugepaged/scan_sleep_millisecs",
342 			khugepaged->scan_sleep_millisecs);
343 	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
344 	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
345 	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
346 	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
347 
348 	if (file_ops && finfo.type == VMA_FILE)
349 		_write_num(finfo.dev_queue_read_ahead_path,
350 			   settings->read_ahead_kb);
351 }
352 
353 #define MAX_SETTINGS_DEPTH 4
354 static struct settings settings_stack[MAX_SETTINGS_DEPTH];
355 static int settings_index;
356 
current_settings(void)357 static struct settings *current_settings(void)
358 {
359 	if (!settings_index) {
360 		printf("Fail: No settings set");
361 		exit(EXIT_FAILURE);
362 	}
363 	return settings_stack + settings_index - 1;
364 }
365 
push_settings(struct settings * settings)366 static void push_settings(struct settings *settings)
367 {
368 	if (settings_index >= MAX_SETTINGS_DEPTH) {
369 		printf("Fail: Settings stack exceeded");
370 		exit(EXIT_FAILURE);
371 	}
372 	settings_stack[settings_index++] = *settings;
373 	write_settings(current_settings());
374 }
375 
pop_settings(void)376 static void pop_settings(void)
377 {
378 	if (settings_index <= 0) {
379 		printf("Fail: Settings stack empty");
380 		exit(EXIT_FAILURE);
381 	}
382 	--settings_index;
383 	write_settings(current_settings());
384 }
385 
restore_settings(int sig)386 static void restore_settings(int sig)
387 {
388 	if (skip_settings_restore)
389 		goto out;
390 
391 	printf("Restore THP and khugepaged settings...");
392 	write_settings(&saved_settings);
393 	success("OK");
394 	if (sig)
395 		exit(EXIT_FAILURE);
396 out:
397 	exit(exit_status);
398 }
399 
save_settings(void)400 static void save_settings(void)
401 {
402 	printf("Save THP and khugepaged settings...");
403 	saved_settings = (struct settings) {
404 		.thp_enabled = read_string("enabled", thp_enabled_strings),
405 		.thp_defrag = read_string("defrag", thp_defrag_strings),
406 		.shmem_enabled =
407 			read_string("shmem_enabled", shmem_enabled_strings),
408 		.use_zero_page = read_num("use_zero_page"),
409 	};
410 	saved_settings.khugepaged = (struct khugepaged_settings) {
411 		.defrag = read_num("khugepaged/defrag"),
412 		.alloc_sleep_millisecs =
413 			read_num("khugepaged/alloc_sleep_millisecs"),
414 		.scan_sleep_millisecs =
415 			read_num("khugepaged/scan_sleep_millisecs"),
416 		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
417 		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
418 		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
419 		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
420 	};
421 	if (file_ops && finfo.type == VMA_FILE)
422 		saved_settings.read_ahead_kb =
423 				_read_num(finfo.dev_queue_read_ahead_path);
424 
425 	success("OK");
426 
427 	signal(SIGTERM, restore_settings);
428 	signal(SIGINT, restore_settings);
429 	signal(SIGHUP, restore_settings);
430 	signal(SIGQUIT, restore_settings);
431 }
432 
get_finfo(const char * dir)433 static void get_finfo(const char *dir)
434 {
435 	struct stat path_stat;
436 	struct statfs fs;
437 	char buf[1 << 10];
438 	char path[PATH_MAX];
439 	char *str, *end;
440 
441 	finfo.dir = dir;
442 	stat(finfo.dir, &path_stat);
443 	if (!S_ISDIR(path_stat.st_mode)) {
444 		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
445 		exit(EXIT_FAILURE);
446 	}
447 	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
448 		     finfo.dir) >= sizeof(finfo.path)) {
449 		printf("%s: Pathname is too long\n", __func__);
450 		exit(EXIT_FAILURE);
451 	}
452 	if (statfs(finfo.dir, &fs)) {
453 		perror("statfs()");
454 		exit(EXIT_FAILURE);
455 	}
456 	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
457 	if (finfo.type == VMA_SHMEM)
458 		return;
459 
460 	/* Find owning device's queue/read_ahead_kb control */
461 	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
462 		     major(path_stat.st_dev), minor(path_stat.st_dev))
463 	    >= sizeof(path)) {
464 		printf("%s: Pathname is too long\n", __func__);
465 		exit(EXIT_FAILURE);
466 	}
467 	if (read_file(path, buf, sizeof(buf)) < 0) {
468 		perror("read_file(read_num)");
469 		exit(EXIT_FAILURE);
470 	}
471 	if (strstr(buf, "DEVTYPE=disk")) {
472 		/* Found it */
473 		if (snprintf(finfo.dev_queue_read_ahead_path,
474 			     sizeof(finfo.dev_queue_read_ahead_path),
475 			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
476 			     major(path_stat.st_dev), minor(path_stat.st_dev))
477 		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
478 			printf("%s: Pathname is too long\n", __func__);
479 			exit(EXIT_FAILURE);
480 		}
481 		return;
482 	}
483 	if (!strstr(buf, "DEVTYPE=partition")) {
484 		printf("%s: Unknown device type: %s\n", __func__, path);
485 		exit(EXIT_FAILURE);
486 	}
487 	/*
488 	 * Partition of block device - need to find actual device.
489 	 * Using naming convention that devnameN is partition of
490 	 * device devname.
491 	 */
492 	str = strstr(buf, "DEVNAME=");
493 	if (!str) {
494 		printf("%s: Could not read: %s", __func__, path);
495 		exit(EXIT_FAILURE);
496 	}
497 	str += 8;
498 	end = str;
499 	while (*end) {
500 		if (isdigit(*end)) {
501 			*end = '\0';
502 			if (snprintf(finfo.dev_queue_read_ahead_path,
503 				     sizeof(finfo.dev_queue_read_ahead_path),
504 				     "/sys/block/%s/queue/read_ahead_kb",
505 				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
506 				printf("%s: Pathname is too long\n", __func__);
507 				exit(EXIT_FAILURE);
508 			}
509 			return;
510 		}
511 		++end;
512 	}
513 	printf("%s: Could not read: %s\n", __func__, path);
514 	exit(EXIT_FAILURE);
515 }
516 
check_swap(void * addr,unsigned long size)517 static bool check_swap(void *addr, unsigned long size)
518 {
519 	bool swap = false;
520 	int ret;
521 	FILE *fp;
522 	char buffer[MAX_LINE_LENGTH];
523 	char addr_pattern[MAX_LINE_LENGTH];
524 
525 	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
526 		       (unsigned long) addr);
527 	if (ret >= MAX_LINE_LENGTH) {
528 		printf("%s: Pattern is too long\n", __func__);
529 		exit(EXIT_FAILURE);
530 	}
531 
532 
533 	fp = fopen(PID_SMAPS, "r");
534 	if (!fp) {
535 		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
536 		exit(EXIT_FAILURE);
537 	}
538 	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
539 		goto err_out;
540 
541 	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
542 		       size >> 10);
543 	if (ret >= MAX_LINE_LENGTH) {
544 		printf("%s: Pattern is too long\n", __func__);
545 		exit(EXIT_FAILURE);
546 	}
547 	/*
548 	 * Fetch the Swap: in the same block and check whether it got
549 	 * the expected number of hugeepages next.
550 	 */
551 	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
552 		goto err_out;
553 
554 	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
555 		goto err_out;
556 
557 	swap = true;
558 err_out:
559 	fclose(fp);
560 	return swap;
561 }
562 
alloc_mapping(int nr)563 static void *alloc_mapping(int nr)
564 {
565 	void *p;
566 
567 	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
568 		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
569 	if (p != BASE_ADDR) {
570 		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
571 		exit(EXIT_FAILURE);
572 	}
573 
574 	return p;
575 }
576 
fill_memory(int * p,unsigned long start,unsigned long end)577 static void fill_memory(int *p, unsigned long start, unsigned long end)
578 {
579 	int i;
580 
581 	for (i = start / page_size; i < end / page_size; i++)
582 		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
583 }
584 
585 /*
586  * MADV_COLLAPSE is a best-effort request and may fail if an internal
587  * resource is temporarily unavailable, in which case it will set errno to
588  * EAGAIN.  In such a case, immediately reattempt the operation one more
589  * time.
590  */
madvise_collapse_retry(void * p,unsigned long size)591 static int madvise_collapse_retry(void *p, unsigned long size)
592 {
593 	bool retry = true;
594 	int ret;
595 
596 retry:
597 	ret = madvise(p, size, MADV_COLLAPSE);
598 	if (ret && errno == EAGAIN && retry) {
599 		retry = false;
600 		goto retry;
601 	}
602 	return ret;
603 }
604 
605 /*
606  * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
607  * validate_memory()'able contents.
608  */
alloc_hpage(struct mem_ops * ops)609 static void *alloc_hpage(struct mem_ops *ops)
610 {
611 	void *p = ops->setup_area(1);
612 
613 	ops->fault(p, 0, hpage_pmd_size);
614 
615 	/*
616 	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
617 	 * The latter is ineligible for collapse by MADV_COLLAPSE
618 	 * while the former might cause MADV_COLLAPSE to race with
619 	 * khugepaged on low-load system (like a test machine), which
620 	 * would cause MADV_COLLAPSE to fail with EAGAIN.
621 	 */
622 	printf("Allocate huge page...");
623 	if (madvise_collapse_retry(p, hpage_pmd_size)) {
624 		perror("madvise(MADV_COLLAPSE)");
625 		exit(EXIT_FAILURE);
626 	}
627 	if (!ops->check_huge(p, 1)) {
628 		perror("madvise(MADV_COLLAPSE)");
629 		exit(EXIT_FAILURE);
630 	}
631 	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
632 		perror("madvise(MADV_HUGEPAGE)");
633 		exit(EXIT_FAILURE);
634 	}
635 	success("OK");
636 	return p;
637 }
638 
validate_memory(int * p,unsigned long start,unsigned long end)639 static void validate_memory(int *p, unsigned long start, unsigned long end)
640 {
641 	int i;
642 
643 	for (i = start / page_size; i < end / page_size; i++) {
644 		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
645 			printf("Page %d is corrupted: %#x\n",
646 					i, p[i * page_size / sizeof(*p)]);
647 			exit(EXIT_FAILURE);
648 		}
649 	}
650 }
651 
/* mem_ops.setup_area for anonymous memory: a plain mapping at BASE_ADDR. */
static void *anon_setup_area(int nr_hpages)
{
	void *area = alloc_mapping(nr_hpages);

	return area;
}
656 
/* mem_ops.cleanup_area for anonymous memory: unmap @size bytes at @p. */
static void anon_cleanup_area(void *p, unsigned long size)
{
	(void) munmap(p, size);
}
661 
/*
 * mem_ops.fault for writable memory: populate [@start, @end) of @p by
 * writing the fill_memory() markers.
 */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	int *words = p;

	fill_memory(words, start, end);
}
666 
anon_check_huge(void * addr,int nr_hpages)667 static bool anon_check_huge(void *addr, int nr_hpages)
668 {
669 	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
670 }
671 
file_setup_area(int nr_hpages)672 static void *file_setup_area(int nr_hpages)
673 {
674 	int fd;
675 	void *p;
676 	unsigned long size;
677 
678 	unlink(finfo.path);  /* Cleanup from previous failed tests */
679 	printf("Creating %s for collapse%s...", finfo.path,
680 	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
681 	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
682 		  777);
683 	if (fd < 0) {
684 		perror("open()");
685 		exit(EXIT_FAILURE);
686 	}
687 
688 	size = nr_hpages * hpage_pmd_size;
689 	p = alloc_mapping(nr_hpages);
690 	fill_memory(p, 0, size);
691 	write(fd, p, size);
692 	close(fd);
693 	munmap(p, size);
694 	success("OK");
695 
696 	printf("Opening %s read only for collapse...", finfo.path);
697 	finfo.fd = open(finfo.path, O_RDONLY, 777);
698 	if (finfo.fd < 0) {
699 		perror("open()");
700 		exit(EXIT_FAILURE);
701 	}
702 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
703 		 MAP_PRIVATE, finfo.fd, 0);
704 	if (p == MAP_FAILED || p != BASE_ADDR) {
705 		perror("mmap()");
706 		exit(EXIT_FAILURE);
707 	}
708 
709 	/* Drop page cache */
710 	write_file("/proc/sys/vm/drop_caches", "3", 2);
711 	success("OK");
712 	return p;
713 }
714 
file_cleanup_area(void * p,unsigned long size)715 static void file_cleanup_area(void *p, unsigned long size)
716 {
717 	munmap(p, size);
718 	close(finfo.fd);
719 	unlink(finfo.path);
720 }
721 
file_fault(void * p,unsigned long start,unsigned long end)722 static void file_fault(void *p, unsigned long start, unsigned long end)
723 {
724 	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
725 		perror("madvise(MADV_POPULATE_READ");
726 		exit(EXIT_FAILURE);
727 	}
728 }
729 
file_check_huge(void * addr,int nr_hpages)730 static bool file_check_huge(void *addr, int nr_hpages)
731 {
732 	switch (finfo.type) {
733 	case VMA_FILE:
734 		return check_huge_file(addr, nr_hpages, hpage_pmd_size);
735 	case VMA_SHMEM:
736 		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
737 	default:
738 		exit(EXIT_FAILURE);
739 		return false;
740 	}
741 }
742 
shmem_setup_area(int nr_hpages)743 static void *shmem_setup_area(int nr_hpages)
744 {
745 	void *p;
746 	unsigned long size = nr_hpages * hpage_pmd_size;
747 
748 	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
749 	if (finfo.fd < 0)  {
750 		perror("memfd_create()");
751 		exit(EXIT_FAILURE);
752 	}
753 	if (ftruncate(finfo.fd, size)) {
754 		perror("ftruncate()");
755 		exit(EXIT_FAILURE);
756 	}
757 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
758 		 0);
759 	if (p != BASE_ADDR) {
760 		perror("mmap()");
761 		exit(EXIT_FAILURE);
762 	}
763 	return p;
764 }
765 
shmem_cleanup_area(void * p,unsigned long size)766 static void shmem_cleanup_area(void *p, unsigned long size)
767 {
768 	munmap(p, size);
769 	close(finfo.fd);
770 }
771 
shmem_check_huge(void * addr,int nr_hpages)772 static bool shmem_check_huge(void *addr, int nr_hpages)
773 {
774 	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
775 }
776 
777 static struct mem_ops __anon_ops = {
778 	.setup_area = &anon_setup_area,
779 	.cleanup_area = &anon_cleanup_area,
780 	.fault = &anon_fault,
781 	.check_huge = &anon_check_huge,
782 	.name = "anon",
783 };
784 
785 static struct mem_ops __file_ops = {
786 	.setup_area = &file_setup_area,
787 	.cleanup_area = &file_cleanup_area,
788 	.fault = &file_fault,
789 	.check_huge = &file_check_huge,
790 	.name = "file",
791 };
792 
793 static struct mem_ops __shmem_ops = {
794 	.setup_area = &shmem_setup_area,
795 	.cleanup_area = &shmem_cleanup_area,
796 	.fault = &anon_fault,
797 	.check_huge = &shmem_check_huge,
798 	.name = "shmem",
799 };
800 
__madvise_collapse(const char * msg,char * p,int nr_hpages,struct mem_ops * ops,bool expect)801 static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
802 			       struct mem_ops *ops, bool expect)
803 {
804 	int ret;
805 	struct settings settings = *current_settings();
806 
807 	printf("%s...", msg);
808 
809 	/*
810 	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
811 	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
812 	 */
813 	settings.thp_enabled = THP_NEVER;
814 	settings.shmem_enabled = SHMEM_NEVER;
815 	push_settings(&settings);
816 
817 	/* Clear VM_NOHUGEPAGE */
818 	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
819 	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
820 	if (((bool)ret) == expect)
821 		fail("Fail: Bad return value");
822 	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
823 		fail("Fail: check_huge()");
824 	else
825 		success("OK");
826 
827 	pop_settings();
828 }
829 
madvise_collapse(const char * msg,char * p,int nr_hpages,struct mem_ops * ops,bool expect)830 static void madvise_collapse(const char *msg, char *p, int nr_hpages,
831 			     struct mem_ops *ops, bool expect)
832 {
833 	/* Sanity check */
834 	if (!ops->check_huge(p, 0)) {
835 		printf("Unexpected huge page\n");
836 		exit(EXIT_FAILURE);
837 	}
838 	__madvise_collapse(msg, p, nr_hpages, ops, expect);
839 }
840 
841 #define TICK 500000
wait_for_scan(const char * msg,char * p,int nr_hpages,struct mem_ops * ops)842 static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
843 			  struct mem_ops *ops)
844 {
845 	int full_scans;
846 	int timeout = 6; /* 3 seconds */
847 
848 	/* Sanity check */
849 	if (!ops->check_huge(p, 0)) {
850 		printf("Unexpected huge page\n");
851 		exit(EXIT_FAILURE);
852 	}
853 
854 	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
855 
856 	/* Wait until the second full_scan completed */
857 	full_scans = read_num("khugepaged/full_scans") + 2;
858 
859 	printf("%s...", msg);
860 	while (timeout--) {
861 		if (ops->check_huge(p, nr_hpages))
862 			break;
863 		if (read_num("khugepaged/full_scans") >= full_scans)
864 			break;
865 		printf(".");
866 		usleep(TICK);
867 	}
868 
869 	madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
870 
871 	return timeout == -1;
872 }
873 
khugepaged_collapse(const char * msg,char * p,int nr_hpages,struct mem_ops * ops,bool expect)874 static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
875 				struct mem_ops *ops, bool expect)
876 {
877 	if (wait_for_scan(msg, p, nr_hpages, ops)) {
878 		if (expect)
879 			fail("Timeout");
880 		else
881 			success("OK");
882 		return;
883 	}
884 
885 	/*
886 	 * For file and shmem memory, khugepaged only retracts pte entries after
887 	 * putting the new hugepage in the page cache. The hugepage must be
888 	 * subsequently refaulted to install the pmd mapping for the mm.
889 	 */
890 	if (ops != &__anon_ops)
891 		ops->fault(p, 0, nr_hpages * hpage_pmd_size);
892 
893 	if (ops->check_huge(p, expect ? nr_hpages : 0))
894 		success("OK");
895 	else
896 		fail("Fail");
897 }
898 
899 static struct collapse_context __khugepaged_context = {
900 	.collapse = &khugepaged_collapse,
901 	.enforce_pte_scan_limits = true,
902 	.name = "khugepaged",
903 };
904 
905 static struct collapse_context __madvise_context = {
906 	.collapse = &madvise_collapse,
907 	.enforce_pte_scan_limits = false,
908 	.name = "madvise",
909 };
910 
is_tmpfs(struct mem_ops * ops)911 static bool is_tmpfs(struct mem_ops *ops)
912 {
913 	return ops == &__file_ops && finfo.type == VMA_SHMEM;
914 }
915 
alloc_at_fault(void)916 static void alloc_at_fault(void)
917 {
918 	struct settings settings = *current_settings();
919 	char *p;
920 
921 	settings.thp_enabled = THP_ALWAYS;
922 	push_settings(&settings);
923 
924 	p = alloc_mapping(1);
925 	*p = 1;
926 	printf("Allocate huge page on fault...");
927 	if (check_huge_anon(p, 1, hpage_pmd_size))
928 		success("OK");
929 	else
930 		fail("Fail");
931 
932 	pop_settings();
933 
934 	madvise(p, page_size, MADV_DONTNEED);
935 	printf("Split huge PMD on MADV_DONTNEED...");
936 	if (check_huge_anon(p, 0, hpage_pmd_size))
937 		success("OK");
938 	else
939 		fail("Fail");
940 	munmap(p, hpage_pmd_size);
941 }
942 
collapse_full(struct collapse_context * c,struct mem_ops * ops)943 static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
944 {
945 	void *p;
946 	int nr_hpages = 4;
947 	unsigned long size = nr_hpages * hpage_pmd_size;
948 
949 	p = ops->setup_area(nr_hpages);
950 	ops->fault(p, 0, size);
951 	c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
952 		    ops, true);
953 	validate_memory(p, 0, size);
954 	ops->cleanup_area(p, size);
955 }
956 
collapse_empty(struct collapse_context * c,struct mem_ops * ops)957 static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
958 {
959 	void *p;
960 
961 	p = ops->setup_area(1);
962 	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
963 	ops->cleanup_area(p, hpage_pmd_size);
964 }
965 
collapse_single_pte_entry(struct collapse_context * c,struct mem_ops * ops)966 static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
967 {
968 	void *p;
969 
970 	p = ops->setup_area(1);
971 	ops->fault(p, 0, page_size);
972 	c->collapse("Collapse PTE table with single PTE entry present", p,
973 		    1, ops, true);
974 	ops->cleanup_area(p, hpage_pmd_size);
975 }
976 
collapse_max_ptes_none(struct collapse_context * c,struct mem_ops * ops)977 static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
978 {
979 	int max_ptes_none = hpage_pmd_nr / 2;
980 	struct settings settings = *current_settings();
981 	void *p;
982 
983 	settings.khugepaged.max_ptes_none = max_ptes_none;
984 	push_settings(&settings);
985 
986 	p = ops->setup_area(1);
987 
988 	if (is_tmpfs(ops)) {
989 		/* shmem pages always in the page cache */
990 		printf("tmpfs...");
991 		skip("Skip");
992 		goto skip;
993 	}
994 
995 	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
996 	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
997 		    ops, !c->enforce_pte_scan_limits);
998 	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
999 
1000 	if (c->enforce_pte_scan_limits) {
1001 		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
1002 		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
1003 			    true);
1004 		validate_memory(p, 0,
1005 				(hpage_pmd_nr - max_ptes_none) * page_size);
1006 	}
1007 skip:
1008 	ops->cleanup_area(p, hpage_pmd_size);
1009 	pop_settings();
1010 }
1011 
collapse_swapin_single_pte(struct collapse_context * c,struct mem_ops * ops)1012 static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
1013 {
1014 	void *p;
1015 
1016 	p = ops->setup_area(1);
1017 	ops->fault(p, 0, hpage_pmd_size);
1018 
1019 	printf("Swapout one page...");
1020 	if (madvise(p, page_size, MADV_PAGEOUT)) {
1021 		perror("madvise(MADV_PAGEOUT)");
1022 		exit(EXIT_FAILURE);
1023 	}
1024 	if (check_swap(p, page_size)) {
1025 		success("OK");
1026 	} else {
1027 		fail("Fail");
1028 		goto out;
1029 	}
1030 
1031 	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
1032 		    true);
1033 	validate_memory(p, 0, hpage_pmd_size);
1034 out:
1035 	ops->cleanup_area(p, hpage_pmd_size);
1036 }
1037 
collapse_max_ptes_swap(struct collapse_context * c,struct mem_ops * ops)1038 static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
1039 {
1040 	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
1041 	void *p;
1042 
1043 	p = ops->setup_area(1);
1044 	ops->fault(p, 0, hpage_pmd_size);
1045 
1046 	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
1047 	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
1048 		perror("madvise(MADV_PAGEOUT)");
1049 		exit(EXIT_FAILURE);
1050 	}
1051 	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
1052 		success("OK");
1053 	} else {
1054 		fail("Fail");
1055 		goto out;
1056 	}
1057 
1058 	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
1059 		    !c->enforce_pte_scan_limits);
1060 	validate_memory(p, 0, hpage_pmd_size);
1061 
1062 	if (c->enforce_pte_scan_limits) {
1063 		ops->fault(p, 0, hpage_pmd_size);
1064 		printf("Swapout %d of %d pages...", max_ptes_swap,
1065 		       hpage_pmd_nr);
1066 		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
1067 			perror("madvise(MADV_PAGEOUT)");
1068 			exit(EXIT_FAILURE);
1069 		}
1070 		if (check_swap(p, max_ptes_swap * page_size)) {
1071 			success("OK");
1072 		} else {
1073 			fail("Fail");
1074 			goto out;
1075 		}
1076 
1077 		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
1078 			    1, ops, true);
1079 		validate_memory(p, 0, hpage_pmd_size);
1080 	}
1081 out:
1082 	ops->cleanup_area(p, hpage_pmd_size);
1083 }
1084 
collapse_single_pte_entry_compound(struct collapse_context * c,struct mem_ops * ops)1085 static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
1086 {
1087 	void *p;
1088 
1089 	p = alloc_hpage(ops);
1090 
1091 	if (is_tmpfs(ops)) {
1092 		/* MADV_DONTNEED won't evict tmpfs pages */
1093 		printf("tmpfs...");
1094 		skip("Skip");
1095 		goto skip;
1096 	}
1097 
1098 	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1099 	printf("Split huge page leaving single PTE mapping compound page...");
1100 	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
1101 	if (ops->check_huge(p, 0))
1102 		success("OK");
1103 	else
1104 		fail("Fail");
1105 
1106 	c->collapse("Collapse PTE table with single PTE mapping compound page",
1107 		    p, 1, ops, true);
1108 	validate_memory(p, 0, page_size);
1109 skip:
1110 	ops->cleanup_area(p, hpage_pmd_size);
1111 }
1112 
collapse_full_of_compound(struct collapse_context * c,struct mem_ops * ops)1113 static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
1114 {
1115 	void *p;
1116 
1117 	p = alloc_hpage(ops);
1118 	printf("Split huge page leaving single PTE page table full of compound pages...");
1119 	madvise(p, page_size, MADV_NOHUGEPAGE);
1120 	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1121 	if (ops->check_huge(p, 0))
1122 		success("OK");
1123 	else
1124 		fail("Fail");
1125 
1126 	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
1127 		    true);
1128 	validate_memory(p, 0, hpage_pmd_size);
1129 	ops->cleanup_area(p, hpage_pmd_size);
1130 }
1131 
/*
 * Build a PMD-sized range in which every base page comes from a different
 * huge page, then ask the context to collapse it.
 *
 * Each loop iteration faults a fresh huge page at BASE_ADDR and uses two
 * mremap() calls to keep only its first base page, appending it to a run of
 * single pages that grows downwards from BASE_ADDR.  After hpage_pmd_nr
 * iterations that run is hpage_pmd_size long and ends at BASE_ADDR.
 *
 * @c:   collapse context driving the collapse
 * @ops: memory-type operations for the area under test
 */
static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;
	int i;

	/*
	 * NOTE(review): the loop below addresses BASE_ADDR directly, so this
	 * presumably relies on setup_area() mapping at BASE_ADDR - confirm.
	 */
	p = ops->setup_area(1);
	for (i = 0; i < hpage_pmd_nr; i++) {
		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
				i + 1, hpage_pmd_nr);

		/* Populate the range at BASE_ADDR and require a huge page there. */
		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
		if (!ops->check_huge(BASE_ADDR, 1)) {
			printf("Failed to allocate huge page\n");
			exit(EXIT_FAILURE);
		}
		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);

		/*
		 * Step 1: take the i pages collected so far plus the new
		 * hpage_pmd_size range, shrink the whole thing to (i + 1)
		 * pages (dropping all but the first page of the new range)
		 * and park it at BASE_ADDR + 2 * hpage_pmd_size.
		 */
		p = mremap(BASE_ADDR - i * page_size,
				i * page_size + hpage_pmd_size,
				(i + 1) * page_size,
				MREMAP_MAYMOVE | MREMAP_FIXED,
				BASE_ADDR + 2 * hpage_pmd_size);
		if (p == MAP_FAILED) {
			perror("mremap+unmap");
			exit(EXIT_FAILURE);
		}

		/*
		 * Step 2: grow the parked mapping by hpage_pmd_size and move
		 * it so the (i + 1) collected pages end right at BASE_ADDR,
		 * with the newly added space starting at BASE_ADDR for the
		 * next iteration.
		 */
		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
				(i + 1) * page_size,
				(i + 1) * page_size + hpage_pmd_size,
				MREMAP_MAYMOVE | MREMAP_FIXED,
				BASE_ADDR - (i + 1) * page_size);
		if (p == MAP_FAILED) {
			perror("mremap+alloc");
			exit(EXIT_FAILURE);
		}
	}

	/*
	 * Drop the trailing range at BASE_ADDR; p now refers to the
	 * hpage_pmd_size run of collected pages just below it.
	 */
	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
	ops->fault(p, 0, hpage_pmd_size);
	/* Success here means the constructed range is NOT a huge page yet. */
	if (!ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of different compound pages", p, 1,
		    ops, true);

	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
1184 
collapse_fork(struct collapse_context * c,struct mem_ops * ops)1185 static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
1186 {
1187 	int wstatus;
1188 	void *p;
1189 
1190 	p = ops->setup_area(1);
1191 
1192 	printf("Allocate small page...");
1193 	ops->fault(p, 0, page_size);
1194 	if (ops->check_huge(p, 0))
1195 		success("OK");
1196 	else
1197 		fail("Fail");
1198 
1199 	printf("Share small page over fork()...");
1200 	if (!fork()) {
1201 		/* Do not touch settings on child exit */
1202 		skip_settings_restore = true;
1203 		exit_status = 0;
1204 
1205 		if (ops->check_huge(p, 0))
1206 			success("OK");
1207 		else
1208 			fail("Fail");
1209 
1210 		ops->fault(p, page_size, 2 * page_size);
1211 		c->collapse("Collapse PTE table with single page shared with parent process",
1212 			    p, 1, ops, true);
1213 
1214 		validate_memory(p, 0, page_size);
1215 		ops->cleanup_area(p, hpage_pmd_size);
1216 		exit(exit_status);
1217 	}
1218 
1219 	wait(&wstatus);
1220 	exit_status += WEXITSTATUS(wstatus);
1221 
1222 	printf("Check if parent still has small page...");
1223 	if (ops->check_huge(p, 0))
1224 		success("OK");
1225 	else
1226 		fail("Fail");
1227 	validate_memory(p, 0, page_size);
1228 	ops->cleanup_area(p, hpage_pmd_size);
1229 }
1230 
collapse_fork_compound(struct collapse_context * c,struct mem_ops * ops)1231 static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
1232 {
1233 	int wstatus;
1234 	void *p;
1235 
1236 	p = alloc_hpage(ops);
1237 	printf("Share huge page over fork()...");
1238 	if (!fork()) {
1239 		/* Do not touch settings on child exit */
1240 		skip_settings_restore = true;
1241 		exit_status = 0;
1242 
1243 		if (ops->check_huge(p, 1))
1244 			success("OK");
1245 		else
1246 			fail("Fail");
1247 
1248 		printf("Split huge page PMD in child process...");
1249 		madvise(p, page_size, MADV_NOHUGEPAGE);
1250 		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1251 		if (ops->check_huge(p, 0))
1252 			success("OK");
1253 		else
1254 			fail("Fail");
1255 		ops->fault(p, 0, page_size);
1256 
1257 		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
1258 		c->collapse("Collapse PTE table full of compound pages in child",
1259 			    p, 1, ops, true);
1260 		write_num("khugepaged/max_ptes_shared",
1261 			  current_settings()->khugepaged.max_ptes_shared);
1262 
1263 		validate_memory(p, 0, hpage_pmd_size);
1264 		ops->cleanup_area(p, hpage_pmd_size);
1265 		exit(exit_status);
1266 	}
1267 
1268 	wait(&wstatus);
1269 	exit_status += WEXITSTATUS(wstatus);
1270 
1271 	printf("Check if parent still has huge page...");
1272 	if (ops->check_huge(p, 1))
1273 		success("OK");
1274 	else
1275 		fail("Fail");
1276 	validate_memory(p, 0, hpage_pmd_size);
1277 	ops->cleanup_area(p, hpage_pmd_size);
1278 }
1279 
collapse_max_ptes_shared(struct collapse_context * c,struct mem_ops * ops)1280 static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
1281 {
1282 	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
1283 	int wstatus;
1284 	void *p;
1285 
1286 	p = alloc_hpage(ops);
1287 	printf("Share huge page over fork()...");
1288 	if (!fork()) {
1289 		/* Do not touch settings on child exit */
1290 		skip_settings_restore = true;
1291 		exit_status = 0;
1292 
1293 		if (ops->check_huge(p, 1))
1294 			success("OK");
1295 		else
1296 			fail("Fail");
1297 
1298 		printf("Trigger CoW on page %d of %d...",
1299 				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
1300 		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
1301 		if (ops->check_huge(p, 0))
1302 			success("OK");
1303 		else
1304 			fail("Fail");
1305 
1306 		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
1307 			    1, ops, !c->enforce_pte_scan_limits);
1308 
1309 		if (c->enforce_pte_scan_limits) {
1310 			printf("Trigger CoW on page %d of %d...",
1311 			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
1312 			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
1313 				    page_size);
1314 			if (ops->check_huge(p, 0))
1315 				success("OK");
1316 			else
1317 				fail("Fail");
1318 
1319 			c->collapse("Collapse with max_ptes_shared PTEs shared",
1320 				    p, 1, ops, true);
1321 		}
1322 
1323 		validate_memory(p, 0, hpage_pmd_size);
1324 		ops->cleanup_area(p, hpage_pmd_size);
1325 		exit(exit_status);
1326 	}
1327 
1328 	wait(&wstatus);
1329 	exit_status += WEXITSTATUS(wstatus);
1330 
1331 	printf("Check if parent still has huge page...");
1332 	if (ops->check_huge(p, 1))
1333 		success("OK");
1334 	else
1335 		fail("Fail");
1336 	validate_memory(p, 0, hpage_pmd_size);
1337 	ops->cleanup_area(p, hpage_pmd_size);
1338 }
1339 
madvise_collapse_existing_thps(struct collapse_context * c,struct mem_ops * ops)1340 static void madvise_collapse_existing_thps(struct collapse_context *c,
1341 					   struct mem_ops *ops)
1342 {
1343 	void *p;
1344 
1345 	p = ops->setup_area(1);
1346 	ops->fault(p, 0, hpage_pmd_size);
1347 	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
1348 	validate_memory(p, 0, hpage_pmd_size);
1349 
1350 	/* c->collapse() will find a hugepage and complain - call directly. */
1351 	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
1352 	validate_memory(p, 0, hpage_pmd_size);
1353 	ops->cleanup_area(p, hpage_pmd_size);
1354 }
1355 
1356 /*
1357  * Test race with khugepaged where page tables have been retracted and
1358  * pmd cleared.
1359  */
madvise_retracted_page_tables(struct collapse_context * c,struct mem_ops * ops)1360 static void madvise_retracted_page_tables(struct collapse_context *c,
1361 					  struct mem_ops *ops)
1362 {
1363 	void *p;
1364 	int nr_hpages = 1;
1365 	unsigned long size = nr_hpages * hpage_pmd_size;
1366 
1367 	p = ops->setup_area(nr_hpages);
1368 	ops->fault(p, 0, size);
1369 
1370 	/* Let khugepaged collapse and leave pmd cleared */
1371 	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
1372 			  ops)) {
1373 		fail("Timeout");
1374 		return;
1375 	}
1376 	success("OK");
1377 	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
1378 		    true);
1379 	validate_memory(p, 0, size);
1380 	ops->cleanup_area(p, size);
1381 }
1382 
/*
 * Print the command-line help to stderr and terminate with exit status 1.
 * This function does not return.
 */
static void usage(void)
{
	/* Emitted in order, exactly as written. */
	static const char *const help_lines[] = {
		"\nUsage: ./khugepaged <test type> [dir]\n\n",
		"\t<test type>\t: <context>:<mem_type>\n",
		"\t<context>\t: [all|khugepaged|madvise]\n",
		"\t<mem_type>\t: [all|anon|file|shmem]\n",
		"\n\t\"file,all\" mem_type requires [dir] argument\n",
		"\n\t\"file,all\" mem_type requires kernel built with\n",
		"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n",
		"\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n",
		"\tmounted with huge=madvise option for khugepaged tests to work\n",
	};
	const size_t nr_lines = sizeof(help_lines) / sizeof(help_lines[0]);
	size_t idx;

	for (idx = 0; idx < nr_lines; idx++)
		fputs(help_lines[idx], stderr);

	exit(1);
}
1396 
parse_test_type(int argc,const char ** argv)1397 static void parse_test_type(int argc, const char **argv)
1398 {
1399 	char *buf;
1400 	const char *token;
1401 
1402 	if (argc == 1) {
1403 		/* Backwards compatibility */
1404 		khugepaged_context =  &__khugepaged_context;
1405 		madvise_context =  &__madvise_context;
1406 		anon_ops = &__anon_ops;
1407 		return;
1408 	}
1409 
1410 	buf = strdup(argv[1]);
1411 	token = strsep(&buf, ":");
1412 
1413 	if (!strcmp(token, "all")) {
1414 		khugepaged_context =  &__khugepaged_context;
1415 		madvise_context =  &__madvise_context;
1416 	} else if (!strcmp(token, "khugepaged")) {
1417 		khugepaged_context =  &__khugepaged_context;
1418 	} else if (!strcmp(token, "madvise")) {
1419 		madvise_context =  &__madvise_context;
1420 	} else {
1421 		usage();
1422 	}
1423 
1424 	if (!buf)
1425 		usage();
1426 
1427 	if (!strcmp(buf, "all")) {
1428 		file_ops =  &__file_ops;
1429 		anon_ops = &__anon_ops;
1430 		shmem_ops = &__shmem_ops;
1431 	} else if (!strcmp(buf, "anon")) {
1432 		anon_ops = &__anon_ops;
1433 	} else if (!strcmp(buf, "file")) {
1434 		file_ops =  &__file_ops;
1435 	} else if (!strcmp(buf, "shmem")) {
1436 		shmem_ops = &__shmem_ops;
1437 	} else {
1438 		usage();
1439 	}
1440 
1441 	if (!file_ops)
1442 		return;
1443 
1444 	if (argc != 3)
1445 		usage();
1446 }
1447 
main(int argc,const char ** argv)1448 int main(int argc, const char **argv)
1449 {
1450 	struct settings default_settings = {
1451 		.thp_enabled = THP_MADVISE,
1452 		.thp_defrag = THP_DEFRAG_ALWAYS,
1453 		.shmem_enabled = SHMEM_ADVISE,
1454 		.use_zero_page = 0,
1455 		.khugepaged = {
1456 			.defrag = 1,
1457 			.alloc_sleep_millisecs = 10,
1458 			.scan_sleep_millisecs = 10,
1459 		},
1460 		/*
1461 		 * When testing file-backed memory, the collapse path
1462 		 * looks at how many pages are found in the page cache, not
1463 		 * what pages are mapped. Disable read ahead optimization so
1464 		 * pages don't find their way into the page cache unless
1465 		 * we mem_ops->fault() them in.
1466 		 */
1467 		.read_ahead_kb = 0,
1468 	};
1469 
1470 	parse_test_type(argc, argv);
1471 
1472 	if (file_ops)
1473 		get_finfo(argv[2]);
1474 
1475 	setbuf(stdout, NULL);
1476 
1477 	page_size = getpagesize();
1478 	hpage_pmd_size = read_pmd_pagesize();
1479 	hpage_pmd_nr = hpage_pmd_size / page_size;
1480 
1481 	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
1482 	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
1483 	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
1484 	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
1485 
1486 	save_settings();
1487 	push_settings(&default_settings);
1488 
1489 	alloc_at_fault();
1490 
1491 #define TEST(t, c, o) do { \
1492 	if (c && o) { \
1493 		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
1494 		t(c, o); \
1495 	} \
1496 	} while (0)
1497 
1498 	TEST(collapse_full, khugepaged_context, anon_ops);
1499 	TEST(collapse_full, khugepaged_context, file_ops);
1500 	TEST(collapse_full, khugepaged_context, shmem_ops);
1501 	TEST(collapse_full, madvise_context, anon_ops);
1502 	TEST(collapse_full, madvise_context, file_ops);
1503 	TEST(collapse_full, madvise_context, shmem_ops);
1504 
1505 	TEST(collapse_empty, khugepaged_context, anon_ops);
1506 	TEST(collapse_empty, madvise_context, anon_ops);
1507 
1508 	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
1509 	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
1510 	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
1511 	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
1512 	TEST(collapse_single_pte_entry, madvise_context, file_ops);
1513 	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
1514 
1515 	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
1516 	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
1517 	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
1518 	TEST(collapse_max_ptes_none, madvise_context, file_ops);
1519 
1520 	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
1521 	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
1522 	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
1523 	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
1524 
1525 	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
1526 	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
1527 	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
1528 	TEST(collapse_full_of_compound, madvise_context, anon_ops);
1529 	TEST(collapse_full_of_compound, madvise_context, file_ops);
1530 	TEST(collapse_full_of_compound, madvise_context, shmem_ops);
1531 
1532 	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
1533 	TEST(collapse_compound_extreme, madvise_context, anon_ops);
1534 
1535 	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
1536 	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
1537 
1538 	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
1539 	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
1540 
1541 	TEST(collapse_fork, khugepaged_context, anon_ops);
1542 	TEST(collapse_fork, madvise_context, anon_ops);
1543 
1544 	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
1545 	TEST(collapse_fork_compound, madvise_context, anon_ops);
1546 
1547 	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
1548 	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
1549 
1550 	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
1551 	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
1552 	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
1553 
1554 	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
1555 	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
1556 
1557 	restore_settings(0);
1558 }
1559