Lines Matching +full:- +full:2 +full:g

1 // SPDX-License-Identifier: GPL-2.0
5 * numa: Simulate NUMA-sensitive workloads and measure their NUMA performance
10 #include <subcmd/parse-options.h>
45 * Regular printout to the terminal, suppressed if -q is specified:
47 #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)
53 #define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
128 /* Affinity options -C and -N: */
134 /* Global, read-writable area, accessible to all processes and threads: */
162 static struct global_info *g = NULL; variable
173 OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"),
182 OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via reads (can be mixed with -W)"),
183 OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
201 OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
230 for (i = 0; i < g->p.nr_nodes; i++) { in nr_numa_nodes()
257 for (cpu = 0; cpu < (int)cpumask->size; cpu++) { in node_has_cpus()
289 if (target_cpu == -1) { in bind_to_cpu()
292 for (cpu = 0; cpu < g->p.nr_cpus; cpu++) in bind_to_cpu()
295 if (target_cpu < 0 || target_cpu >= g->p.nr_cpus) in bind_to_cpu()
312 BUG_ON(-1); in bind_to_cpu()
338 for (cpu = 0; cpu < g->p.nr_cpus; cpu++) in bind_to_node()
347 for (cpu = 0; cpu < (int)cpumask->size; cpu++) { in bind_to_node()
366 BUG_ON(-1); in bind_to_node()
386 ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1); in mempol_restore()
405 ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1); in bind_to_memnode()
406 dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret); in bind_to_memnode()
412 #define HPSIZE (2*1024*1024)
443 buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0); in alloc_data()
444 BUG_ON(buf == (void *)-1); in alloc_data()
449 if (ret && !g->print_once) { in alloc_data()
450 g->print_once = 1; in alloc_data()
451 printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n"); in alloc_data()
456 if (ret && !g->print_once) { in alloc_data()
457 g->print_once = 1; in alloc_data()
477 /* Align to 2MB boundary: */ in alloc_data()
478 buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1)); in alloc_data()
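The expression above rounds the buffer start up to the next HPSIZE (2MB) boundary so it can be backed by transparent huge pages. A minimal standalone sketch of that align-up arithmetic, assuming only that HPSIZE is a power of two (the helper name and sample address are illustrative):

#include <stdio.h>

#define HPSIZE (2UL*1024*1024)	/* 2MB, as in the listing above */

/* Round an address up to the next HPSIZE boundary; works because HPSIZE is a power of two. */
static unsigned long align_up_hpsize(unsigned long addr)
{
	return (addr + HPSIZE - 1) & ~(HPSIZE - 1);
}

int main(void)
{
	/* 0x200001 is just past a 2MB boundary, so it rounds up to 0x400000. */
	printf("%#lx -> %#lx\n", 0x200001UL, align_up_hpsize(0x200001UL));
	return 0;
}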
506 return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random); in zalloc_shared_data()
514 return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); in setup_shared_data()
518 * Allocate process-local memory - this will either be shared between
523 return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); in setup_private_data()
541 if (!g->p.cpu_list_str) in parse_setup_cpu_list()
544 dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); in parse_setup_cpu_list()
546 str0 = str = strdup(g->p.cpu_list_str); in parse_setup_cpu_list()
565 tok_end = strstr(tok, "-"); in parse_setup_cpu_list()
572 /* CPU range specified (for example: "5-11"): */ in parse_setup_cpu_list()
581 BUG_ON(step <= 0 || step >= g->p.nr_cpus); in parse_setup_cpu_list()
586 * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4', in parse_setup_cpu_list()
593 BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus); in parse_setup_cpu_list()
604 dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul); in parse_setup_cpu_list()
606 if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) { in parse_setup_cpu_list()
607 printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus); in parse_setup_cpu_list()
608 return -1; in parse_setup_cpu_list()
613 return -1; in parse_setup_cpu_list()
620 size_t size = CPU_ALLOC_SIZE(g->p.nr_cpus); in parse_setup_cpu_list()
626 if (t >= g->p.nr_tasks) { in parse_setup_cpu_list()
630 td = g->threads + t; in parse_setup_cpu_list()
635 tprintf("%2d/%d", bind_cpu, bind_len); in parse_setup_cpu_list()
637 tprintf("%2d", bind_cpu); in parse_setup_cpu_list()
640 td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus); in parse_setup_cpu_list()
641 BUG_ON(!td->bind_cpumask); in parse_setup_cpu_list()
642 CPU_ZERO_S(size, td->bind_cpumask); in parse_setup_cpu_list()
644 if (cpu < 0 || cpu >= g->p.nr_cpus) { in parse_setup_cpu_list()
645 CPU_FREE(td->bind_cpumask); in parse_setup_cpu_list()
646 BUG_ON(-1); in parse_setup_cpu_list()
648 CPU_SET_S(cpu, size, td->bind_cpumask); in parse_setup_cpu_list()
658 if (t < g->p.nr_tasks) in parse_setup_cpu_list()
659 printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t); in parse_setup_cpu_list()
669 return -1; in parse_cpus_opt()
689 if (!g->p.node_list_str) in parse_setup_node_list()
692 dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); in parse_setup_node_list()
694 str0 = str = strdup(g->p.node_list_str); in parse_setup_node_list()
712 tok_end = strstr(tok, "-"); in parse_setup_node_list()
719 /* NODE range specified (for example: "5-11"): */ in parse_setup_node_list()
728 BUG_ON(step <= 0 || step >= g->p.nr_nodes); in parse_setup_node_list()
739 dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step); in parse_setup_node_list()
741 if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) { in parse_setup_node_list()
742 printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes); in parse_setup_node_list()
743 return -1; in parse_setup_node_list()
753 if (t >= g->p.nr_tasks || !node_has_cpus(bind_node)) { in parse_setup_node_list()
757 td = g->threads + t; in parse_setup_node_list()
760 tprintf(" %2d", bind_node); in parse_setup_node_list()
762 tprintf(",%2d", bind_node); in parse_setup_node_list()
764 td->bind_node = bind_node; in parse_setup_node_list()
773 if (t < g->p.nr_tasks) in parse_setup_node_list()
774 printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t); in parse_setup_node_list()
784 return -1; in parse_nodes_opt()
792 return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps); in lfsr_32()
803 if (g->p.data_reads) in access_data()
805 if (g->p.data_writes) in access_data()
834 if (g->p.data_zero_memset && !g->p.data_rand_walk) { in do_work()
841 chunk_1 = words/g->p.nr_loops; in do_work()
845 off -= words; in do_work()
847 if (g->p.data_rand_walk) { in do_work()
857 end = min(start + 1024, words-1); in do_work()
859 if (g->p.data_zero_memset) { in do_work()
860 bzero(data + start, (end-start) * sizeof(u64)); in do_work()
866 } else if (!g->p.data_backwards || (nr + loop) & 1) { in do_work()
887 d = data + off - 1; in do_work()
892 d = data + words-1; in do_work()
898 d--; in do_work()
911 g->threads[task_nr].curr_cpu = cpu; in update_curr_cpu()
920 * to a single node. A count of g->p.nr_nodes means it's
929 node_present = (char *)malloc(g->p.nr_nodes * sizeof(char)); in count_process_nodes()
931 for (nodes = 0; nodes < g->p.nr_nodes; nodes++) in count_process_nodes()
934 for (t = 0; t < g->p.nr_threads; t++) { in count_process_nodes()
939 task_nr = process_nr*g->p.nr_threads + t; in count_process_nodes()
940 td = g->threads + task_nr; in count_process_nodes()
942 node = numa_node_of_cpu(td->curr_cpu); in count_process_nodes()
943 if (node < 0) /* curr_cpu was likely still -1 */ { in count_process_nodes()
953 for (n = 0; n < g->p.nr_nodes; n++) in count_process_nodes()
961 * Count the number of distinct process-threads a node contains.
965 * process then we are well-converged.
972 for (p = 0; p < g->p.nr_proc; p++) { in count_node_processes()
973 for (t = 0; t < g->p.nr_threads; t++) { in count_node_processes()
978 task_nr = p*g->p.nr_threads + t; in count_node_processes()
979 td = g->threads + task_nr; in count_node_processes()
981 n = numa_node_of_cpu(td->curr_cpu); in count_node_processes()
997 nodes_min = -1; in calc_convergence_compression()
1000 for (p = 0; p < g->p.nr_proc; p++) { in calc_convergence_compression()
1017 tprintf(" {%d-%d}", nodes_min, nodes_max); in calc_convergence_compression()
1036 if (!g->p.show_convergence && !g->p.measure_convergence) in calc_convergence()
1039 nodes = (int *)malloc(g->p.nr_nodes * sizeof(int)); in calc_convergence()
1041 for (node = 0; node < g->p.nr_nodes; node++) in calc_convergence()
1044 loops_done_min = -1; in calc_convergence()
1047 for (t = 0; t < g->p.nr_tasks; t++) { in calc_convergence()
1048 struct thread_data *td = g->threads + t; in calc_convergence()
1051 cpu = td->curr_cpu; in calc_convergence()
1061 loops_done = td->loops_done; in calc_convergence()
1067 nr_min = g->p.nr_tasks; in calc_convergence()
1070 for (node = 0; node < g->p.nr_nodes; node++) { in calc_convergence()
1080 BUG_ON(sum > g->p.nr_tasks); in calc_convergence()
1082 if (0 && (sum < g->p.nr_tasks)) { in calc_convergence()
1089 * on nodes - when we are converged this will decrease in calc_convergence()
1090 * to g->p.nr_proc: in calc_convergence()
1094 for (node = 0; node < g->p.nr_nodes; node++) { in calc_convergence()
1101 tprintf(" %2d/%-2d", nr, processes); in calc_convergence()
1106 distance = nr_max - nr_min; in calc_convergence()
1108 tprintf(" [%2d/%-2d]", distance, process_groups); in calc_convergence()
1110 tprintf(" l:%3d-%-3d (%3d)", in calc_convergence()
1111 loops_done_min, loops_done_max, loops_done_max-loops_done_min); in calc_convergence()
1114 double skew = 1.0 - (double)loops_done_min/loops_done_max; in calc_convergence()
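The skew above measures how far the slowest task lags the fastest one in completed loops. A tiny worked example with illustrative loop counts:

#include <stdio.h>

int main(void)
{
	/* Illustrative: slowest task has done 90 loops, fastest 100. */
	double loops_done_min = 90.0, loops_done_max = 100.0;
	double skew = 1.0 - loops_done_min / loops_done_max;

	printf("skew = %.2f\n", skew);	/* 0.10: the slowest task is 10 percent behind */
	return 0;
}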
1121 if (strong && process_groups == g->p.nr_proc) { in calc_convergence()
1125 if (g->p.measure_convergence) { in calc_convergence()
1126 g->all_converged = true; in calc_convergence()
1127 g->stop_work = true; in calc_convergence()
1132 tprintf(" (%6.1fs de-converged)", runtime_ns_max / NSEC_PER_SEC); in calc_convergence()
1144 (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max / NSEC_PER_SEC / 60.0); in show_summary()
1148 if (g->p.show_details >= 0) in show_summary()
1156 int process_nr = td->process_nr; in worker_thread()
1157 int thread_nr = td->thread_nr; in worker_thread()
1159 int task_nr = td->task_nr; in worker_thread()
1160 int details = g->p.show_details; in worker_thread()
1163 u64 val = td->val; in worker_thread()
1173 bind_to_cpumask(td->bind_cpumask); in worker_thread()
1174 bind_to_memnode(td->bind_node); in worker_thread()
1178 global_data = g->data; in worker_thread()
1179 process_data = td->process_data; in worker_thread()
1180 thread_data = setup_private_data(g->p.bytes_thread); in worker_thread()
1185 if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1) in worker_thread()
1192 if (details >= 2) { in worker_thread()
1193 printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n", in worker_thread()
1197 if (g->p.serialize_startup) { in worker_thread()
1198 mutex_lock(&g->startup_mutex); in worker_thread()
1199 g->nr_tasks_started++; in worker_thread()
1201 if (g->nr_tasks_started == g->p.nr_tasks) in worker_thread()
1202 cond_signal(&g->startup_cond); in worker_thread()
1204 mutex_unlock(&g->startup_mutex); in worker_thread()
1207 mutex_lock(&g->start_work_mutex); in worker_thread()
1208 g->start_work = false; in worker_thread()
1209 g->nr_tasks_working++; in worker_thread()
1210 while (!g->start_work) in worker_thread()
1211 cond_wait(&g->start_work_cond, &g->start_work_mutex); in worker_thread()
1213 mutex_unlock(&g->start_work_mutex); in worker_thread()
1221 for (l = 0; l < g->p.nr_loops; l++) { in worker_thread()
1224 if (g->stop_work) in worker_thread()
1227 val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val); in worker_thread()
1228 val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val); in worker_thread()
1229 val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); in worker_thread()
1231 if (g->p.sleep_usecs) { in worker_thread()
1232 mutex_lock(td->process_lock); in worker_thread()
1233 usleep(g->p.sleep_usecs); in worker_thread()
1234 mutex_unlock(td->process_lock); in worker_thread()
1237 * Amount of work to be done under a process-global lock: in worker_thread()
1239 if (g->p.bytes_process_locked) { in worker_thread()
1240 mutex_lock(td->process_lock); in worker_thread()
1241 val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); in worker_thread()
1242 mutex_unlock(td->process_lock); in worker_thread()
1245 work_done = g->p.bytes_global + g->p.bytes_process + in worker_thread()
1246 g->p.bytes_process_locked + g->p.bytes_thread; in worker_thread()
1251 if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs) in worker_thread()
1254 td->loops_done = l; in worker_thread()
1259 if (g->p.nr_secs) { in worker_thread()
1261 if ((u32)diff.tv_sec >= g->p.nr_secs) { in worker_thread()
1262 g->stop_work = true; in worker_thread()
1272 * Perturb the first task's equilibrium every g->p.perturb_secs seconds, in worker_thread()
1275 …if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs)… in worker_thread()
1287 this_cpu = g->threads[task_nr].curr_cpu; in worker_thread()
1288 if (this_cpu < g->p.nr_cpus/2) in worker_thread()
1289 target_cpu = g->p.nr_cpus-1; in worker_thread()
1309 printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n", in worker_thread()
1326 td->runtime_ns = diff.tv_sec * NSEC_PER_SEC; in worker_thread()
1327 td->runtime_ns += diff.tv_usec * NSEC_PER_USEC; in worker_thread()
1328 secs = td->runtime_ns / NSEC_PER_SEC; in worker_thread()
1329 td->speed_gbs = secs ? bytes_done / secs / 1e9 : 0; in worker_thread()
1332 td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC; in worker_thread()
1333 td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC; in worker_thread()
1334 td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC; in worker_thread()
1335 td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC; in worker_thread()
1337 free_data(thread_data, g->p.bytes_thread); in worker_thread()
1339 mutex_lock(&g->stop_work_mutex); in worker_thread()
1340 g->bytes_done += bytes_done; in worker_thread()
1341 mutex_unlock(&g->stop_work_mutex); in worker_thread()
1366 task_nr = process_nr*g->p.nr_threads; in worker_process()
1367 td = g->threads + task_nr; in worker_process()
1369 bind_to_memnode(td->bind_node); in worker_process()
1370 bind_to_cpumask(td->bind_cpumask); in worker_process()
1372 pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t)); in worker_process()
1373 process_data = setup_private_data(g->p.bytes_process); in worker_process()
1375 if (g->p.show_details >= 3) { in worker_process()
1376 printf(" # process %2d global mem: %p, process mem: %p\n", in worker_process()
1377 process_nr, g->data, process_data); in worker_process()
1380 for (t = 0; t < g->p.nr_threads; t++) { in worker_process()
1381 task_nr = process_nr*g->p.nr_threads + t; in worker_process()
1382 td = g->threads + task_nr; in worker_process()
1384 td->process_data = process_data; in worker_process()
1385 td->process_nr = process_nr; in worker_process()
1386 td->thread_nr = t; in worker_process()
1387 td->task_nr = task_nr; in worker_process()
1388 td->val = rand(); in worker_process()
1389 td->curr_cpu = -1; in worker_process()
1390 td->process_lock = &process_lock; in worker_process()
1396 for (t = 0; t < g->p.nr_threads; t++) { in worker_process()
1401 free_data(process_data, g->p.bytes_process); in worker_process()
1407 if (g->p.show_details < 0) in print_summary()
1412 g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", nr_numa_nodes(), g->p.nr_cpus); in print_summary()
1414 g->p.nr_loops, g->p.bytes_global/1024/1024); in print_summary()
1416 g->p.nr_loops, g->p.bytes_process/1024/1024); in print_summary()
1418 g->p.nr_loops, g->p.bytes_thread/1024/1024); in print_summary()
1427 ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; in init_thread_data()
1430 g->threads = zalloc_shared_data(size); in init_thread_data()
1432 for (t = 0; t < g->p.nr_tasks; t++) { in init_thread_data()
1433 struct thread_data *td = g->threads + t; in init_thread_data()
1434 size_t cpuset_size = CPU_ALLOC_SIZE(g->p.nr_cpus); in init_thread_data()
1438 td->bind_node = NUMA_NO_NODE; in init_thread_data()
1441 td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus); in init_thread_data()
1442 BUG_ON(!td->bind_cpumask); in init_thread_data()
1443 CPU_ZERO_S(cpuset_size, td->bind_cpumask); in init_thread_data()
1444 for (cpu = 0; cpu < g->p.nr_cpus; cpu++) in init_thread_data()
1445 CPU_SET_S(cpu, cpuset_size, td->bind_cpumask); in init_thread_data()
1451 ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; in deinit_thread_data()
1455 for (t = 0; t < g->p.nr_tasks; t++) { in deinit_thread_data()
1456 struct thread_data *td = g->threads + t; in deinit_thread_data()
1457 CPU_FREE(td->bind_cpumask); in deinit_thread_data()
1460 free_data(g->threads, size); in deinit_thread_data()
1465 g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0); in init()
1468 g->p = p0; in init()
1470 g->p.nr_cpus = numa_num_configured_cpus(); in init()
1472 g->p.nr_nodes = numa_max_node() + 1; in init()
1475 BUG_ON(g->p.nr_nodes < 0); in init()
1477 if (g->p.show_quiet && !g->p.show_details) in init()
1478 g->p.show_details = -1; in init()
1481 if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str) in init()
1482 return -1; in init()
1484 if (g->p.mb_global_str) { in init()
1485 g->p.mb_global = atof(g->p.mb_global_str); in init()
1486 BUG_ON(g->p.mb_global < 0); in init()
1489 if (g->p.mb_proc_str) { in init()
1490 g->p.mb_proc = atof(g->p.mb_proc_str); in init()
1491 BUG_ON(g->p.mb_proc < 0); in init()
1494 if (g->p.mb_proc_locked_str) { in init()
1495 g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str); in init()
1496 BUG_ON(g->p.mb_proc_locked < 0); in init()
1497 BUG_ON(g->p.mb_proc_locked > g->p.mb_proc); in init()
1500 if (g->p.mb_thread_str) { in init()
1501 g->p.mb_thread = atof(g->p.mb_thread_str); in init()
1502 BUG_ON(g->p.mb_thread < 0); in init()
1505 BUG_ON(g->p.nr_threads <= 0); in init()
1506 BUG_ON(g->p.nr_proc <= 0); in init()
1508 g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads; in init()
1510 g->p.bytes_global = g->p.mb_global *1024L*1024L; in init()
1511 g->p.bytes_process = g->p.mb_proc *1024L*1024L; in init()
1512 g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L; in init()
1513 g->p.bytes_thread = g->p.mb_thread *1024L*1024L; in init()
1515 g->data = setup_shared_data(g->p.bytes_global); in init()
1518 mutex_init_pshared(&g->start_work_mutex); in init()
1519 cond_init_pshared(&g->start_work_cond); in init()
1520 mutex_init_pshared(&g->startup_mutex); in init()
1521 cond_init_pshared(&g->startup_cond); in init()
1522 mutex_init_pshared(&g->stop_work_mutex); in init()
1528 return -1; in init()
1538 free_data(g->data, g->p.bytes_global); in deinit()
1539 g->data = NULL; in deinit()
1543 free_data(g, sizeof(*g)); in deinit()
1544 g = NULL; in deinit()
1556 if (!g->p.show_quiet) in print_res()
1557 printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); in print_res()
1576 return -1; in __bench_numa()
1578 pids = zalloc(g->p.nr_proc * sizeof(*pids)); in __bench_numa()
1579 pid = -1; in __bench_numa()
1581 if (g->p.serialize_startup) { in __bench_numa()
1588 for (i = 0; i < g->p.nr_proc; i++) { in __bench_numa()
1590 dprintf(" # process %2d: PID %d\n", i, pid); in __bench_numa()
1603 if (g->p.serialize_startup) { in __bench_numa()
1611 mutex_lock(&g->startup_mutex); in __bench_numa()
1612 while (g->nr_tasks_started != g->p.nr_tasks) in __bench_numa()
1613 cond_wait(&g->startup_cond, &g->startup_mutex); in __bench_numa()
1615 mutex_unlock(&g->startup_mutex); in __bench_numa()
1619 mutex_lock(&g->start_work_mutex); in __bench_numa()
1620 threads_ready = (g->nr_tasks_working == g->p.nr_tasks); in __bench_numa()
1621 mutex_unlock(&g->start_work_mutex); in __bench_numa()
1639 mutex_lock(&g->start_work_mutex); in __bench_numa()
1640 g->start_work = true; in __bench_numa()
1641 mutex_unlock(&g->start_work_mutex); in __bench_numa()
1642 cond_broadcast(&g->start_work_cond); in __bench_numa()
1650 for (i = 0; i < g->p.nr_proc; i++) { in __bench_numa()
1658 runtime_ns_min = -1LL; in __bench_numa()
1660 for (t = 0; t < g->p.nr_tasks; t++) { in __bench_numa()
1661 u64 thread_runtime_ns = g->threads[t].runtime_ns; in __bench_numa()
1681 bytes = g->bytes_done; in __bench_numa()
1682 runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC; in __bench_numa()
1684 if (g->p.measure_convergence) { in __bench_numa()
1686 "secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge"); in __bench_numa()
1690 "secs,", "runtime-max/thread", "secs slowest (max) thread-runtime"); in __bench_numa()
1693 "secs,", "runtime-min/thread", "secs fastest (min) thread-runtime"); in __bench_numa()
1696 "secs,", "runtime-avg/thread", "secs average thread-runtime"); in __bench_numa()
1698 delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0; in __bench_numa()
1700 "%,", "spread-runtime/thread", "% difference between max/avg runtime"); in __bench_numa()
1702 print_res(name, bytes / g->p.nr_tasks / 1e9, in __bench_numa()
1706 "GB,", "data-total", "GB data processed, total"); in __bench_numa()
1708 print_res(name, runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks), in __bench_numa()
1711 print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, in __bench_numa()
1712 "GB/sec,", "thread-speed", "GB/sec/thread speed"); in __bench_numa()
1715 "GB/sec,", "total-speed", "GB/sec total speed"); in __bench_numa()
1717 if (g->p.show_details >= 2) { in __bench_numa()
1718 char tname[14 + 2 * 11 + 1]; in __bench_numa()
1720 for (p = 0; p < g->p.nr_proc; p++) { in __bench_numa()
1721 for (t = 0; t < g->p.nr_threads; t++) { in __bench_numa()
1723 td = g->threads + p*g->p.nr_threads + t; in __bench_numa()
1725 print_res(tname, td->speed_gbs, in __bench_numa()
1726 "GB/sec", "thread-speed", "GB/sec/thread speed"); in __bench_numa()
1727 print_res(tname, td->system_time_ns / NSEC_PER_SEC, in __bench_numa()
1728 "secs", "thread-system-time", "system CPU time/thread"); in __bench_numa()
1729 print_res(tname, td->user_time_ns / NSEC_PER_SEC, in __bench_numa()
1730 "secs", "thread-user-time", "user CPU time/thread"); in __bench_numa()
1773 p->serialize_startup = 1; in init_params()
1774 p->data_reads = true; in init_params()
1775 p->data_writes = true; in init_params()
1776 p->data_backwards = true; in init_params()
1777 p->data_rand_walk = true; in init_params()
1778 p->nr_loops = -1; in init_params()
1779 p->init_random = true; in init_params()
1780 p->mb_global_str = "1"; in init_params()
1781 p->nr_proc = 1; in init_params()
1782 p->nr_threads = 1; in init_params()
1783 p->nr_secs = 5; in init_params()
1784 p->run_all = argc == 1; in init_params()
1802 return -1; in run_bench_numa()
1805 #define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk"
1806 #define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1"
1808 #define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1"
1809 #define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1"
1811 #define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1"
1812 #define OPT_BW_NOTHP OPT_BW, "--thp", "-1"
1815 * The built-in test-suite executed by "perf bench numa -a".
1820 /* Basic single-stream NUMA bandwidth measurements: */
1821 { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024",
1822 "-C" , "0", "-M", "0", OPT_BW_RAM },
1823 { "RAM-bw-local-NOTHP,",
1824 "mem", "-p", "1", "-t", "1", "-P", "1024",
1825 "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP },
1826 { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024",
1827 "-C" , "0", "-M", "1", OPT_BW_RAM },
1829 /* 2-stream NUMA bandwidth measurements: */
1830 { "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
1831 "-C", "0,2", "-M", "0x2", OPT_BW_RAM },
1832 { "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
1833 "-C", "0,2", "-M", "1x2", OPT_BW_RAM },
1835 /* Cross-stream NUMA bandwidth measurement: */
1836 { "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024",
1837 "-C", "0,8", "-M", "1,0", OPT_BW_RAM },
1840 { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV },
1841 { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV },
1842 { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV },
1843 { " 2x3-convergence,", "mem", "-p", "2", "-t", "3", "-P", "1020", OPT_CONV },
1844 { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
1845 { " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV },
1846 { " 4x4-convergence-NOTHP,",
1847 "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
1848 { " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV },
1849 { " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV },
1850 { " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV },
1851 { " 8x4-convergence-NOTHP,",
1852 "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
1853 { " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV },
1854 { " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV },
1855 { " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV },
1856 { "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV },
1857 { "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV },
1860 { " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW },
1861 { " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW },
1862 { " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW },
1863 { " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW },
1864 { " 8x1-bw-process-NOTHP,",
1865 "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP },
1866 { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW },
1868 { " 1x4-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW },
1869 { " 1x8-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW },
1870 { "1x16-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW },
1871 { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW },
1873 { " 2x3-bw-process,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW },
1874 { " 4x4-bw-process,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW },
1875 { " 4x6-bw-process,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW },
1876 { " 4x8-bw-process,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW },
1877 { " 4x8-bw-process-NOTHP,",
1878 "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP },
1879 { " 3x3-bw-process,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW },
1880 { " 5x5-bw-process,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW },
1882 { "2x16-bw-process,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW },
1883 { "1x32-bw-process,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW },
1885 { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW },
1886 { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP },
1887 { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW },
1888 { "numa01-bw-thread-NOTHP,",
1889 "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP },
1898 ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'"); in bench_all()
1927 return -1; in bench_numa()