// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#ifdef CONFIG_RSEQ
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_sync_core(void *info)
{
	/*
	 * The smp_mb() in membarrier after all the IPIs is supposed to
	 * ensure that memory accesses on remote CPUs that occur before
	 * the IPI become visible to membarrier()'s caller -- see
	 * scenario B in the big comment at the top of this file.
	 *
	 * A sync_core() would provide this guarantee, but
	 * sync_core_before_usermode() might end up being deferred until
	 * after membarrier()'s smp_mb().
	 */
	smp_mb();	/* IPIs should be serializing but paranoid. */

	sync_core_before_usermode();
}

static void ipi_rseq(void *info)
{
	/*
	 * Ensure that all stores done by the calling thread are visible
	 * to the current task before the current task resumes.  We could
	 * probably optimize this away on most architectures, but by the
	 * time we've already sent an IPI, the cost of the extra smp_mb()
	 * is negligible.
	 */
	smp_mb();
	rseq_preempt(current);
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags, int cpu_id)
{
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;
	smp_call_func_t ipi_func = ipi_mb;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
		ipi_func = ipi_sync_core;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
			return -EPERM;
		ipi_func = ipi_rseq;
	} else {
		WARN_ON_ONCE(flags);
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	if (cpu_id >= 0) {
		struct task_struct *p;

		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
			goto out;
		rcu_read_lock();
		p = rcu_dereference(cpu_rq(cpu_id)->curr);
		if (!p || p->mm != mm) {
			rcu_read_unlock();
			goto out;
		}
		rcu_read_unlock();
	} else {
		int cpu;

		rcu_read_lock();
		for_each_online_cpu(cpu) {
			struct task_struct *p;

			p = rcu_dereference(cpu_rq(cpu)->curr);
			if (p && p->mm == mm)
				__cpumask_set_cpu(cpu, tmpmask);
		}
		rcu_read_unlock();
	}

	if (cpu_id >= 0) {
		/*
		 * smp_call_function_single() will call ipi_func() if cpu_id
		 * is the calling CPU.
		 */
		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
	} else {
		/*
		 * For regular membarrier, we can save a few cycles by
		 * skipping the current cpu -- we're about to do smp_mb()
		 * below, and if we migrate to a different cpu, this cpu
		 * and the new cpu will execute a full barrier in the
		 * scheduler.
		 *
		 * For SYNC_CORE, we do need a barrier on the current cpu --
		 * otherwise, if we are migrated and replaced by a different
		 * task in the same mm just before, during, or after
		 * membarrier, we will end up with some thread in the mm
		 * running without a core sync.
		 *
		 * For RSEQ, don't rseq_preempt() the caller.  User code
		 * is not supposed to issue syscalls at all from inside an
		 * rseq critical section.
		 */
		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
			preempt_disable();
			smp_call_function_many(tmpmask, ipi_func, NULL, true);
			preempt_enable();
		} else {
			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
		}
	}

out:
	if (cpu_id < 0)
		free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
	 * @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
		if (!IS_ENABLED(CONFIG_RSEQ))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
	} else {
		WARN_ON_ONCE(flags);
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	if (flags & MEMBARRIER_FLAG_RSEQ)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

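/*
 * Illustrative userspace sketch (not part of the kernel build): the
 * registration handshake implemented above, as seen from a process.
 * The expedited private commands return -EPERM until the matching
 * REGISTER command has completed, so a process typically registers
 * once at startup.  The helper name membarrier_syscall() is made up
 * for this example; the commands come from <linux/membarrier.h>.
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int membarrier_syscall(int cmd, unsigned int flags, int cpu_id)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *	}
 *
 *	int main(void)
 *	{
 *		// Register first: sets the *_READY bit in mm->membarrier_state.
 *		if (membarrier_syscall(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
 *			return 1;	// ENOSYS or EINVAL: unsupported here.
 *		// Now the expedited barrier is permitted for this mm.
 *		return membarrier_syscall(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0) ? 1 : 0;
 *	}
 */
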
/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:    Takes command values defined in enum membarrier_cmd.
 * @flags:  Currently needs to be 0 for all commands other than
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
 *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
 *          contains the CPU on which to interrupt (= restart)
 *          the RSEQ critical section.
 * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
 *          RSEQ CS should be interrupted (@cmd must be
 *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
	switch (cmd) {
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
			return -EINVAL;
		break;
	default:
		if (unlikely(flags))
			return -EINVAL;
	}

	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
		cpu_id = -1;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
	default:
		return -EINVAL;
	}
}
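
/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * pairing described in the ordering table above: a fast path that only
 * uses a compiler barrier can be ordered against a slow path that
 * issues sys_membarrier(), as if both sides had executed a full memory
 * barrier between their store and their load.  The thread and variable
 * names are made up, and MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED is
 * assumed to have been issued earlier (see the sketch before
 * sys_membarrier()).
 *
 *	static volatile int x, y;
 *
 *	static int fast_path(void)		// frequent, cheap
 *	{
 *		x = 1;
 *		__asm__ __volatile__("" ::: "memory");	// barrier()
 *		return y;
 *	}
 *
 *	static int slow_path(void)		// rare, pays for the IPIs
 *	{
 *		y = 1;
 *		syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *		return x;
 *	}
 *
 * If fast_path() and slow_path() run concurrently in two threads of the
 * same process, the table's "O" entry for barrier()/sys_membarrier()
 * means they cannot both return 0, just as if each side had used
 * smp_mb() in place of its barrier.
 */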