/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <linux/export.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
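/*
 * For illustration: a hedged sketch (not the actual uaccess.h code, and
 * kbuf/ubuf/len are placeholder names) of how a typical caller consumes
 * the "bytes left uncopied" result that __copy_user produces:
 *
 *	if (copy_from_user(kbuf, ubuf, len))	// non-zero: some bytes were
 *		return -EFAULT;			// not copied
 */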

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR_WD	9b, handler;			\
	.previous
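/*
 * Illustrative expansion (assuming the 64-bit LOAD/UNIT defines below):
 *	EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
 * becomes
 *	9:	ld	t0, 0(src)
 * plus an __ex_table entry pairing label 9b with the l_exc fixup, so a
 * faulting access resumes at l_exc instead of being treated as a kernel bug.
 */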

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing a code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif
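/*
 * "FIRST" refers to the lower-addressed part of an unaligned doubleword.
 * On a little-endian CPU those are the least-significant bytes, which is
 * what ldr/sdr transfer; on big-endian they are the most-significant bytes,
 * handled by ldl/sdl, hence the swapped definitions above.
 */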

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
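/*
 * With NBYTES == 8 this gives FIRST(0)/REST(0) = 0/7, FIRST(1)/REST(1) = 8/15,
 * and so on: the offsets of the first and last byte of each doubleword-sized
 * unit, as expected by the ldl/ldr and sdl/sdr pairs used below.
 */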

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	 and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	 sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	 sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	 sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	 sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	 sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	 pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there are more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	 sltu	t0, len, 128		# See if we can loop again
	beqz	t0, 1b
	 nop
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	 ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left. This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	 nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b copy_bytes_checklen
EXC(	 STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
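/*
 * For example, with src & ADDRMASK == 5 the LDFIRST below picks up the
 * 3 bytes that reach the next doubleword boundary and LDREST merges in
 * the remaining 5 bytes from the following doubleword.
 */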
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	 sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	 nop
	END(memcpy)

l_exc_copy_rewind16:
	/* Rewind src and dst by 16*NBYTES for l_exc_copy */
	SUB	src, src, 16*NBYTES
	SUB	dst, dst, 16*NBYTES
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	 nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	 ADD	len, len, n*NBYTES
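/*
 * For example, SEXC(8) defines s_exc_p8u: the store that uses it faults with
 * 8 units still unwritten, so 8*NBYTES is added back to len (which the SUB
 * earlier in the block had already claimed) before returning to the caller.
 */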

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	 ADD	len, len, 1
s_exc:
	jr	ra
	 nop

	.align	5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
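/*
 * Overlap check below: t0 = (src < dst + len), t1 = (dst < src + len); the
 * regions overlap only if both hold. Non-overlapping copies go to __memcpy,
 * otherwise we fall through to __rmemcpy, which copies backwards when
 * src < dst so source bytes are read before they can be overwritten.
 */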
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	 move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	 sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	 SUB	a0, a0, 0x1

r_out:
	jr	ra
	 move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	 ADD	a0, a0, 0x1

	jr	ra
	 move	a2, zero
	END(__rmemcpy)