1/*
2 * ====================================================
3 * Copyright (C) 2007 by Ellips BV. All rights reserved.
4 *
5 * Permission to use, copy, modify, and distribute this
6 * software is freely granted, provided that this notice
7 * is preserved.
8 * ====================================================
9 */
10
11  #include "x86_64mach.h"
12
13  .global SYM (memcpy)
14  SOTYPE_FUNCTION(memcpy)
15
/*-----------------------------------------------------------------------
 * void *memcpy (void *dest, const void *src, size_t n)
 *
 * ABI:   SysV AMD64 (bare register names are macros from x86_64mach.h
 *        that expand to the AT&T %-prefixed forms)
 * In:    rdi = dest, rsi = src, rdx = n
 * Out:   rax = dest (returned unchanged)
 * Clobb: rcx, r8-r11, flags; r12-r14 are callee-saved and are
 *        pushed/popped around the large-copy loop that uses them.
 *
 * Strategy, chosen by size n:
 *   n < 16    -> byte_copy:      plain rep movsb
 *   16..255   -> align dest to 8 bytes, then quadword_copy (rep movsq)
 *   n >= 256  -> align dest, then 128-byte unrolled loop using
 *                prefetchnta + movnti non-temporal stores to minimize
 *                cache pollution, finished with sfence and a byte tail.
 *----------------------------------------------------------------------*/
SYM (memcpy):
  movq    rdi, rax                /* Store destination in return value */
  cmpq    $16, rdx
  jb      byte_copy               /* Tiny copy: not worth aligning */

  movq    rdi, r8                 /* Align destination on quad word boundary */
  andq    $7, r8                  /* r8 = dest & 7 (misalignment) */
  jz      quadword_aligned
  movq    $8, rcx
  subq    r8, rcx                 /* rcx = 8 - (dest & 7) = 1..7 bytes to copy */
  subq    rcx, rdx                /* Deduct from n (n >= 16, so n stays positive) */
  rep     movsb                   /* Copies rcx bytes; rsi/rdi advance, rdi now 8-aligned */

quadword_aligned:
  cmpq    $256, rdx
  jb      quadword_copy           /* Mid-size: rep movsq path */

  pushq    rax                    /* rax is reused as scratch below; preserve return value */
  pushq    r12                    /* r12-r14 are callee-saved in the SysV ABI */
  pushq    r13
  pushq    r14

  movq    rdx, rcx                /* Copy 128 bytes at a time with minimum cache pollution */
  shrq    $7, rcx                 /* rcx = number of 128-byte chunks (tail handled after loop) */

  .p2align 4
loop:
  prefetchnta   768 (rsi)         /* Fetch source ~6 chunks ahead, bypassing caches */
  prefetchnta   832 (rsi)

  movq       (rsi), rax           /* Load first 64 bytes of the chunk into 8 registers */
  movq     8 (rsi), r8
  movq    16 (rsi), r9
  movq    24 (rsi), r10
  movq    32 (rsi), r11
  movq    40 (rsi), r12
  movq    48 (rsi), r13
  movq    56 (rsi), r14

  movntiq rax,    (rdi)           /* Non-temporal stores: write-combining, no */
  movntiq r8 ,  8 (rdi)           /* destination cache fills                  */
  movntiq r9 , 16 (rdi)
  movntiq r10, 24 (rdi)
  movntiq r11, 32 (rdi)
  movntiq r12, 40 (rdi)
  movntiq r13, 48 (rdi)
  movntiq r14, 56 (rdi)

  movq     64 (rsi), rax          /* Second 64 bytes of the chunk */
  movq     72 (rsi), r8
  movq     80 (rsi), r9
  movq     88 (rsi), r10
  movq     96 (rsi), r11
  movq    104 (rsi), r12
  movq    112 (rsi), r13
  movq    120 (rsi), r14

  movntiq rax,  64 (rdi)
  movntiq r8 ,  72 (rdi)
  movntiq r9 ,  80 (rdi)
  movntiq r10,  88 (rdi)
  movntiq r11,  96 (rdi)
  movntiq r12, 104 (rdi)
  movntiq r13, 112 (rdi)

  movntiq r14, 120 (rdi)

  leaq    128 (rsi), rsi          /* Advance both pointers one chunk (lea preserves flags) */
  leaq    128 (rdi), rdi

  dec     rcx
  jnz     loop

  sfence                          /* Make the non-temporal stores globally visible
                                     before memcpy returns */
  movq    rdx, rcx
  andq    $127, rcx               /* Tail: 0..127 bytes left over from the chunk loop */
  rep     movsb
  popq    r14                     /* Restore callee-saved registers ... */
  popq    r13
  popq    r12
  popq    rax                     /* ... and the original destination (return value) */
  ret


byte_copy:
  /* n < 16: simple byte-wise copy; rax already holds dest */
  movq    rdx, rcx
  rep     movsb
  ret


quadword_copy:
  /* 16 <= n < 256, dest 8-byte aligned: copy n/8 quadwords ... */
  movq    rdx, rcx
  shrq    $3, rcx
  .p2align 4
  rep     movsq
  movq    rdx, rcx
  andq    $7, rcx
  rep     movsb                   /* Copy the remaining bytes */
  ret
114
115#if defined(__linux__) && defined(__ELF__)
116.section .note.GNU-stack,"",%progbits
117#endif
118