/*
 * ====================================================
 * Copyright (C) 2007 by Ellips BV. All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

#include <picolibc.h>

  #include "x86_64mach.h"

  .global SYM (memcpy)
  SOTYPE_FUNCTION(memcpy)

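/*
 * void *memcpy (void *dst, const void *src, size_t n)
 *
 * SysV AMD64 ABI: rdi = dst, rsi = src, rdx = n; rax returns dst.
 *
 * Copies of fewer than 16 bytes are done one byte at a time.  Larger
 * copies first align the destination to a quadword boundary, then use
 * rep movsq when fewer than 256 bytes remain, or a 128-byte streaming
 * loop with non-temporal stores for larger blocks.
 */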
SYM (memcpy):
  movq    rdi, rax                /* Store destination in return value */
  cmpq    $16, rdx
  jb      byte_copy               /* Fewer than 16 bytes: copy byte by byte */

  movq    rdi, r8                 /* Align destination on quad word boundary */
  andq    $7, r8
  jz      quadword_aligned
  movq    $8, rcx
  subq    r8, rcx                 /* rcx = 8 - (dest & 7) bytes to reach alignment */
  subq    rcx, rdx                /* Account for the alignment bytes */
  rep     movsb

quadword_aligned:
  cmpq    $256, rdx
  jb      quadword_copy

  pushq    rax                    /* Save return value and callee-saved registers used below */
  pushq    r12
  pushq    r13
  pushq    r14

  movq    rdx, rcx                /* Copy 128 bytes at a time with minimum cache pollution */
  shrq    $7, rcx                 /* rcx = number of 128-byte blocks */

  .p2align 4
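/*
 * Streaming loop: each iteration copies 128 bytes.  The movnti
 * stores are non-temporal, so the destination data does not evict
 * useful cache lines, and the prefetches keep the source a few
 * cache lines ahead of the loads.  The sfence after the loop
 * orders the weakly-ordered stores before the function returns.
 */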
loop:
  prefetchnta   768 (rsi)
  prefetchnta   832 (rsi)

  movq       (rsi), rax
  movq     8 (rsi), r8
  movq    16 (rsi), r9
  movq    24 (rsi), r10
  movq    32 (rsi), r11
  movq    40 (rsi), r12
  movq    48 (rsi), r13
  movq    56 (rsi), r14

  movntiq rax,    (rdi)
  movntiq r8 ,  8 (rdi)
  movntiq r9 , 16 (rdi)
  movntiq r10, 24 (rdi)
  movntiq r11, 32 (rdi)
  movntiq r12, 40 (rdi)
  movntiq r13, 48 (rdi)
  movntiq r14, 56 (rdi)

  movq     64 (rsi), rax
  movq     72 (rsi), r8
  movq     80 (rsi), r9
  movq     88 (rsi), r10
  movq     96 (rsi), r11
  movq    104 (rsi), r12
  movq    112 (rsi), r13
  movq    120 (rsi), r14

  movntiq rax,  64 (rdi)
  movntiq r8 ,  72 (rdi)
  movntiq r9 ,  80 (rdi)
  movntiq r10,  88 (rdi)
  movntiq r11,  96 (rdi)
  movntiq r12, 104 (rdi)
  movntiq r13, 112 (rdi)
  movntiq r14, 120 (rdi)

  leaq    128 (rsi), rsi
  leaq    128 (rdi), rdi

  dec     rcx
  jnz     loop

  sfence
  movq    rdx, rcx
  andq    $127, rcx               /* Copy the remaining (count mod 128) bytes */
  rep     movsb
  popq    r14
  popq    r13
  popq    r12
  popq    rax
  ret


/* Fewer than 16 bytes: copy one byte at a time */
byte_copy:
  movq    rdx, rcx
  rep     movsb
  ret


/* Fewer than 256 bytes left: copy whole quadwords, then the tail */
quadword_copy:
  movq    rdx, rcx
  shrq    $3, rcx
  .p2align 4
  rep     movsq
  movq    rdx, rcx
  andq    $7, rcx
  rep     movsb                   /* Copy the remaining bytes */
  ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif