/*
 * ====================================================
 * Copyright (C) 2007 by Ellips BV. All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================
 */

#include <picolibc.h>

  #include "x86_64mach.h"

/*
 * void *memcpy (void *dst, const void *src, size_t n)
 *
 * In:    rdi = dst, rsi = src, rdx = n
 * Out:   rax = dst
 * Clobb: rcx, rdx, rsi, rdi, r8-r11, flags
 *        (r12-r14 are used by the block loop but saved and restored)
 * Stack: 32 bytes (four pushes), on the block-copy path only
 *
 * Strategy, selected by n:
 *   n < 16               plain byte copy
 *   n >= 16              byte-copy up to 7 bytes until dst is 8-byte aligned,
 *                        then, with "rest" being the bytes still to copy:
 *     rest <  256        quadword copy followed by a byte tail
 *     rest >= 256        128-byte blocks using non-temporal stores,
 *                        followed by a byte tail
 *
 * No cld is issued, so the string instructions rely on the direction flag
 * being clear on entry.
 *
 * Register names are written bare; NOTE(review): this presumably relies on
 * x86_64mach.h mapping them to the assembler's spelling - confirm there.
 *
 * Branch targets use the .L prefix so that they stay assembler-local and do
 * not appear as separate symbols inside memcpy.
 */

  .global SYM (memcpy)
  SOTYPE_FUNCTION(memcpy)

SYM (memcpy):
  movq    rdi, rax                /* Return value = destination */
  cmpq    $16, rdx
  jb      .Lbyte_copy             /* Short copy: not worth aligning */

  movq    rdi, r8                 /* r8 = dst misalignment within a quadword */
  andq    $7, r8
  jz      .Lquadword_aligned
  movq    $8, rcx                 /* rcx = 8 - misalignment = bytes to boundary */
  subq    r8, rcx
  subq    rcx, rdx                /* n >= 16 here, so rdx stays positive */
  rep     movsb

.Lquadword_aligned:               /* rdi is 8-byte aligned, rdx = bytes left */
  cmpq    $256, rdx
  jb      .Lquadword_copy

  pushq   rax                     /* rax doubles as scratch in the loop below */
  pushq   r12
  pushq   r13
  pushq   r14

  movq    rdx, rcx                /* Copy 128 bytes at a time with minimum cache pollution */
  shrq    $7, rcx                 /* rcx = number of 128-byte blocks (>= 2 here) */

  .p2align 4
.Lblock_loop:
  /* Hints only: the addresses may lie beyond the end of the source. */
  prefetchnta 768 (rsi)
  prefetchnta 832 (rsi)

  /* First 64 bytes of the block: load eight quadwords ... */
  movq       (rsi), rax
  movq     8 (rsi), r8
  movq    16 (rsi), r9
  movq    24 (rsi), r10
  movq    32 (rsi), r11
  movq    40 (rsi), r12
  movq    48 (rsi), r13
  movq    56 (rsi), r14

  /* ... and write them with non-temporal stores. */
  movntiq rax,    (rdi)
  movntiq r8 ,  8 (rdi)
  movntiq r9 , 16 (rdi)
  movntiq r10, 24 (rdi)
  movntiq r11, 32 (rdi)
  movntiq r12, 40 (rdi)
  movntiq r13, 48 (rdi)
  movntiq r14, 56 (rdi)

  /* Second 64 bytes of the block. */
  movq     64 (rsi), rax
  movq     72 (rsi), r8
  movq     80 (rsi), r9
  movq     88 (rsi), r10
  movq     96 (rsi), r11
  movq    104 (rsi), r12
  movq    112 (rsi), r13
  movq    120 (rsi), r14

  movntiq rax,  64 (rdi)
  movntiq r8 ,  72 (rdi)
  movntiq r9 ,  80 (rdi)
  movntiq r10,  88 (rdi)
  movntiq r11,  96 (rdi)
  movntiq r12, 104 (rdi)
  movntiq r13, 112 (rdi)
  movntiq r14, 120 (rdi)

  leaq    128 (rsi), rsi          /* Advance both pointers; lea keeps flags intact */
  leaq    128 (rdi), rdi

  decq    rcx
  jnz     .Lblock_loop

  sfence                          /* Order the non-temporal stores before continuing */
  movq    rdx, rcx                /* Tail: the n % 128 bytes the block loop left over */
  andq    $127, rcx
  rep     movsb
  popq    r14                     /* Restore in reverse push order */
  popq    r13
  popq    r12
  popq    rax                     /* rax = original destination again */
  ret


.Lbyte_copy:                      /* n < 16: copy everything bytewise */
  movq    rdx, rcx
  rep     movsb
  ret


.Lquadword_copy:                  /* rest < 256, dst aligned */
  movq    rdx, rcx
  shrq    $3, rcx                 /* rcx = whole quadwords */
  .p2align 4
  rep     movsq
  movq    rdx, rcx
  andq    $7, rcx
  rep     movsb                   /* Copy the remaining bytes */
  ret

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif