Lines Matching +full:right +full:- +full:aligned
1 /* SPDX-License-Identifier: GPL-2.0 */
23 * Copyright (C) 2000-2001 Hewlett-Packard Co
27 * - handle the case where we have more than 16 bytes and the alignment
29 * - more benchmarking
30 * - fix extraneous stop bit introduced by the EX() macro.
42 #define EPI p[PIPE_DEPTH-1]
56 #define rshift r14 // right shift in bits
83 adds len2=-1,len // br.ctop is repeat/until
116 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
125 // Not 8-byte aligned
133 // The basic idea is that we copy byte-by-byte at the head so
134 // that we can reach 8-byte alignment for both src1 and dst1.
135 // Then copy the body using software pipelined 8-byte copy,
136 // shifting the two back-to-back words right and left, then copy
137 // the tail by copying byte-by-byte.
139 // Fault handling. If the byte-by-byte at the head fails on the
142 // If 8-byte software pipeline fails on the load, do the same as
143 // failure_in3 does. If the byte-by-byte at the tail fails, it is
153 // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
154 // to copy the head to dst1, to start 8-byte copy software pipeline.
155 // We know src1 is not 8-byte aligned in this case.
175 (p14) sub word1=8,src2 // (8 - src offset)
177 (p15) sub word1=8,dst2 // (8 - dst offset)
189 adds cnt=-1,word1
198 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
205 (p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
207 shr.u cnt=len1,3 // number of 64-bit words
209 adds cnt=-1,cnt
215 // Now both src1 and dst1 point to an 8-byte aligned address. And
226 // 2 (EPI_1): Shift right pair, saving to tmp
230 // because we need 2 back-to-back val1[] to get tmp.
234 #define EPI_1 p[PIPE_DEPTH-2]
242 (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
250 (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
259 // Since the instruction 'shrp' requires a fixed 128-bit value
290 (p14) adds dst1=-8,dst1
309 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
334 EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
335 (p6) adds len1=-1,len1;;
338 EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
339 (p7) adds len1=-2,len1;;
346 EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
355 EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
357 (p8) adds len1=-4,len1
360 (p9) adds len1=-8,len1;;
361 shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
366 adds tmp=-1,cnt // br.ctop is repeat/until
380 EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
381 (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
388 // is 16-byte aligned AND we have less than 16 bytes to copy.
420 // - the pipeline: loads/stores are not in sync (pipeline)
425 // - pipeline effect
430 // - single/multi dispersal independence.
433 // - we don't disrupt the pipeline, i.e. data in transit in
443 (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
460 (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
481 // ---------|-----
499 // - if you fail on 1, 2, 4 then you have never executed any smaller
510 // - if you fail on the ld8 in the head, it means you went straight
513 // you are 8-byte aligned but also 16-byte aligned, therefore you would
522 // aligned.
526 // - are right on a page boundary
528 // - are at more than 16 bytes from a page boundary with
578 (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
579 (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16