Lines Matching +full:byte +full:- +full:len

1 /* SPDX-License-Identifier: GPL-2.0 */
23 * Copyright (C) 2000-2001 Hewlett-Packard Co
27 * - handle the case where we have more than 16 bytes and the alignment
29 * - more benchmarking
30 * - fix extraneous stop bit introduced by the EX() macro.
39 #define COPY_BREAK 16 // we do byte copy below (must be >=16)
42 #define EPI p[PIPE_DEPTH-1]
49 #define len in2 macro
83 adds len2=-1,len // br.ctop is repeat/until
86 ;; // RAW of cfm when len=0
87 cmp.eq p8,p0=r0,len // check for zero length
92 add enddst=dst,len // first byte after end of destination
93 add endsrc=src,len // first byte after end of source
105 cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
111 // Now we do the byte by byte loop with software pipeline
116 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
125 // Not 8-byte aligned
133 // The basic idea is that we copy byte-by-byte at the head so
134 // that we can reach 8-byte alignment for both src1 and dst1.
135 // Then copy the body using software pipelined 8-byte copy,
136 // shifting the two back-to-back words right and left, then copy
137 // the tail by copying byte-by-byte.
139 // Fault handling. If the byte-by-byte at the head fails on the
142 // If 8-byte software pipeline fails on the load, do the same as
143 // failure_in3 does. If the byte-by-byte at the tail fails, it is
153 // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
154 // to copy the head to dst1, to start 8-byte copy software pipeline.
155 // We know src1 is not 8-byte aligned in this case.
164 sub len1=len,t1 // set len1
175 (p14) sub word1=8,src2 // (8 - src offset)
177 (p15) sub word1=8,dst2 // (8 - dst offset)
184 sub len1=len,word1 // resulting len
189 adds cnt=-1,word1
198 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
205 (p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
207 shr.u cnt=len1,3 // number of 64-bit words
209 adds cnt=-1,cnt
215 // Now both src1 and dst1 point to an 8-byte aligned address. And
230 // because we need 2 back-to-back val1[] to get tmp.
234 #define EPI_1 p[PIPE_DEPTH-2]
242 (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
250 (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
259 // Since the instruction 'shrp' requires a fixed 128-bit value
290 (p14) adds dst1=-8,dst1
298 // To fix that, we simply copy the tail byte by byte.
309 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
325 mov len1=len // copy because of rotation
331 // forward slowly until we reach 16byte alignment: no need to
334 EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
335 (p6) adds len1=-1,len1;;
338 EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
339 (p7) adds len1=-2,len1;;
346 EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
355 EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
357 (p8) adds len1=-4,len1
360 (p9) adds len1=-8,len1;;
361 shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
366 adds tmp=-1,cnt // br.ctop is repeat/until
380 EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
381 (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
385 // Tail correction based on len only
388 // is 16 byte aligned AND we have less than 16 bytes to copy.
402 EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
416 // Here we handle the case where the byte by byte copy fails
420 // - the pipeline: loads/stores are not in sync (pipeline)
425 // - pipeline effect
430 // - single/multi dispersal independence.
433 // - we don't disrupt the pipeline, i.e. data in transit in
443 (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
452 // This is the case where the byte by byte copy fails on the load
460 (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
463 sub len=enddst,dst1,1 // precompute len
481 // ---------|-----
495 // As we move towards eight byte alignment we may encounter faults.
499 // - if you fail on 1, 2, 4 then you have never executed any smaller
510 // - if you fail on the ld8 in the head, it means you went straight
511 // to it, i.e. 8-byte alignment within a nonexistent page.
513 // you are 8-byte aligned but also 16-byte aligned, therefore you would
514 // either go for the 16byte copy loop OR the ld8 in the tail part.
517 // would have defaulted to the byte by byte copy.
521 // Here we know we have less than 16 bytes AND we are either 8 or 16 byte
526 // - are right on a page boundary
528 // - are at more than 16 bytes from a page boundary with
541 sub len=endsrc,src1,1
551 mov ar.lc=len // Continue with a stupid byte store.
578 (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
579 (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
583 sub len=enddst,dst1,1 // precompute len
594 sub len=enddst,dst1,1 // precompute len