Lines Matching +full:many +full:- +full:to +full:- +full:one
1 /* SPDX-License-Identifier: GPL-2.0 */
9 * in0: address of buffer to checksum (char *)
12 * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
17 * More optimization cleanup - remove excessive stop bits.
22 * back-to-back 8-byte words per loop. Clean up the initialization
24 * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
31 // The goal is to go as quickly as possible to the point where
37 // allows us to commute operations. So we do the "head" and "tail"
38 // first to finish at full speed in the body. Once we get the head and
42 // into one 8 byte word. In this case we have only one entry in the pipeline.
44 // We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
45 // possible load latency and also to accommodate the head and tail.
48 // down to 16 bits, taking care of the carry.
54 // |---|
56 // |---|
57 // | | - : in transit data
58 // |---|
59 // | | LOAD_LATENCY : current value to add to checksum
60 // |---|
61 // | | LOAD_LATENCY+1 : previous value added to checksum
62 // |---| (previous iteration)
65 // |---|
67 // |---|
68 // | | LOAD_LATENCY-1 : new checksum
69 // |---|
71 // |---|
73 // |---|
80 // - Maybe another algorithm which would take care of the folding at the
82 // - Work with people more knowledgeable than me on the network stack
83 // to figure out if we could not split the function depending on the
85 // where we know we have at least 20 bytes' worth of data to checksum.
86 // - Do a better job of handling small packets.
87 // - Note on prefetching: it was found that under various loads, e.g. ftp read/write,
89 // on the data that buffer points to (partly because the checksum is often preceded by
139 (p6) br.ret.spnt.many rp // return if zero or negative length
141 mov hmask=-1 // initialize head mask
143 and first1=-8,buf // 8-byte align down address of first1 element
145 and firstoff=7,buf // how many bytes off for first1 element
146 mov tmask=-1 // initialize tail mask
149 adds tmp2=-1,tmp1 // last-1
150 and lastoff=7,tmp1 // how many bytes off for last element
152 sub tmp1=8,lastoff // complement to lastoff
153 and last=-8,tmp2 // address of word containing last byte
155 sub tmp3=last,first1 // tmp3=distance from first1 to last
158 cmp.eq p8,p9=last,first1 // everything fits in one word ?
161 and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
166 (p9) adds tmp3=-8,tmp3 // effectively loaded
175 (p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
177 shr.u count=count,3 // how many 8-byte?
179 // If count is odd, finish this 8-byte word so that we can
180 // load two back-to-back 8-byte words per loop thereafter.
188 cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
191 (p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
195 ld8 word1[1]=[first1],8 // load an 8-byte word
197 adds count=-1,count // loaded an 8-byte word
205 // Fall through to calculate the checksum, feeding result1[0] as
208 // Calculate the checksum loading two 8-byte words per loop.
214 adds count=-1,count
231 (ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
232 (ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
238 // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
239 (pC1[1])adds carry1=1,carry1 // since we miss the last one
289 (p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
290 br.ret.sptk.many rp
321 //(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
322 // br.ret.sptk.many rp