do_csum.S - OpenGrok cross reference for /Linux-v5.10/arch/ia64/lib/do_csum.S

Lines Matching +full:many +full:- +full:to +full:- +full:one
1 /* SPDX-License-Identifier: GPL-2.0 */
9  *	in0: address of buffer to checksum (char *)
12  * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
17  *		More optimization cleanup - remove excessive stop bits.
22  *		back-to-back 8-byte words per loop. Clean up the initialization
24  *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
31 //	The goal is to go as quickly as possible to the point where
37 //	allows us to commute operations. So we do the "head" and "tail"
38 //	first to finish at full speed in the body. Once we get the head and
42 //	into one 8 byte word. In this case we have only one entry in the pipeline.
44 //	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
45 //	possible load latency and also to accommodate head and tail.
48 //	down to 16bits taking care of the carry.
54 //	|---|
56 //	|---|
57 //      |   | -			: in transit data
58 //	|---|
59 //      |   | LOAD_LATENCY	: current value to add to checksum
60 //	|---|
61 //      |   | LOAD_LATENCY+1	: previous value added to checksum
62 //      |---|			(previous iteration)
65 //	|---|
67 //	|---|
68 //      |   | LOAD_LATENCY-1	: new checksum
69 //	|---|
71 //	|---|
73 //      |---|
80 //	- Maybe another algorithm which would take care of the folding at the
82 //	- Work with people more knowledgeable than me on the network stack
83 //	  to figure out if we could not split the function depending on the
85 //	  where we know we have at least 20bytes worth of data to checksum.
86 //	- Do a better job of handling small packets.
87 //	- Note on prefetching: it was found that under various loads, e.g. ftp read/write,
89 //	  on the data that buffer points to (partly because the checksum is often preceded by
139 (p6)	br.ret.spnt.many rp	// return if zero or negative length
141 	mov hmask=-1		// initialize head mask
143 	and first1=-8,buf	// 8-byte align down address of first1 element
145 	and firstoff=7,buf	// how many bytes off for first1 element
146 	mov tmask=-1		// initialize tail mask
149 	adds tmp2=-1,tmp1	// last-1
150 	and lastoff=7,tmp1	// how many bytes off for last element
152 	sub tmp1=8,lastoff	// complement to lastoff
153 	and last=-8,tmp2	// address of word containing last byte
155 	sub tmp3=last,first1	// tmp3=distance from first1 to last
158 	cmp.eq p8,p9=last,first1	// everything fits in one word ?
161 	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
166 (p9)	adds tmp3=-8,tmp3	// effectively loaded
175 (p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
177 	shr.u count=count,3	// how many 8-byte?
179 	// If count is odd, finish this 8-byte word so that we can
180 	// load two back-to-back 8-byte words per loop thereafter.
188 	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
191 (p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
195 	ld8 word1[1]=[first1],8		// load an 8-byte word
197 	adds count=-1,count		// loaded an 8-byte word
205 	// Fall through to calculate the checksum, feeding result1[0] as
208 	// Calculate the checksum loading two 8-byte words per loop.
214 	adds count=-1,count
231 (ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
232 (ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
238 	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
239 (pC1[1])adds carry1=1,carry1	// since we miss the last one
289 (p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
290 	br.ret.sptk.many rp
321 //(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
322 //	br.ret.sptk.many rp