1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		IP/TCP/UDP checksumming routines
7 *
8 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
9 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
10 *		Tom May, <ftom@netcom.com>
11 *              Pentium Pro/II routines:
12 *              Alexander Kjeldaas <astor@guardian.no>
13 *              Finn Arne Gangstad <finnag@guardian.no>
14 *		Lots of code moved from tcp.c and ip.c; see those files
15 *		for more names.
16 *
17 * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
18 *			     handling.
19 *		Andi Kleen,  add zeroing on error
20 *                   converted to pure assembler
21 *
22 *		This program is free software; you can redistribute it and/or
23 *		modify it under the terms of the GNU General Public License
24 *		as published by the Free Software Foundation; either version
25 *		2 of the License, or (at your option) any later version.
26 */
27
28#include <asm/errno.h>
29#include <asm/asm.h>
30#include <asm/export.h>
31
32/*
33 * computes a partial checksum, e.g. for TCP/UDP fragments
34 */
35
36/*
37unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
38 */
39
40.text
41.align 4
42.globl csum_partial
43
44#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
45
46	  /*
47	   * Experiments with Ethernet and SLIP connections show that buff
48	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
49	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
50	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
51	   * alignment for the unrolled loop.
52	   */
/*
 * unsigned int csum_partial(const unsigned char *buff, int len,
 *			     unsigned int sum)
 *
 * Generic 386/486/Pentium version.  Folds the bytes buff[0..len) into
 * the 32-bit one's-complement partial sum and returns it in %eax.
 * cdecl: args on the stack; %ebx/%esi are callee-saved (pushed below),
 * %ecx/%edx are clobbered.
 *
 * NOTE: the carry flag is live between most instructions here (adcl
 * chains, with lea/dec chosen because they preserve CF) -- the exact
 * instruction order is load-bearing.
 */
csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: unsigned char *buff
	testl $2, %esi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2.  Deal with it.
	jmp 4f			# Only 0 or 1 byte total: go to the tail.
1:	movw (%esi), %bx	# Fold the leading word so the rest of
	addl $2, %esi		# the buffer is 4-byte aligned.
	addw %bx, %ax
	adcl $0, %eax		# End-around carry.
2:
	movl %ecx, %edx		# edx = remaining length, kept for the tail.
	shrl $5, %ecx		# ecx = number of whole 32-byte chunks.
	jz 2f			# None: go sum the leftover dwords.
	testl %esi, %esi	# test clears CF for the adcl chain below.
1:	movl (%esi), %ebx	# Unrolled loop: eight dwords per pass,
	adcl %ebx, %eax		# carries chained through CF.
	movl 4(%esi), %ebx
	adcl %ebx, %eax
	movl 8(%esi), %ebx
	adcl %ebx, %eax
	movl 12(%esi), %ebx
	adcl %ebx, %eax
	movl 16(%esi), %ebx
	adcl %ebx, %eax
	movl 20(%esi), %ebx
	adcl %ebx, %eax
	movl 24(%esi), %ebx
	adcl %ebx, %eax
	movl 28(%esi), %ebx
	adcl %ebx, %eax
	lea 32(%esi), %esi	# lea does not touch CF ...
	dec %ecx		# ... and dec preserves CF, so the carry
	jne 1b			# survives into the next pass.
	adcl $0, %eax		# Fold the loop's final carry.
2:	movl %edx, %ecx		# ecx = remaining length again.
	andl $0x1c, %edx	# edx = leftover whole dwords, in bytes (0-28).
	je 4f
	shrl $2, %edx		# This clears CF
3:	adcl (%esi), %eax	# Sum the 1-7 leftover dwords; lea/dec
	lea 4(%esi), %esi	# again keep CF intact between adcls.
	dec %edx
	jne 3b
	adcl $0, %eax		# Fold the final carry.
4:	andl $3, %ecx		# ecx = trailing byte count (0-3).
	jz 7f
	cmpl $2, %ecx
	jb 5f			# Exactly one byte left.
	movw (%esi),%cx		# Two or three bytes: load a word
	leal 2(%esi),%esi	# (upper half of ecx is already zero).
	je 6f			# Exactly two bytes: add it as-is.
	shll $16,%ecx		# Three bytes: word into the high half,
5:	movb (%esi),%cl		# last (or only) byte into the low byte.
6:	addl %ecx,%eax
	adcl $0, %eax		# Fold carry from the tail add.
7:
	popl %ebx
	popl %esi
	ret
118
119#else
120
121/* Version for PentiumII/PPro */
122
/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum)
 *
 * Pentium Pro / Pentium II version.  Same contract as the generic
 * routine: folds buf[0..len) into the 32-bit one's-complement partial
 * sum and returns it in %eax (cdecl; %ebx/%esi saved, %ecx/%edx
 * clobbered).
 *
 * Strategy: sum 128 bytes (32 dwords) per unrolled pass, and handle
 * the leftover (len mod 128) dwords by computing a jump target part
 * way into the unrolled block for the first, partial pass.  That
 * computation assumes every adcl below encodes to exactly 3 bytes,
 * and CF chains between the adcls -- do not reorder or substitute
 * instructions here.
 */
csum_partial:
	pushl %esi
	pushl %ebx
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf

	testl $2, %esi		# Not 4-byte aligned: fold front bytes first.
	jnz 30f
10:
	movl %ecx, %edx		# edx = remaining length, kept for the tail.
	movl %ecx, %ebx
	andl $0x7c, %ebx	# ebx = leftover dwords (len mod 128), in bytes.
	shrl $7, %ecx		# ecx = number of additional full 128-byte passes.
	addl %ebx,%esi		# Pre-bias esi: the loop reads -128(%esi)..-4(%esi).
	shrl $2, %ebx		# ebx = leftover dword count.
	negl %ebx
	lea 45f(%ebx,%ebx,2), %ebx	# Entry = 45f - 3*dwords (3 bytes/adcl).
	testl %esi, %esi	# test clears CF for the adcl chain.
	jmp *%ebx		# Computed jump into the unrolled loop.
				# NOTE(review): plain indirect branch; a
				# modern kernel would want JMP_NOSPEC here.

	# Handle 2-byte-aligned regions
20:	addw (%esi), %ax	# Fold one word, then retry the aligned path
	lea 2(%esi), %esi	# (ecx was already decremented at 30f).
	adcl $0, %eax
	jmp 10b

30:	subl $2, %ecx		# Misaligned start: how much is there?
	ja 20b			# More than 2 bytes: realign via 20b.
	je 32f			# Exactly 2 bytes.
	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
	addl %ebx, %eax
	adcl $0, %eax
	jmp 80f
32:
	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
	adcl $0, %eax
	jmp 80f

40:
	addl -128(%esi), %eax	# Full pass: the previous pass folded its
	adcl -124(%esi), %eax	# carry at 45f, so no carry-in needed here.
	adcl -120(%esi), %eax
	adcl -116(%esi), %eax
	adcl -112(%esi), %eax
	adcl -108(%esi), %eax
	adcl -104(%esi), %eax
	adcl -100(%esi), %eax
	adcl -96(%esi), %eax
	adcl -92(%esi), %eax
	adcl -88(%esi), %eax
	adcl -84(%esi), %eax
	adcl -80(%esi), %eax
	adcl -76(%esi), %eax
	adcl -72(%esi), %eax
	adcl -68(%esi), %eax
	adcl -64(%esi), %eax
	adcl -60(%esi), %eax
	adcl -56(%esi), %eax
	adcl -52(%esi), %eax
	adcl -48(%esi), %eax
	adcl -44(%esi), %eax
	adcl -40(%esi), %eax
	adcl -36(%esi), %eax
	adcl -32(%esi), %eax
	adcl -28(%esi), %eax
	adcl -24(%esi), %eax
	adcl -20(%esi), %eax
	adcl -16(%esi), %eax
	adcl -12(%esi), %eax
	adcl -8(%esi), %eax
	adcl -4(%esi), %eax
45:
	lea 128(%esi), %esi	# lea preserves CF from the adcl chain.
	adcl $0, %eax		# Fold this pass's final carry.
	dec %ecx		# dec also preserves CF.
	jge 40b			# ecx counts len>>7 full passes down to -1.
	movl %edx, %ecx		# Recover the original length for the tail.
50:	andl $3, %ecx		# ecx = trailing byte count (0-3).
	jz 80f

	# Handle the last 1-3 bytes without jumping
	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
	movl $0xffffff,%ebx	# by the shll and shrl instructions
	shll $3,%ecx		# shift count is taken mod 32, so this
	shrl %cl,%ebx		# leaves a mask of the low 1-3 bytes
	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
	addl %ebx,%eax		# (trailing bytes live at esi-128 because
	adcl $0,%eax		# of the pre-bias; fold the last carry).
80:
	popl %ebx
	popl %esi
	ret
216
217#endif
218	EXPORT_SYMBOL(csum_partial)
219