1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# This tests basic flowtable functionality.
5# Creates following topology:
6#
7# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
# Router1 is the one doing flow offloading. Router2 has no special
# purpose other than providing a link whose MTU is smaller than that of
# either the Originator or the Responder, i.e. the TCPMSS values announced
# by the endpoints are too large and will still result in fragmentation
# and/or PMTU discovery.
12
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
# Exit status of the script (see 'exit $ret' at the very end).
ret=0

# Temp files used by the TCP transfer tests. They start out empty so that
# cleanup(), which removes them, can run before mktemp has been called.
ns1in=""
ns2in=""
ns1out=""
ns2out=""

# Remember the current value; cleanup() uses it to decide whether the
# sysctl has to be switched back.
log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)

# Skip, rather than fail, when a required tool is missing.
if ! nft --version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nft tool"
	exit $ksft_skip
fi

if ! ip -Version > /dev/null 2>&1; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi

# 'command -v' is a shell builtin, so this probe does not itself depend on
# the external 'which' utility being installed.
if ! command -v nc > /dev/null 2>&1; then
	echo "SKIP: Could not run test without nc (netcat)"
	exit $ksft_skip
fi
41
# Creating the first namespace doubles as the "can we use netns at all"
# probe: if it fails, the test is skipped.
if ! ip netns add nsr1; then
	echo "SKIP: Could not create net namespace"
	exit $ksft_skip
fi

# Remaining namespaces: the two endpoints and the second router.
for netns in ns1 ns2 nsr2; do
	ip netns add "$netns"
done
52
# Tear down what the test created: the four namespaces and the temp files.
# Globals read: ns1in, ns1out, ns2in, ns2out, log_netns.
# Installed as the EXIT trap, so it may run at any point of the script.
cleanup() {
	local i

	for i in 1 2; do
		ip netns del ns$i
		ip netns del nsr$i
	done

	rm -f "$ns1in" "$ns1out"
	rm -f "$ns2in" "$ns2out"

	# Only switch the sysctl back when the saved value was 0. The string
	# comparison also copes with an empty $log_netns (initial sysctl read
	# failed) without emitting a '[' syntax error.
	if [ "$log_netns" = "0" ]; then
		sysctl -q net.netfilter.nf_log_all_netns=$log_netns
	fi
}
64
# From here on, always tear everything down when the script exits.
trap cleanup EXIT

sysctl -q net.netfilter.nf_log_all_netns=1

# veth pairs, one row per link:
#   <netns> <device>  <peer netns> <peer device>
# ip gets /dev/null as stdin so it cannot consume rows of the table.
while read -r near_ns near_dev far_ns far_dev; do
	ip link add "$near_dev" netns "$near_ns" type veth \
		peer name "$far_dev" netns "$far_ns" < /dev/null
done <<'LINKS'
nsr1 veth0 ns1 eth0
nsr1 veth1 nsr2 veth0
nsr2 veth1 ns2 eth0
LINKS

# Bring up loopback and both veth devices in each router namespace.
for dev in lo veth0 veth1; do
	for router in nsr1 nsr2; do
		ip -net "$router" link set "$dev" up
	done
done
79
# Router addresses on the links facing the endpoints (IPv4 and IPv6).
for addr in 10.0.1.1/24 dead:1::1/64; do
	ip -net nsr1 addr add "$addr" dev veth0
done

for addr in 10.0.2.1/24 dead:2::1/64; do
	ip -net nsr2 addr add "$addr" dev veth1
done

# Use different MTUs so that packets coming from ns1 (large MTU) towards
# ns2 (smaller MTU) must be pushed to the stack, either to be fragmented
# (ip_no_pmtu_disc=1) or to trigger PMTU discovery (ICMP error sent back
# to the originator).
# ns2 is reached via nsr2 over a link with a smaller MTU, so the TCPMSS
# announced by both peers is NOT the lowest link MTU.
#   <netns> <device> <mtu>
# ip gets /dev/null as stdin so it cannot consume rows of the table.
while read -r netns dev mtu; do
	ip -net "$netns" link set "$dev" mtu "$mtu" < /dev/null
done <<'MTUS'
nsr1 veth0 9000
ns1 eth0 9000
nsr2 veth1 2000
ns2 eth0 2000
MTUS

# Transfer net between nsr1 and nsr2.
# These addresses are not used for connections.
for addr in 192.168.10.1/24 fee1:2::1/64; do
	ip -net nsr1 addr add "$addr" dev veth1
done

for addr in 192.168.10.2/24 fee1:2::2/64; do
	ip -net nsr2 addr add "$addr" dev veth0
done
105
# Per index i: enable IPv4 forwarding on router nsr$i, then configure
# endpoint ns$i (links up, addresses, default routes, TCP/PMTU sysctls).
for i in 1 2; do
	router=nsr$i
	host=ns$i

	for dev in veth0 veth1; do
		ip netns exec "$router" sysctl net.ipv4.conf.$dev.forwarding=1 > /dev/null
	done

	ip -net "$host" link set lo up
	ip -net "$host" link set eth0 up
	ip -net "$host" addr add 10.0.$i.99/24 dev eth0
	ip -net "$host" route add default via 10.0.$i.1
	ip -net "$host" addr add dead:$i::99/64 dev eth0
	ip -net "$host" route add default via dead:$i::1
	ip netns exec "$host" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null

	# don't set ip DF bit for first two tests
	ip netns exec "$host" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done

# The routers reach the far side through each other over the transfer net.
ip -net nsr1 route add default via 192.168.10.2
ip -net nsr2 route add default via 192.168.10.1
124
# Load the flowtable and forward chain into nsr1. If nft rejects the
# ruleset, the test is skipped rather than failed.
if ! ip netns exec nsr1 nft -f - <<EOF
table inet filter {
  flowtable f1 {
     hook ingress priority 0
     devices = { veth0, veth1 }
   }

   chain forward {
      type filter hook forward priority 0; policy drop;

      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
      meta oif "veth1" tcp dport 12345 flow offload @f1 counter

      # use packet size to trigger 'should be offloaded by now'.
      # otherwise, if 'flow offload' expression never offloads, the
      # test will pass.
      tcp dport 12345 meta length gt 200 ct mark set 1 counter

      # this turns off flow offloading internally, so expect packets again
      tcp flags fin,rst ct mark set 0 accept

      # this allows large packets from responder, we need this as long
      # as PMTUd is off.
      # This rule is deleted for the last test, when we expect PMTUd
      # to kick in and ensure all packets meet mtu requirements.
      meta length gt 1500 accept comment something-to-grep-for

      # next line blocks connection w.o. working offload.
      # we only do this for reverse dir, because we expect packets to
      # enter slow path due to MTU mismatch of veth0 and veth1.
      tcp sport 12345 ct mark 1 counter log prefix "mark failure " drop

      ct state established,related accept

      # for packets that we can't offload yet, i.e. SYN (any ct that is not confirmed)
      meta length lt 200 oif "veth1" tcp dport 12345 counter accept

      meta nfproto ipv4 meta l4proto icmp accept
      meta nfproto ipv6 meta l4proto icmpv6 accept
   }
}
EOF
then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi
172
# Test basic connectivity in both directions before any TCP test runs.
# A failure here is fatal. No interactive shell is started on failure, so
# the script cannot hang when it is run unattended.
if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
	echo "ERROR: ns1 cannot reach ns2" 1>&2
	exit 1
fi

if ! ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
	echo "ERROR: ns2 cannot reach ns1" 1>&2
	exit 1
fi

if [ "$ret" -eq 0 ]; then
	echo "PASS: netns routing/connectivity: ns1 can reach ns2"
fi

# Data sent by ns1/ns2 (*in) and data received from the peer (*out).
# cleanup() removes these files on exit.
ns1in=$(mktemp)
ns1out=$(mktemp)
ns2in=$(mktemp)
ns2out=$(mktemp)
195
# make_file NAME [WHO]
#
# Fill file NAME with random data: a random number (0..8191) of 1024-byte
# blocks followed by a random tail of 128..1151 single bytes, so the total
# size is never a multiple of the block size by construction of the first
# part alone and the file is never empty.
#
# Arguments: $1 - file to (over)write
#            $2 - accepted for the callers' readability, not used
# Globals:   SIZE  (written) - size of the tail in bytes after return
#            TSIZE (written) - total number of bytes written to NAME
# Returns:   exit status of the final dd
make_file()
{
	local name=$1

	SIZE=$((RANDOM % (1024 * 8)))
	TSIZE=$((SIZE * 1024))

	dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null

	SIZE=$((RANDOM % 1024))
	SIZE=$((SIZE + 128))
	TSIZE=$((TSIZE + SIZE))
	# Append the tail through '>>' instead of of=, so the blocks written
	# above are neither truncated nor overwritten.
	dd if=/dev/urandom bs=1 count=$SIZE 2> /dev/null >> "$name"
}
211
# check_transfer SENT RECEIVED LABEL
#
# Compare the file that was sent with the file that was received.
# On mismatch, print a FAIL line naming LABEL to stderr and list both
# files with 'ls -l' on stdout.
#
# Returns: 0 if the files are identical, 1 otherwise.
check_transfer()
{
	local sent=$1
	local received=$2
	local label=$3

	if cmp "$sent" "$received" > /dev/null 2>&1; then
		return 0
	fi

	echo "FAIL: file mismatch for $label" 1>&2
	ls -l "$sent"
	ls -l "$received"
	return 1
}
228
# test_tcp_forwarding CLIENT_NS SERVER_NS
#
# Run one bidirectional netcat transfer on TCP port 12345: a listener in
# SERVER_NS sends $ns2in and stores what it receives in $ns2out, a client
# in CLIENT_NS connects to 10.0.2.99, sends $ns1in and stores what it
# receives in $ns1out. Both directions are then verified with
# check_transfer.
#
# Returns: 0 if both directions transferred intact, 1 otherwise.
test_tcp_forwarding()
{
	local client_ns=$1
	local server_ns=$2
	local listener_pid
	local client_pid
	local status=0

	ip netns exec "$server_ns" nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" &
	listener_pid=$!

	# give the listener time to start before connecting
	sleep 1
	ip netns exec "$client_ns" nc -w 4 10.0.2.99 12345 < "$ns1in" > "$ns1out" &
	client_pid=$!

	# time allowed for the transfer before both ends are stopped
	sleep 3

	kill $listener_pid
	kill $client_pid
	wait

	check_transfer "$ns1in" "$ns2out" "ns1 -> ns2" || status=1
	check_transfer "$ns2in" "$ns1out" "ns1 <- ns2" || status=1

	return $status
}
260
# Random payloads, one per direction; reused by all the transfer tests.
make_file "$ns1in" "ns1"
make_file "$ns2in" "ns2"

# First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2
# as needed.
if test_tcp_forwarding ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2"
else
	echo "FAIL: flow offload for ns1/ns2:" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
fi
274
# Remove ns2's default routes: ns2 can then no longer reach ns1 directly
# and depends on ns1 being masqueraded by nsr1, i.e. ns1 is expected to
# show up with nsr1's address. Only a host route to that address is kept.
ip -net ns2 route del default via 10.0.2.1
ip -net ns2 route del default via dead:2::1
ip -net ns2 route add 192.168.10.1 via 10.0.2.1

# Second test:
# Same as the first one, but with NAT (masquerade on veth1) enabled.
ip netns exec nsr1 nft -f - <<EOF
table ip nat {
   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" masquerade
   }
}
EOF

if test_tcp_forwarding ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2 with NAT"
else
	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
fi
302
# Third test:
# Same as second test, but with PMTU discovery enabled.
#
# Look up the rule that accepts large packets via its comment. The text
# after '#' in the 'nft -a' listing is "handle <N>".
handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2)

# $handle is deliberately unquoted: it must split into "handle" and "<N>".
if ! ip netns exec nsr1 nft delete rule inet filter forward $handle; then
	echo "FAIL: Could not delete large-packet accept rule"
	exit 1
fi

# Re-enable PMTU discovery on both endpoints.
ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null

if test_tcp_forwarding ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery"
else
	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
	ip netns exec nsr1 nft list ruleset
	# Record the failure in the exit status, like the first two tests do.
	ret=1
fi
323
# Throw-away key material for the IPsec test: hex digests of the current
# process listing, prefixed with "0x".
KEY_SHA="0x$(ps -xaf | sha1sum | cut -d " " -f 1)"
KEY_AES="0x$(ps -xaf | md5sum | cut -d " " -f 1)"

# Two random SPIs, one per direction; they must differ.
SPI1=$RANDOM
SPI2=$RANDOM
[ "$SPI1" -ne "$SPI2" ] || SPI2=$((SPI2 + 1))
332
# do_esp NETNS LOCAL_EP REMOTE_EP LOCAL_NET REMOTE_NET SPI_OUT SPI_IN
#
# Configure one end of an ESP tunnel-mode association inside NETNS:
# an inbound and an outbound xfrm state between the two tunnel endpoints,
# plus an 'out' and a 'fwd' policy for traffic between the two subnets.
# Globals read: KEY_AES (encryption key), KEY_SHA (authentication key).
do_esp() {
    local netns=$1
    local local_ep=$2
    local remote_ep=$3
    local local_net=$4
    local remote_net=$5
    local out_spi=$6
    local in_spi=$7
    local -a xfrm=(ip -net "$netns" xfrm)
    local -a crypto=(enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel)
    local -a policy_tail=(proto esp mode tunnel priority 1 action allow)

    # inbound state (remote -> local), then outbound state (local -> remote)
    "${xfrm[@]}" state add src "$remote_ep" dst "$local_ep" proto esp spi "$in_spi" \
        "${crypto[@]}" sel src "$remote_net" dst "$local_net"
    "${xfrm[@]}" state add src "$local_ep" dst "$remote_ep" proto esp spi "$out_spi" \
        "${crypto[@]}" sel src "$local_net" dst "$remote_net"

    # to encrypt packets as they go out (includes forwarded packets that
    # need encapsulation)
    "${xfrm[@]}" policy add src "$local_net" dst "$remote_net" dir out \
        tmpl src "$local_ep" dst "$remote_ep" "${policy_tail[@]}"
    # to fwd decrypted packets after esp processing:
    "${xfrm[@]}" policy add src "$remote_net" dst "$local_net" dir fwd \
        tmpl src "$remote_ep" dst "$local_ep" "${policy_tail[@]}"
}
351
# Fourth test:
# IPsec tunnel mode between the two routers, without NAT.
# The SPIs are swapped between the two ends: what nsr1 sends with, nsr2
# receives with, and vice versa.
do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2

do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1

# Drop the masquerade rule installed for the NAT tests.
ip netns exec nsr1 nft delete table ip nat

# restore default routes
ip -net ns2 route del 192.168.10.1 via 10.0.2.1
ip -net ns2 route add default via 10.0.2.1
ip -net ns2 route add default via dead:2::1

if test_tcp_forwarding ns1 ns2; then
	echo "PASS: ipsec tunnel mode for ns1/ns2"
else
	echo "FAIL: ipsec tunnel mode for ns1/ns2"
	ip netns exec nsr1 nft list ruleset 1>&2
	ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2
	# Record the failure in the exit status, like the first two tests do.
	ret=1
fi

exit $ret
373