#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This tests basic flowtable functionality.
# Creates following default topology:
#
# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
# Router1 is the one doing flow offloading, Router2 has no special
# purpose other than having a link that is smaller than either Originator
# and responder, i.e. TCPMSS announced values are too large and will still
# result in fragmentation and/or PMTU discovery.
#
# You can check with different Originator/Link/Responder MTU eg:
# nft_flowtable.sh -o8000 -l1500 -r2000
#

# Random suffix so that concurrent runs do not clash on namespace names.
sfx=$(mktemp -u "XXXXXXXX")
ns1="ns1-$sfx"
ns2="ns2-$sfx"
nsr1="nsr1-$sfx"
nsr2="nsr2-$sfx"

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
ret=0

# Temporary files: random payload and what each endpoint received.
nsin=""
ns1out=""
ns2out=""

# Remember the current setting so cleanup() can restore it.
log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)

# checktool CMD DESCRIPTION
# Runs CMD (word-split on purpose, it carries its own arguments) and skips
# the whole test with "SKIP: Could not DESCRIPTION" if it fails.
checktool (){
	if ! $1 > /dev/null 2>&1; then
		echo "SKIP: Could not $2"
		exit $ksft_skip
	fi
}

checktool "nft --version" "run test without nft tool"
checktool "ip -Version" "run test without ip tool"
checktool "command -v nc" "run test without nc (netcat)"
checktool "ip netns add $nsr1" "create net namespace $nsr1"

ip netns add "$ns1"
ip netns add "$ns2"
ip netns add "$nsr2"

# EXIT trap handler: removes all namespaces and temporary files and restores
# nf_log_all_netns if this script changed it.
cleanup() {
	ip netns del "$ns1"
	ip netns del "$ns2"
	ip netns del "$nsr1"
	ip netns del "$nsr2"

	rm -f "$nsin" "$ns1out" "$ns2out"

	# The script forces the sysctl to 1 below, so it only needs to be put
	# back when it was 0 before.  String compare: the saved value is empty
	# when the sysctl could not be read.
	if [ "$log_netns" = "0" ]; then
		sysctl -q net.netfilter.nf_log_all_netns="$log_netns"
	fi
}

trap cleanup EXIT

sysctl -q net.netfilter.nf_log_all_netns=1

ip link add veth0 netns "$nsr1" type veth peer name eth0 netns "$ns1"
ip link add veth1 netns "$nsr1" type veth peer name veth0 netns "$nsr2"

ip link add veth1 netns "$nsr2" type veth peer name eth0 netns "$ns2"

for dev in lo veth0 veth1; do
	ip -net "$nsr1" link set "$dev" up
	ip -net "$nsr2" link set "$dev" up
done

ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0
ip -net "$nsr1" addr add dead:1::1/64 dev veth0

ip -net "$nsr2" addr add 10.0.2.1/24 dev veth1
ip -net "$nsr2" addr add dead:2::1/64 dev veth1

# set different MTUs so we need to push packets coming from ns1 (large MTU)
# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1),
# or to do PMTU discovery (send ICMP error back to originator).
# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers
# is NOT the lowest link mtu.

omtu=9000
lmtu=1500
rmtu=2000

# Print the command line help and exit with status 1.
usage(){
	echo "nft_flowtable.sh [OPTIONS]"
	echo
	echo "MTU options"
	echo "   -o originator"
	echo "   -l link"
	echo "   -r responder"
	exit 1
}

while getopts "o:l:r:" o
do
	case $o in
		o) omtu=$OPTARG;;
		l) lmtu=$OPTARG;;
		r) rmtu=$OPTARG;;
		*) usage;;
	esac
done

if ! ip -net "$nsr1" link set veth0 mtu "$omtu"; then
	exit 1
fi

ip -net "$ns1" link set eth0 mtu "$omtu"

if ! ip -net "$nsr2" link set veth1 mtu "$rmtu"; then
	exit 1
fi

# Apply the -l value to both ends of the router-to-router link; without
# this the option is parsed but has no effect.
if ! ip -net "$nsr1" link set veth1 mtu "$lmtu"; then
	exit 1
fi

if ! ip -net "$nsr2" link set veth0 mtu "$lmtu"; then
	exit 1
fi

ip -net "$ns2" link set eth0 mtu "$rmtu"

# transfer-net between nsr1 and nsr2.
# these addresses are not used for connections.
ip -net "$nsr1" addr add 192.168.10.1/24 dev veth1
ip -net "$nsr1" addr add fee1:2::1/64 dev veth1

ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0
ip -net "$nsr2" addr add fee1:2::2/64 dev veth0

for i in 0 1; do
	ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
	ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null
done

for ns in "$ns1" "$ns2"; do
	ip -net "$ns" link set lo up
	ip -net "$ns" link set eth0 up

	if ! ip netns exec "$ns" sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
		echo "ERROR: Check Originator/Responder values (problem during address addition)"
		exit 1
	fi
	# don't set ip DF bit for first two tests
	ip netns exec "$ns" sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done

ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
ip -net "$ns1" route add default via 10.0.1.1
ip -net "$ns2" route add default via 10.0.2.1
ip -net "$ns1" addr add dead:1::99/64 dev eth0
ip -net "$ns2" addr add dead:2::99/64 dev eth0
ip -net "$ns1" route add default via dead:1::1
ip -net "$ns2" route add default via dead:2::1

ip -net "$nsr1" route add default via 192.168.10.2
ip -net "$nsr2" route add default via 192.168.10.1

ip netns exec "$nsr1" nft -f - <<EOF
table inet filter {
  flowtable f1 {
     hook ingress priority 0
     devices = { veth0, veth1 }
   }

   counter routed_orig { }
   counter routed_repl { }

   chain forward {
      type filter hook forward priority 0; policy drop;

      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
      meta oif "veth1" tcp dport 12345 ct mark set 1 flow add @f1 counter name routed_orig accept

      # count packets supposedly offloaded as per direction.
      ct mark 1 counter name ct direction map { original : routed_orig, reply : routed_repl } accept

      ct state established,related accept

      meta nfproto ipv4 meta l4proto icmp accept
      meta nfproto ipv6 meta l4proto icmpv6 accept
   }
}
EOF

if [ $? -ne 0 ]; then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi

ip netns exec "$ns2" nft -f - <<EOF
table inet filter {
   counter ip4dscp0 { }
   counter ip4dscp3 { }

   chain input {
      type filter hook input priority 0; policy accept;
      meta l4proto tcp goto {
	      ip dscp cs3 counter name ip4dscp3 accept
	      ip dscp 0 counter name ip4dscp0 accept
      }
   }
}
EOF

if [ $? -ne 0 ]; then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi

# test basic connectivity
if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
	echo "ERROR: $ns1 cannot reach $ns2" 1>&2
	exit 1
fi

if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then
	echo "ERROR: $ns2 cannot reach $ns1" 1>&2
	exit 1
fi

if [ $ret -eq 0 ];then
	echo "PASS: netns routing/connectivity: $ns1 can reach $ns2"
fi

nsin=$(mktemp)
ns1out=$(mktemp)
ns2out=$(mktemp)

# make_file NAME
# Fills NAME with random data: between 8 MiB and roughly 40 MiB in 1 KiB
# blocks, followed by 128..1151 extra bytes so that the size is not a
# multiple of the block size.
make_file()
{
	local name=$1
	local kbytes tail_bytes

	kbytes=$((RANDOM % (1024 * 128) + 1024 * 8))
	dd if=/dev/urandom of="$name" bs=1024 count="$kbytes" 2> /dev/null

	# conv=notrunc keeps the data written above and oflag=append places
	# the tail after it instead of overwriting the start of the file.
	tail_bytes=$((RANDOM % 1024 + 128))
	dd if=/dev/urandom of="$name" conv=notrunc oflag=append bs=1 count="$tail_bytes" 2> /dev/null
}

# check_counters WHAT
# Reads and resets the routed_orig/routed_repl byte counters in nsr1 and
# verifies that the classic forward path saw at most the payload size
# (original direction) resp. a quarter of it (reply direction).
# Prints PASS/FAIL for WHAT and sets the global ret to 1 on failure.
check_counters()
{
	local what=$1
	local ok=1
	local orig repl orig_cnt repl_cnt max_orig max_repl

	orig=$(ip netns exec "$nsr1" nft reset counter inet filter routed_orig | grep packets)
	repl=$(ip netns exec "$nsr1" nft reset counter inet filter routed_repl | grep packets)

	# "packets N bytes M" -> " M"
	orig_cnt=${orig#*bytes}
	repl_cnt=${repl#*bytes}

	max_orig=$(wc -c < "$nsin")
	max_repl=$((max_orig/4))

	# The counts are left unquoted so that the padding left over from the
	# parameter expansion above is dropped by word splitting.
	if [ $orig_cnt -gt $max_orig ];then
		echo "FAIL: $what: original counter $orig_cnt exceeds expected value $max_orig" 1>&2
		ret=1
		ok=0
	fi

	if [ $repl_cnt -gt $max_repl ];then
		echo "FAIL: $what: reply counter $repl_cnt exceeds expected value $max_repl" 1>&2
		ret=1
		ok=0
	fi

	if [ $ok -eq 1 ]; then
		echo "PASS: $what"
	fi
}

# check_dscp WHAT
# Reads and resets the ip4dscp3/ip4dscp0 packet counters in ns2 and compares
# them with the expectation selected by WHAT (dscp_none, dscp_fwd,
# dscp_ingress or dscp_egress).  Sets the global ret to 1 on mismatch.
check_dscp()
{
	local what=$1
	local ok=1
	local counter pc4 pc4z

	# "packets N bytes M" -> " N "
	counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp3 | grep packets)
	pc4=${counter%*bytes*}
	pc4=${pc4#*packets}

	counter=$(ip netns exec "$ns2" nft reset counter inet filter ip4dscp0 | grep packets)
	pc4z=${counter%*bytes*}
	pc4z=${pc4z#*packets}

	case "$what" in
	"dscp_none")
		if [ $pc4 -gt 0 ] || [ $pc4z -eq 0 ]; then
			echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2
			ret=1
			ok=0
		fi
		;;
	"dscp_fwd")
		if [ $pc4 -eq 0 ] || [ $pc4z -eq 0 ]; then
			echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2
			ret=1
			ok=0
		fi
		;;
	"dscp_ingress"|"dscp_egress")
		if [ $pc4 -eq 0 ] || [ $pc4z -gt 0 ]; then
			echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
			ret=1
			ok=0
		fi
		;;
	*)
		echo "FAIL: Unknown DSCP check" 1>&2
		ret=1
		ok=0
	esac

	if [ $ok -eq 1 ] ;then
		echo "PASS: $what: dscp packet counters match"
	fi
}

# check_transfer IN OUT WHAT
# Returns 0 if files IN and OUT are identical; otherwise reports the
# mismatch for WHAT, lists both files and returns 1.
check_transfer()
{
	# Locals on purpose: callers keep their own "what" across this call.
	local src=$1
	local dst=$2
	local what=$3

	if ! cmp "$src" "$dst" > /dev/null 2>&1; then
		echo "FAIL: file mismatch for $what" 1>&2
		ls -l "$src"
		ls -l "$dst"
		return 1
	fi

	return 0
}

# test_tcp_forwarding_ip NSA NSB DSTIP DSTPORT
# Starts a netcat listener on port 12345 in NSB and a client in NSA that
# connects to DSTIP:DSTPORT; both send $nsin and store what they receive
# in $ns2out (listener) and $ns1out (client).
# Returns 0 if both received files match $nsin, 1 otherwise.
test_tcp_forwarding_ip()
{
	local nsa=$1
	local nsb=$2
	local dstip=$3
	local dstport=$4
	local lret=0
	local lpid cpid prev

	ip netns exec "$nsb" nc -w 5 -l -p 12345 < "$nsin" > "$ns2out" &
	lpid=$!

	sleep 1
	ip netns exec "$nsa" nc -w 4 "$dstip" "$dstport" < "$nsin" > "$ns1out" &
	cpid=$!

	sleep 1

	# Poll until the listing of both output files stops changing.
	prev="$(ls -l "$ns1out" "$ns2out")"
	sleep 1

	while [[ "$prev" != "$(ls -l "$ns1out" "$ns2out")" ]]; do
		sleep 1;
		prev="$(ls -l "$ns1out" "$ns2out")"
	done

	if test -d /proc/"$lpid"/; then
		kill "$lpid"
	fi

	if test -d /proc/"$cpid"/; then
		kill "$cpid"
	fi

	wait "$lpid"
	wait "$cpid"

	if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then
		lret=1
	fi

	if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then
		lret=1
	fi

	return $lret
}

# test_tcp_forwarding NSA NSB
# Plain transfer from NSA to the responder address 10.0.2.99:12345.
test_tcp_forwarding()
{
	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345

	return $?
}

# test_tcp_forwarding_set_dscp NSA NSB
# Repeats the transfer while nsr1 rewrites the DSCP field to cs3 at netdev
# ingress, netdev egress and finally in the forward chain, checking the
# dscp counters in ns2 after each step.  Mismatches are recorded by
# check_dscp in the global ret.
test_tcp_forwarding_set_dscp()
{
	check_dscp "dscp_none"

	ip netns exec "$nsr1" nft -f - <<EOF
table netdev dscpmangle {
   chain setdscp0 {
      type filter hook ingress device "veth0" priority 0; policy accept
	ip dscp set cs3
  }
}
EOF
	if [ $? -eq 0 ]; then
		test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
		check_dscp "dscp_ingress"

		ip netns exec "$nsr1" nft delete table netdev dscpmangle
	else
		echo "SKIP: Could not load netdev:ingress for veth0"
	fi

	ip netns exec "$nsr1" nft -f - <<EOF
table netdev dscpmangle {
   chain setdscp0 {
      type filter hook egress device "veth1" priority 0; policy accept
      ip dscp set cs3
  }
}
EOF
	if [ $? -eq 0 ]; then
		test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
		check_dscp "dscp_egress"

		ip netns exec "$nsr1" nft flush table netdev dscpmangle
	else
		echo "SKIP: Could not load netdev:egress for veth1"
	fi

	# partial.  If flowtable really works, then both dscp-is-0 and dscp-is-cs3
	# counters should have seen packets (before and after ft offload kicks in).
	ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3
	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
	check_dscp "dscp_fwd"
}

# test_tcp_forwarding_nat NSA NSB PMTU WHAT
# Runs one transfer to the real responder address (masquerade) and, if that
# worked, one to 10.6.6.6:1666 (dnat).  With PMTU set to 1 the nsr1 counters
# are checked after each transfer.  WHAT is appended to the result messages.
# Returns the status of the last transfer that was run.
test_tcp_forwarding_nat()
{
	local lret
	local pmtu
	local what

	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
	lret=$?

	pmtu=$3
	what=$4

	if [ $lret -eq 0 ] ; then
		if [ $pmtu -eq 1 ] ;then
			check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what"
		else
			echo "PASS: flow offload for ns1/ns2 with masquerade $what"
		fi

		test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
		lret=$?
		if [ $pmtu -eq 1 ] ;then
			check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what"
		elif [ $lret -eq 0 ] ; then
			echo "PASS: flow offload for ns1/ns2 with dnat $what"
		fi
	fi

	return $lret
}

make_file "$nsin"

# First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
# Due to MTU mismatch in both directions, all packets (except small packets like pure
# acks) have to be handled by normal forwarding path.  Therefore, packet counters
# are not checked.
if test_tcp_forwarding "$ns1" "$ns2"; then
	echo "PASS: flow offloaded for ns1/ns2"
else
	echo "FAIL: flow offload for ns1/ns2:" 1>&2
	ip netns exec "$nsr1" nft list ruleset
	ret=1
fi

# delete default route, i.e. ns2 won't be able to reach ns1 and
# will depend on ns1 being masqueraded in nsr1.
# expect ns1 has nsr1 address.
ip -net "$ns2" route del default via 10.0.2.1
ip -net "$ns2" route del default via dead:2::1
ip -net "$ns2" route add 192.168.10.1 via 10.0.2.1

# Second test:
# Same, but with NAT enabled.  Same as in first test: we expect normal forward path
# to handle most packets.
# Load the NAT rules in nsr1: connections from ns1 to 10.6.6.6:1666 are
# redirected to the responder, everything leaving via veth1 is masqueraded.
ip netns exec "$nsr1" nft -f - <<EOF
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "veth0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then
	echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2
	# Record the failure and keep going; exiting with status 0 here would
	# report the whole run as successful.
	ret=1
fi

if ! test_tcp_forwarding_nat "$ns1" "$ns2" 0 ""; then
	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
	ip netns exec "$nsr1" nft list ruleset
	ret=1
fi

# Third test:
# Same as second test, but with PMTU discovery enabled. This
# means that we expect the fastpath to handle packets as soon
# as the endpoints adjust the packet size.
ip netns exec "$ns1" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null

# reset counters.
# With pmtu in-place we'll also check that nft counters
# are lower than file size and packets were forwarded via flowtable layer.
# For earlier tests (large mtus), packets cannot be handled via flowtable
# (except pure acks and other small packets).
ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null

if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then
	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
	ip netns exec "$nsr1" nft list ruleset
	# Like every other failing test, this must be reflected in the exit code.
	ret=1
fi

# Another test:
# Add bridge interface br0 to Router1, with NAT enabled.
# Move the ns1-facing address of nsr1 from veth0 to a new bridge br0 that
# has veth0 as its port.
ip -net "$nsr1" link add name br0 type bridge
ip -net "$nsr1" addr flush dev veth0
ip -net "$nsr1" link set up dev veth0
ip -net "$nsr1" link set veth0 master br0
ip -net "$nsr1" addr add 10.0.1.1/24 dev br0
ip -net "$nsr1" addr add dead:1::1/64 dev br0
ip -net "$nsr1" link set up dev br0

ip netns exec "$nsr1" sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null

# br0 with NAT enabled.
ip netns exec "$nsr1" nft -f - <<EOF
flush table ip nat
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "on bridge"; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
	ip netns exec "$nsr1" nft list ruleset
	ret=1
fi


# Another test:
# Add bridge interface br0 to Router1, with NAT and VLAN.
ip -net "$nsr1" link set veth0 nomaster
ip -net "$nsr1" link set down dev veth0
ip -net "$nsr1" link add link veth0 name veth0.10 type vlan id 10
ip -net "$nsr1" link set up dev veth0
ip -net "$nsr1" link set up dev veth0.10
ip -net "$nsr1" link set veth0.10 master br0

ip -net "$ns1" addr flush dev eth0
ip -net "$ns1" link add link eth0 name eth0.10 type vlan id 10
ip -net "$ns1" link set eth0 up
ip -net "$ns1" link set eth0.10 up
ip -net "$ns1" addr add 10.0.1.99/24 dev eth0.10
ip -net "$ns1" route add default via 10.0.1.1
ip -net "$ns1" addr add dead:1::99/64 dev eth0.10

if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "bridge and VLAN"; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
	ip netns exec "$nsr1" nft list ruleset
	ret=1
fi

# restore test topology (remove bridge and VLAN)
ip -net "$nsr1" link set veth0 nomaster
ip -net "$nsr1" link set veth0 down
ip -net "$nsr1" link set veth0.10 down
ip -net "$nsr1" link delete veth0.10 type vlan
ip -net "$nsr1" link delete br0 type bridge
ip -net "$ns1" addr flush dev eth0.10
ip -net "$ns1" link set eth0.10 down
ip -net "$ns1" link set eth0 down
ip -net "$ns1" link delete eth0.10 type vlan

# restore address in ns1 and nsr1
ip -net "$ns1" link set eth0 up
ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
ip -net "$ns1" route add default via 10.0.1.1
ip -net "$ns1" addr add dead:1::99/64 dev eth0
ip -net "$ns1" route add default via dead:1::1
ip -net "$nsr1" addr add 10.0.1.1/24 dev veth0
ip -net "$nsr1" addr add dead:1::1/64 dev veth0
ip -net "$nsr1" link set up dev veth0

# Throw-away keys for the test tunnel, derived from the process listing.
KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1)
KEY_AES="0x"$(ps -af | md5sum | cut -d " " -f 1)

# RFC 4303 reserves SPI 0 and 1..255, so keep both values above that range.
SPI1=$((RANDOM + 256))
SPI2=$((RANDOM + 256))

if [ $SPI1 -eq $SPI2 ]; then
	SPI2=$((SPI2+1))
fi

# do_esp NS ME REMOTE LNET RNET SPI_OUT SPI_IN
# Installs ESP tunnel mode states (both directions) and the out/fwd
# policies in namespace NS for traffic between the local network LNET and
# the remote network RNET, using tunnel endpoints ME and REMOTE.
do_esp() {
	local ns=$1
	local me=$2
	local remote=$3
	local lnet=$4
	local rnet=$5
	local spi_out=$6
	local spi_in=$7

	ip -net "$ns" xfrm state add src "$remote" dst "$me" proto esp spi "$spi_in" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$rnet" dst "$lnet"
	ip -net "$ns" xfrm state add src "$me" dst "$remote" proto esp spi "$spi_out" enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel sel src "$lnet" dst "$rnet"

	# to encrypt packets as they go out (includes forwarded packets that need encapsulation)
	ip -net "$ns" xfrm policy add src "$lnet" dst "$rnet" dir out tmpl src "$me" dst "$remote" proto esp mode tunnel priority 1 action allow
	# to fwd decrypted packets after esp processing:
	ip -net "$ns" xfrm policy add src "$rnet" dst "$lnet" dir fwd tmpl src "$remote" dst "$me" proto esp mode tunnel priority 1 action allow

}

do_esp "$nsr1" 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 "$SPI1" "$SPI2"

do_esp "$nsr2" 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 "$SPI2" "$SPI1"

ip netns exec "$nsr1" nft delete table ip nat

# restore default routes
ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1
ip -net "$ns2" route add default via 10.0.2.1
ip -net "$ns2" route add default via dead:2::1

if test_tcp_forwarding "$ns1" "$ns2"; then
	check_counters "ipsec tunnel mode for ns1/ns2"
else
	echo "FAIL: ipsec tunnel mode for ns1/ns2" 1>&2
	ip netns exec "$nsr1" nft list ruleset 1>&2
	ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2
	# Without this a broken tunnel would still yield exit status 0.
	ret=1
fi

exit $ret