// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>

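/* Each flow entry pairs the flow_offload object exposed to the datapath
 * with the conntrack entry it was derived from; it holds a reference on
 * the conntrack entry and is freed via RCU once unlinked from the table.
 */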
struct flow_offload_entry {
	struct flow_offload	flow;
	struct nf_conn		*ct;
	struct rcu_head		rcu_head;
};

static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);

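/* Populate one direction of the flow tuple from the conntrack tuple and
 * the route: addresses and ports come from conntrack, the MTU from this
 * direction's dst, and the input interface from the other direction's
 * dst device. Ports are read through the tcp member of the union, which
 * aliases the udp member, so this covers both protocols.
 */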
static void
flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
		      struct nf_flow_route *route,
		      enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
	struct dst_entry *other_dst = route->tuple[!dir].dst;
	struct dst_entry *dst = route->tuple[dir].dst;

	ft->dir = dir;

	switch (ctt->src.l3num) {
	case NFPROTO_IPV4:
		ft->src_v4 = ctt->src.u3.in;
		ft->dst_v4 = ctt->dst.u3.in;
		ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		ft->src_v6 = ctt->src.u3.in6;
		ft->dst_v6 = ctt->dst.u3.in6;
		ft->mtu = ip6_dst_mtu_forward(dst);
		break;
	}

	ft->l3proto = ctt->src.l3num;
	ft->l4proto = ctt->dst.protonum;
	ft->src_port = ctt->src.u.tcp.port;
	ft->dst_port = ctt->dst.u.tcp.port;

	ft->iifidx = other_dst->dev->ifindex;
	ft->dst_cache = dst;
}

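/* Allocate a flow entry for an established connection: take a reference
 * on the conntrack entry and on the dst of each direction, fill in both
 * tuples and mirror the connection's NAT flags. Returns NULL with all
 * references dropped on failure.
 */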
struct flow_offload *
flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
{
	struct flow_offload_entry *entry;
	struct flow_offload *flow;

	if (unlikely(nf_ct_is_dying(ct) ||
	    !atomic_inc_not_zero(&ct->ct_general.use)))
		return NULL;

	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
	if (!entry)
		goto err_ct_refcnt;

	flow = &entry->flow;

	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
		goto err_dst_cache_original;

	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
		goto err_dst_cache_reply;

	entry->ct = ct;

	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);

	if (ct->status & IPS_SRC_NAT)
		flow->flags |= FLOW_OFFLOAD_SNAT;
	if (ct->status & IPS_DST_NAT)
		flow->flags |= FLOW_OFFLOAD_DNAT;

	return flow;

err_dst_cache_reply:
	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
err_dst_cache_original:
	kfree(entry);
err_ct_refcnt:
	nf_ct_put(ct);

	return NULL;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);

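/* Packets return to the slow path with conntrack state that went stale
 * while the flow was offloaded: force the TCP state back to ESTABLISHED
 * and clear the learned windows so window tracking is re-initialized
 * from the next packets seen.
 */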
static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
	tcp->state = TCP_CONNTRACK_ESTABLISHED;
	tcp->seen[0].td_maxwin = 0;
	tcp->seen[1].td_maxwin = 0;
}

#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)

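/* Signed jiffies difference, safe across jiffies wraparound: positive
 * while the timeout still lies in the future.
 */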
static inline __s32 nf_flow_timeout_delta(unsigned int timeout)
{
	return (__s32)(timeout - (u32)jiffies);
}

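/* Once a TCP or UDP flow leaves the offload path, clamp the conntrack
 * timeout down to a short "pickup" window instead of keeping whatever
 * large timeout was accumulated while the flow was offloaded.
 */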
static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	int l4num = nf_ct_protonum(ct);
	unsigned int timeout;

	l4proto = nf_ct_l4proto_find(l4num);
	if (!l4proto)
		return;

	if (l4num == IPPROTO_TCP)
		timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
	else if (l4num == IPPROTO_UDP)
		timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
	else
		return;

	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
		ct->timeout = nfct_time_stamp + timeout;
}

static void flow_offload_fixup_ct_state(struct nf_conn *ct)
{
	if (nf_ct_protonum(ct) == IPPROTO_TCP)
		flow_offload_fixup_tcp(&ct->proto.tcp);
}

static void flow_offload_fixup_ct(struct nf_conn *ct)
{
	flow_offload_fixup_ct_state(ct);
	flow_offload_fixup_ct_timeout(ct);
}

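/* Release both cached dsts and the conntrack reference; if the flow was
 * marked dying, delete the conntrack entry as well. The entry itself is
 * freed after an RCU grace period, since lookups run under RCU.
 */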
void flow_offload_free(struct flow_offload *flow)
{
	struct flow_offload_entry *e;

	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
	e = container_of(flow, struct flow_offload_entry, flow);
	if (flow->flags & FLOW_OFFLOAD_DYING)
		nf_ct_delete(e->ct, 0, 0);
	nf_ct_put(e->ct);
	kfree_rcu(e, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);

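/* Hash and compare only the lookup key, i.e. the tuple fields laid out
 * before ->dir; the direction and cached state that follow it are
 * deliberately excluded from hashing and comparison.
 */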
static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple *tuple = data;

	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
}

static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple_rhash *tuplehash = data;

	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
}

static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
				 const void *ptr)
{
	const struct flow_offload_tuple *tuple = arg->key;
	const struct flow_offload_tuple_rhash *x = ptr;

	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
		return 1;

	return 0;
}

static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn			= flow_offload_hash,
	.obj_hashfn		= flow_offload_hash_obj,
	.obj_cmpfn		= flow_offload_hash_cmp,
	.automatic_shrinking	= true,
};

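/* Insert both directions of the flow into the table. If the reply
 * direction cannot be inserted, unwind the original direction so the
 * table never holds a half-inserted flow.
 */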
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);

static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
	return nf_flow_timeout_delta(flow->timeout) <= 0;
}

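/* Unlink both directions from the table, hand the connection back to
 * conntrack by clearing IPS_OFFLOAD, and adjust the conntrack state and
 * timeout depending on why the flow is going away.
 */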
static void flow_offload_del(struct nf_flowtable *flow_table,
			     struct flow_offload *flow)
{
	struct flow_offload_entry *e;

	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
			       nf_flow_offload_rhash_params);
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
			       nf_flow_offload_rhash_params);

	e = container_of(flow, struct flow_offload_entry, flow);
	clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);

	if (nf_flow_has_expired(flow))
		flow_offload_fixup_ct(e->ct);
	else if (flow->flags & FLOW_OFFLOAD_TEARDOWN)
		flow_offload_fixup_ct_timeout(e->ct);

	flow_offload_free(flow);
}

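/* Mark a flow for removal by the garbage collector, e.g. when a TCP FIN
 * or RST is seen, and restore the conntrack TCP state right away so the
 * connection keeps working through the slow path.
 */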
void flow_offload_teardown(struct flow_offload *flow)
{
	struct flow_offload_entry *e;

	flow->flags |= FLOW_OFFLOAD_TEARDOWN;

	e = container_of(flow, struct flow_offload_entry, flow);
	flow_offload_fixup_ct_state(e->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);

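/* Look up a flow by tuple. The stored direction tells container_of()
 * which tuplehash slot matched. Flows that are dying or being torn down
 * are treated as misses so packets fall back to the classic path.
 */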
struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
		    struct flow_offload_tuple *tuple)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload *flow;
	struct flow_offload_entry *e;
	int dir;

	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
				      nf_flow_offload_rhash_params);
	if (!tuplehash)
		return NULL;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
		return NULL;

	e = container_of(flow, struct flow_offload_entry, flow);
	if (unlikely(nf_ct_is_dying(e->ct)))
		return NULL;

	return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);

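/* Walk the table and invoke the callback once per flow: the walker sees
 * both tuplehash entries, so reply-direction entries are skipped. The
 * rhashtable walker may return -EAGAIN on a concurrent resize, which is
 * tolerated and the walk continues.
 */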
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}

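/* Garbage collection step: reap flows that have expired, whose conntrack
 * entry is dying, or that have been explicitly torn down.
 */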
static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
	struct nf_flowtable *flow_table = data;
	struct flow_offload_entry *e;

	e = container_of(flow, struct flow_offload_entry, flow);
	if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) ||
	    (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)))
		flow_offload_del(flow_table, flow);
}

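/* Periodic garbage collection: sweep the table and re-arm the work to
 * run again in one second (HZ jiffies).
 */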
static void nf_flow_offload_work_gc(struct work_struct *work)
{
	struct nf_flowtable *flow_table;

	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}

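/* The port rewrite itself is done by the caller; these helpers only
 * apply the incremental L4 checksum update. For UDP, a zero checksum
 * means "no checksum", so it is only adjusted when already present, and
 * a result of zero is mapped to CSUM_MANGLED_0.
 */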
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
				__be16 port, __be16 new_port)
{
	struct tcphdr *tcph;

	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
		return -1;

	tcph = (void *)(skb_network_header(skb) + thoff);
	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);

	return 0;
}

static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
				__be16 port, __be16 new_port)
{
	struct udphdr *udph;

	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
		return -1;

	udph = (void *)(skb_network_header(skb) + thoff);
	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		inet_proto_csum_replace2(&udph->check, skb, port,
					 new_port, true);
		if (!udph->check)
			udph->check = CSUM_MANGLED_0;
	}

	return 0;
}

static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
			    u8 protocol, __be16 port, __be16 new_port)
{
	switch (protocol) {
	case IPPROTO_TCP:
		if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
			return NF_DROP;
		break;
	case IPPROTO_UDP:
		if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
			return NF_DROP;
		break;
	}

	return 0;
}

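/* Source NAT on the port: in the original direction the source port is
 * rewritten to the reply tuple's destination port; in the reply
 * direction the destination port is mapped back.
 */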
int nf_flow_snat_port(const struct flow_offload *flow,
		      struct sk_buff *skb, unsigned int thoff,
		      u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
		return -1;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
		hdr->source = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
		hdr->dest = new_port;
		break;
	default:
		return -1;
	}

	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);

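/* Destination NAT on the port: the mirror image of nf_flow_snat_port,
 * rewriting the destination port on original-direction packets and the
 * source port on replies.
 */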
int nf_flow_dnat_port(const struct flow_offload *flow,
		      struct sk_buff *skb, unsigned int thoff,
		      u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
		return -1;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
		hdr->dest = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
		hdr->source = new_port;
		break;
	default:
		return -1;
	}

	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);

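/* Initialize a flowtable: set up the hash table and the deferrable GC
 * work, kick off the first GC run, and register the table on the global
 * list used for netdevice cleanup.
 */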
int nf_flow_table_init(struct nf_flowtable *flowtable)
{
	int err;

	INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);

	err = rhashtable_init(&flowtable->rhashtable,
			      &nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	queue_delayed_work(system_power_efficient_wq,
			   &flowtable->gc_work, HZ);

	mutex_lock(&flowtable_lock);
	list_add(&flowtable->list, &flowtables);
	mutex_unlock(&flowtable_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);

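/* Cleanup callback: with no device given, tear down every flow; given a
 * device, mark as dead any flow in the same netns whose ingress in
 * either direction is that device.
 */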
static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
	struct net_device *dev = data;
	struct flow_offload_entry *e;

	e = container_of(flow, struct flow_offload_entry, flow);

	if (!dev) {
		flow_offload_teardown(flow);
		return;
	}
	if (net_eq(nf_ct_net(e->ct), dev_net(dev)) &&
	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
		flow_offload_dead(flow);
}

static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
					  struct net_device *dev)
{
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
}

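/* A netdevice is going away: flush its flows from every registered
 * flowtable.
 */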
void nf_flow_table_cleanup(struct net_device *dev)
{
	struct nf_flowtable *flowtable;

	mutex_lock(&flowtable_lock);
	list_for_each_entry(flowtable, &flowtables, list)
		nf_flow_table_iterate_cleanup(flowtable, dev);
	mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);

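/* Destroy a flowtable: unregister it, stop the GC work, tear down and
 * reap all remaining flows, then free the hash table.
 */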
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);
	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");